ustrings.inc 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2005 by Florian Klaempfl,
  4. Copyright (c) 2011 by Jonas Maebe,
  5. members of the Free Pascal development team.
  6. This file implements support routines for UTF-8 strings with FPC
  7. See the file COPYING.FPC, included in this distribution,
  8. for details about the copyright.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  12. **********************************************************************}
  13. {$i wustrings.inc}
  14. {
  15. This file contains the implementation of the UnicodeString type,
  16. which on the Java platforms is an alias for java.lang.String
  17. }
  18. Function NewUnicodeString(Len : SizeInt) : JLString;
  19. {
  20. Allocate a new UnicodeString on the heap.
  21. initialize it to zero length and reference count 1.
  22. }
  23. var
  24. data: array of jchar;
  25. begin
  26. setlength(data,len);
  27. result:=JLString.create(data);
  28. end;
  29. procedure fpc_UnicodeStr_To_ShortStr (out res: ShortString;const S2 : UnicodeString); [Public, alias: 'FPC_UNICODESTR_TO_SHORTSTR'];compilerproc;
  30. {
  31. Converts a UnicodeString to a ShortString;
  32. }
  33. Var
  34. Size : SizeInt;
  35. temp : ansistring;
  36. begin
  37. res:='';
  38. Size:=Length(S2);
  39. if Size>0 then
  40. begin
  41. temp:=s2;
  42. res:=temp;
  43. end;
  44. end;
  45. Function fpc_ShortStr_To_UnicodeStr (Const S2 : ShortString): UnicodeString;compilerproc;
  46. {
  47. Converts a ShortString to a UnicodeString;
  48. }
  49. Var
  50. Size : SizeInt;
  51. begin
  52. result:='';
  53. Size:=Length(S2);
  54. if Size>0 then
  55. result:=unicodestring(JLString.Create(TJByteArray(ShortstringClass(@S2).fdata),0,length(S2)));
  56. end;
  57. Function fpc_UnicodeStr_To_AnsiStr (const S2 : UnicodeString): AnsiString; compilerproc;
  58. {
  59. Converts a UnicodeString to an AnsiString
  60. }
  61. Var
  62. Size : SizeInt;
  63. begin
  64. result:=Ansistring(AnsistringClass.Create(s2));
  65. end;
  66. Function fpc_AnsiStr_To_UnicodeStr (Const S2 : AnsiString): UnicodeString; compilerproc;
  67. {
  68. Converts an AnsiString to a UnicodeString;
  69. }
  70. Var
  71. Size : SizeInt;
  72. begin
  73. if length(s2)=0 then
  74. result:=''
  75. else
  76. result:=AnsistringClass(S2).toString;
  77. end;
  78. Function fpc_UnicodeStr_To_WideStr (const S2 : UnicodeString): WideString; compilerproc;
  79. begin
  80. result:=s2;
  81. end;
  82. Function fpc_WideStr_To_UnicodeStr (Const S2 : WideString): UnicodeString; compilerproc;
  83. begin
  84. result:=s2;
  85. end;
  86. function fpc_UnicodeStr_Concat (const S1,S2 : UnicodeString): UnicodeString; compilerproc;
  87. Var
  88. sb: JLStringBuilder;
  89. begin
  90. { only assign if s1 or s2 is empty }
  91. if (length(S1)=0) then
  92. begin
  93. result:=s2;
  94. exit;
  95. end;
  96. if (length(S2)=0) then
  97. begin
  98. result:=s1;
  99. exit;
  100. end;
  101. sb:=JLStringBuilder.create(S1);
  102. sb.append(s2);
  103. result:=sb.toString;
  104. end;
  105. function fpc_UnicodeStr_Concat_multi (const sarr:array of Unicodestring): unicodestring; compilerproc;
  106. Var
  107. i : Longint;
  108. Size,NewSize : SizeInt;
  109. sb: JLStringBuilder;
  110. begin
  111. { First calculate size of the result so we can allocate a StringBuilder of
  112. the right size }
  113. NewSize:=0;
  114. for i:=low(sarr) to high(sarr) do
  115. inc(Newsize,length(sarr[i]));
  116. sb:=JLStringBuilder.create(NewSize);
  117. for i:=low(sarr) to high(sarr) do
  118. begin
  119. if length(sarr[i])>0 then
  120. sb.append(sarr[i]);
  121. end;
  122. result:=sb.toString;
  123. end;
  124. Function fpc_Char_To_UChar(const c : AnsiChar): UnicodeChar; compilerproc;
  125. var
  126. str: JLString;
  127. arr: array of jbyte;
  128. begin
  129. setlength(arr,1);
  130. arr[0]:=ord(c);
  131. result:=JLString.create(arr,0,1).charAt(0);
  132. end;
  133. Function fpc_Char_To_UnicodeStr(const c : AnsiChar): UnicodeString; compilerproc;
  134. {
  135. Converts a AnsiChar to a UnicodeString;
  136. }
  137. var
  138. str: JLString;
  139. arr: array of jbyte;
  140. begin
  141. setlength(arr,1);
  142. arr[0]:=ord(c);
  143. result:=JLString.create(arr,0,1);
  144. end;
  145. Function fpc_UChar_To_Char(const c : UnicodeChar): AnsiChar; compilerproc;
  146. {
  147. Converts a UnicodeChar to a AnsiChar;
  148. }
  149. var
  150. arrb: array of jbyte;
  151. arrw: array of jchar;
  152. str: JLString;
  153. begin
  154. setlength(arrw,1);
  155. arrw[0]:=c;
  156. str:=JLString.create(arrw);
  157. arrb:=str.getbytes();
  158. result:=chr(arrb[0]);
  159. end;
  160. Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc;
  161. {
  162. Converts a WideChar to a UnicodeString;
  163. }
  164. var
  165. arrw: array of jchar;
  166. begin
  167. setlength(arrw,1);
  168. arrw[0]:=c;
  169. result:=JLString.create(arrw);
  170. end;
  171. Function fpc_Char_To_WChar(const c : AnsiChar): WideChar; compilerproc;
  172. {
  173. Converts a AnsiChar to a WideChar;
  174. }
  175. var
  176. str: JLString;
  177. arr: array of jbyte;
  178. begin
  179. setlength(arr,1);
  180. arr[0]:=ord(c);
  181. result:=JLString.create(arr,0,1).charAt(0);
  182. end;
  183. Function fpc_WChar_To_Char(const c : WideChar): AnsiChar; compilerproc;
  184. {
  185. Converts a WideChar to a AnsiChar;
  186. }
  187. var
  188. arrb: array of jbyte;
  189. arrw: array of jchar;
  190. begin
  191. setlength(arrw,1);
  192. arrw[0]:=c;
  193. arrb:=JLString.create(arrw).getbytes();
  194. result:=chr(arrb[0]);
  195. end;
  196. procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc;
  197. {
  198. Converts a WideChar to a ShortString;
  199. }
  200. var
  201. u: unicodestring;
  202. begin
  203. u:=c;
  204. res:=u;
  205. end;
  206. Function fpc_UChar_To_UnicodeStr(const c : UnicodeChar): UnicodeString; compilerproc;
  207. {
  208. Converts a UnicodeChar to a UnicodeString;
  209. }
  210. var
  211. arr: array[0..0] of UnicodeChar;
  212. begin
  213. arr[0]:=c;
  214. result:=JLString.create(arr);
  215. end;
  216. Function fpc_UChar_To_AnsiStr(const c : UnicodeChar): AnsiString; compilerproc;
  217. {
  218. Converts a UnicodeChar to a AnsiString;
  219. }
  220. var
  221. u: unicodestring;
  222. begin
  223. u:=c;
  224. result:=u;
  225. end;
  226. (*
  227. Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc;
  228. Var
  229. L : SizeInt;
  230. begin
  231. if (not assigned(p)) or (p[0]=#0) Then
  232. begin
  233. fpc_pchar_to_unicodestr := '';
  234. exit;
  235. end;
  236. l:=IndexChar(p^,-1,#0);
  237. widestringmanager.Ansi2UnicodeMoveProc(P,fpc_PChar_To_UnicodeStr,l);
  238. end;
  239. *)
  240. Function fpc_CharArray_To_UnicodeStr(const arr: array of ansichar; zerobased: boolean = true): UnicodeString; compilerproc;
  241. var
  242. i,j : SizeInt;
  243. localarr: array of jbyte;
  244. foundnull: boolean;
  245. begin
  246. if (zerobased) then
  247. begin
  248. if (arr[0]=#0) Then
  249. begin
  250. fpc_chararray_to_unicodestr := '';
  251. exit;
  252. end;
  253. foundnull:=false;
  254. for i:=low(arr) to high(arr) do
  255. if arr[i]=#0 then
  256. begin
  257. foundnull:=true;
  258. break;
  259. end;
  260. if not foundnull then
  261. i := high(arr)+1;
  262. end
  263. else
  264. i := high(arr)+1;
  265. setlength(localarr,i);
  266. for j:=0 to i-1 do
  267. localarr[j]:=ord(arr[j]);
  268. result:=JLString.create(localarr,0,i);
  269. end;
  270. (*
  271. function fpc_UnicodeCharArray_To_ShortStr(const arr: array of unicodechar; zerobased: boolean = true): shortstring;[public,alias:'FPC_UNICODECHARARRAY_TO_SHORTSTR']; compilerproc;
  272. var
  273. l: longint;
  274. index: longint;
  275. len: byte;
  276. temp: ansistring;
  277. foundnull: boolean;
  278. begin
  279. l := high(arr)+1;
  280. if l>=256 then
  281. l:=255
  282. else if l<0 then
  283. l:=0;
  284. if zerobased then
  285. begin
  286. foundnull:=false;
  287. for index:=low(arr) to l-1 do
  288. if arr[index]=#0 then
  289. begin
  290. foundnull:=true;
  291. break;
  292. end;
  293. if not foundnull then
  294. len := l
  295. else
  296. len := index;
  297. end
  298. else
  299. len := l;
  300. result:=JLString.create(arr,0,l);
  301. end;
  302. Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; zerobased: boolean = true): AnsiString; compilerproc;
  303. var
  304. i : SizeInt;
  305. begin
  306. if (zerobased) then
  307. begin
  308. i:=IndexWord(arr,high(arr)+1,0);
  309. if i = -1 then
  310. i := high(arr)+1;
  311. end
  312. else
  313. i := high(arr)+1;
  314. SetLength(fpc_UnicodeCharArray_To_AnsiStr,i);
  315. widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),fpc_UnicodeCharArray_To_AnsiStr,i);
  316. end;
  317. *)
  318. Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc;
  319. var
  320. i : SizeInt;
  321. foundnull : boolean;
  322. begin
  323. if (zerobased) then
  324. begin
  325. foundnull:=false;
  326. for i:=low(arr) to high(arr) do
  327. if arr[i]=#0 then
  328. begin
  329. foundnull:=true;
  330. break;
  331. end;
  332. if not foundnull then
  333. i := high(arr)+1;
  334. end
  335. else
  336. i := high(arr)+1;
  337. result:=JLString.create(arr,0,i);
  338. end;
  339. Function real_widechararray_to_unicodestr(const arr: array of widechar; zerobased: boolean): Unicodestring;
  340. var
  341. i : SizeInt;
  342. foundnull : boolean;
  343. begin
  344. if (zerobased) then
  345. begin
  346. foundnull:=false;
  347. for i:=low(arr) to high(arr) do
  348. if arr[i]=#0 then
  349. begin
  350. foundnull:=true;
  351. break;
  352. end;
  353. if not foundnull then
  354. i := high(arr)+1;
  355. end
  356. else
  357. i := high(arr)+1;
  358. result:=JLString.create(arr,0,i);
  359. end;
  360. Function fpc_WideCharArray_To_UnicodeStr(const arr: array of widechar; zerobased: boolean = true): UnicodeString; compilerproc;
  361. begin
  362. result:=real_widechararray_to_unicodestr(arr,zerobased);
  363. end;
  364. { due to their names, the following procedures should be in wstrings.inc,
  365. however, the compiler generates code using this functions on all platforms }
  366. procedure fpc_WideCharArray_To_ShortStr(out res : shortstring;const arr: array of widechar; zerobased: boolean = true);[public,alias:'FPC_WIDECHARARRAY_TO_SHORTSTR']; compilerproc;
  367. begin
  368. res:=real_widechararray_to_unicodestr(arr,zerobased);
  369. end;
  370. Function fpc_WideCharArray_To_AnsiStr(const arr: array of widechar; zerobased: boolean = true): AnsiString; compilerproc;
  371. begin
  372. result:=real_widechararray_to_unicodestr(arr,zerobased);
  373. end;
  374. procedure fpc_unicodestr_to_chararray(out res: array of AnsiChar; const src: UnicodeString); compilerproc;
  375. var
  376. i, len: SizeInt;
  377. temp: array of jbyte;
  378. begin
  379. len := length(src);
  380. { make sure we don't dereference src if it can be nil (JM) }
  381. if len > 0 then
  382. begin
  383. temp:=JLString(src).getBytes;
  384. if len > length(temp) then
  385. len := length(temp);
  386. for i := 0 to len-1 do
  387. res[i] := chr(temp[i]);
  388. end;
  389. end;
  390. procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc;
  391. var
  392. len: SizeInt;
  393. begin
  394. len := length(src);
  395. { make sure we don't dereference src if it can be nil (JM) }
  396. if len > 0 then
  397. begin
  398. if len > high(res)+1 then
  399. len := high(res)+1;
  400. JLString(src).getChars(0,len,res,0);
  401. end;
  402. end;
  403. function fpc_unicodestr_setchar(const s: UnicodeString; const index: longint; const ch: unicodechar): UnicodeString; compilerproc;
  404. var
  405. sb: JLStringBuilder;
  406. begin
  407. sb:=JLStringBuilder.create(s);
  408. { string indexes are 1-based in Pascal, 0-based in Java }
  409. sb.setCharAt(index-1,ch);
  410. result:=sb.toString();
  411. end;
  412. procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc;
  413. var
  414. len: SizeInt;
  415. temp: unicodestring;
  416. begin
  417. len := length(src);
  418. { make sure we don't dereference src if it can be nil (JM) }
  419. if len > 0 then
  420. temp:=src;
  421. len := length(temp);
  422. if len > length(res) then
  423. len := length(res);
  424. JLString(temp).getChars(0,len,res,0);
  425. JUArrays.fill(res,len,high(res),#0);
  426. end;
  427. (*
  428. procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc;
  429. var
  430. len: longint;
  431. temp : unicodestring;
  432. begin
  433. len := length(src);
  434. { temp is initialized with an empty string, so no need to convert src in case
  435. it's also empty}
  436. if len > 0 then
  437. temp:=src;
  438. len := length(temp);
  439. if len > high(res)+1 then
  440. len := high(res)+1;
  441. JLString(temp).getChars(0,len,res,0);
  442. JUArrays.fill(res,len,high(res),#0);
  443. end;
  444. *)
  445. procedure fpc_ansistr_to_widechararray(out res: array of widechar; const src: AnsiString); compilerproc;
  446. var
  447. len: SizeInt;
  448. temp: widestring;
  449. begin
  450. len := length(src);
  451. { make sure we don't dereference src if it can be nil (JM) }
  452. if len > 0 then
  453. temp:=src;
  454. len := length(temp);
  455. if len > high(res)+1 then
  456. len := high(res)+1;
  457. JLString(temp).getChars(0,len,res,0);
  458. JUArrays.fill(res,len,high(res),#0);
  459. end;
  460. procedure fpc_shortstr_to_widechararray(out res: array of widechar; const src: ShortString); compilerproc;
  461. var
  462. len: longint;
  463. temp : unicodestring;
  464. begin
  465. len := length(src);
  466. { temp is initialized with an empty string, so no need to convert src in case
  467. it's also empty}
  468. if len > 0 then
  469. temp:=src;
  470. len := length(temp);
  471. if len > high(res)+1 then
  472. len := high(res)+1;
  473. JLString(temp).getChars(0,len,res,0);
  474. JUArrays.fill(res,len,high(res),#0);
  475. end;
  476. procedure fpc_unicodestr_to_widechararray(out res: array of widechar; const src: UnicodeString); compilerproc;
  477. var
  478. i, len: SizeInt;
  479. begin
  480. len := length(src);
  481. if len > length(res) then
  482. len := length(res);
  483. JLString(src).getChars(0,len,res,0);
  484. end;
  485. Function fpc_UnicodeStr_Compare(const S1,S2 : UnicodeString): SizeInt; compilerproc;
  486. {
  487. Compares 2 UnicodeStrings;
  488. The result is
  489. <0 if S1<S2
  490. 0 if S1=S2
  491. >0 if S1>S2
  492. }
  493. Var
  494. MaxI,Temp : SizeInt;
  495. begin
  496. if JLObject(S1)=JLObject(S2) then
  497. begin
  498. result:=0;
  499. exit;
  500. end;
  501. result:=JLString(S1).compareTo(S2);
  502. end;
  503. Function fpc_UnicodeStr_Compare_Equal(const S1,S2 : UnicodeString): SizeInt; compilerproc;
  504. {
  505. Compares 2 UnicodeStrings for equality only;
  506. The result is
  507. 0 if S1=S2
  508. <>0 if S1<>S2
  509. }
  510. Var
  511. MaxI : SizeInt;
  512. begin
  513. result:=ord(not JLString(S1).equals(JLString(S2)));
  514. end;
  515. function fpc_UnicodeStr_SetLength(const S : UnicodeString; l : SizeInt): UnicodeString; compilerproc;
  516. {
  517. Sets The length of string S to L.
  518. Makes sure S is unique, and contains enough room.
  519. Returns new val
  520. }
  521. Var
  522. movelen: SizeInt;
  523. chars: array of widechar;
  524. strlen: SizeInt;
  525. begin
  526. if (l>0) then
  527. begin
  528. if JLObject(S)=nil then
  529. begin
  530. { Need a completely new string...}
  531. result:=NewUnicodeString(l);
  532. end
  533. { no need to create a new string, since Java strings are immutable }
  534. else
  535. begin
  536. strlen:=length(s);
  537. if l=strlen then
  538. result:=s
  539. else if (l<strlen) then
  540. result:=JLString(s).substring(0,l)
  541. else
  542. begin
  543. setlength(chars,l);
  544. JLString(s).getChars(0,strlen,chars,0);
  545. result:=JLString.create(chars,0,l)
  546. end;
  547. end
  548. end
  549. else
  550. begin
  551. result:='';
  552. end;
  553. end;
  554. {*****************************************************************************
  555. Public functions, In interface.
  556. *****************************************************************************}
  557. (*
  558. function UnicodeCharToString(S : PUnicodeChar) : AnsiString;
  559. begin
  560. result:=UnicodeCharLenToString(s,Length(UnicodeString(s)));
  561. end;
  562. function StringToUnicodeChar(const Src : AnsiString;Dest : PUnicodeChar;DestSize : SizeInt) : PUnicodeChar;
  563. var
  564. temp:unicodestring;
  565. begin
  566. widestringmanager.Ansi2UnicodeMoveProc(PChar(Src),temp,Length(Src));
  567. if Length(temp)<DestSize then
  568. move(temp[1],Dest^,Length(temp)*SizeOf(UnicodeChar))
  569. else
  570. move(temp[1],Dest^,(DestSize-1)*SizeOf(UnicodeChar));
  571. Dest[DestSize-1]:=#0;
  572. result:=Dest;
  573. end;
  574. function WideCharToString(S : PWideChar) : AnsiString;
  575. begin
  576. result:=WideCharLenToString(s,Length(WideString(s)));
  577. end;
  578. function StringToWideChar(const Src : AnsiString;Dest : PWideChar;DestSize : SizeInt) : PWideChar;
  579. var
  580. temp:widestring;
  581. begin
  582. widestringmanager.Ansi2WideMoveProc(PChar(Src),temp,Length(Src));
  583. if Length(temp)<DestSize then
  584. move(temp[1],Dest^,Length(temp)*SizeOf(WideChar))
  585. else
  586. move(temp[1],Dest^,(DestSize-1)*SizeOf(WideChar));
  587. Dest[DestSize-1]:=#0;
  588. result:=Dest;
  589. end;
  590. function UnicodeCharLenToString(S : PUnicodeChar;Len : SizeInt) : AnsiString;
  591. begin
  592. //SetLength(result,Len);
  593. widestringmanager.Unicode2AnsiMoveproc(S,result,Len);
  594. end;
  595. procedure UnicodeCharLenToStrVar(Src : PUnicodeChar;Len : SizeInt;out Dest : AnsiString);
  596. begin
  597. Dest:=UnicodeCharLenToString(Src,Len);
  598. end;
  599. procedure UnicodeCharToStrVar(S : PUnicodeChar;out Dest : AnsiString);
  600. begin
  601. Dest:=UnicodeCharToString(S);
  602. end;
  603. function WideCharLenToString(S : PWideChar;Len : SizeInt) : AnsiString;
  604. begin
  605. //SetLength(result,Len);
  606. widestringmanager.Wide2AnsiMoveproc(S,result,Len);
  607. end;
  608. procedure WideCharLenToStrVar(Src : PWideChar;Len : SizeInt;out Dest : AnsiString);
  609. begin
  610. Dest:=WideCharLenToString(Src,Len);
  611. end;
  612. procedure WideCharToStrVar(S : PWideChar;out Dest : AnsiString);
  613. begin
  614. Dest:=WideCharToString(S);
  615. end;
  616. *)
  617. Function fpc_unicodestr_Unique(const S : JLObject): JLObject; compilerproc;
  618. begin
  619. result:=s;
  620. end;
  621. Function Fpc_UnicodeStr_Copy (Const S : UnicodeString; Index,Size : SizeInt) : UnicodeString;compilerproc;
  622. begin
  623. dec(index);
  624. if Index < 0 then
  625. Index := 0;
  626. { Check Size. Accounts for Zero-length S, the double check is needed because
  627. Size can be maxint and will get <0 when adding index }
  628. if (Size>Length(S)) or
  629. (Index+Size>Length(S)) then
  630. Size:=Length(S)-Index;
  631. If Size>0 then
  632. result:=JLString(s).subString(Index,Size)
  633. else
  634. result:='';
  635. end;
  636. Function Pos (Const Substr : UnicodeString; Const Source : UnicodeString) : SizeInt;
  637. begin
  638. Pos:=0;
  639. if Length(SubStr)>0 then
  640. Pos:=JLString(Source).indexOf(SubStr)+1;
  641. end;
  642. { Faster version for a unicodechar alone }
  643. Function Pos (c : UnicodeChar; Const s : UnicodeString) : SizeInt;
  644. begin
  645. Pos:=0;
  646. if length(S)>0 then
  647. Pos:=JLString(s).indexOf(ord(c))+1;
  648. end;
  649. (*
  650. Function Pos (c : AnsiString; Const s : UnicodeString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  651. begin
  652. result:=Pos(UnicodeString(c),s);
  653. end;
  654. Function Pos (c : ShortString; Const s : UnicodeString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  655. begin
  656. result:=Pos(UnicodeString(c),s);
  657. end;
  658. Function Pos (c : UnicodeString; Const s : AnsiString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  659. begin
  660. result:=Pos(c,UnicodeString(s));
  661. end;
  662. *)
  663. { Faster version for a char alone. Must be implemented because }
  664. { pos(c: char; const s: shortstring) also exists, so otherwise }
  665. { using pos(char,pchar) will always call the shortstring version }
  666. { (exact match for first argument), also with $h+ (JM) }
  667. Function Pos (c : AnsiChar; Const s : UnicodeString) : SizeInt;
  668. var
  669. i: SizeInt;
  670. wc : unicodechar;
  671. begin
  672. wc:=c;
  673. result:=Pos(wc,s);
  674. end;
  675. (*
  676. Procedure Delete (Var S : UnicodeString; Index,Size: SizeInt);
  677. Var
  678. LS : SizeInt;
  679. sb: JLStringBuilder;
  680. begin
  681. LS:=Length(S);
  682. if (Index>LS) or (Index<=0) or (Size<=0) then
  683. exit;
  684. { (Size+Index) will overflow if Size=MaxInt. }
  685. if Size>LS-Index then
  686. Size:=LS-Index+1;
  687. if Size<=LS-Index then
  688. begin
  689. Dec(Index);
  690. sb:=JLStringBuilder.Create(s);
  691. sb.delete(index,size);
  692. s:=sb.toString;
  693. end
  694. else
  695. s:=JLString(s).substring(0,index-1);
  696. end;
  697. Procedure Insert (Const Source : UnicodeString; Var S : UnicodeString; Index : SizeInt);
  698. var
  699. Temp : UnicodeString;
  700. LS : SizeInt;
  701. sb : JLStringBuilder;
  702. begin
  703. If Length(Source)=0 then
  704. exit;
  705. if index <= 0 then
  706. index := 1;
  707. Ls:=Length(S);
  708. if index > LS then
  709. index := LS+1;
  710. Dec(Index);
  711. sb:=JLStringBuilder.Create(S);
  712. sb.insert(Index,Source);
  713. S:=sb.toString;
  714. end;
  715. *)
  716. Function UpCase(c:UnicodeChar):UnicodeChar;
  717. begin
  718. result:=JLCharacter.toUpperCase(c);
  719. end;
  720. function UpCase(const s : UnicodeString) : UnicodeString;
  721. begin
  722. result:=JLString(s).toUpperCase;
  723. end;
  724. (*
  725. Procedure SetString (Out S : UnicodeString; Buf : PUnicodeChar; Len : SizeInt);
  726. begin
  727. SetLength(S,Len);
  728. If (Buf<>Nil) and (Len>0) then
  729. Move (Buf[0],S[1],Len*sizeof(UnicodeChar));
  730. end;
  731. Procedure SetString (Out S : UnicodeString; Buf : PChar; Len : SizeInt);
  732. var
  733. BufLen: SizeInt;
  734. begin
  735. SetLength(S,Len);
  736. If (Buf<>Nil) and (Len>0) then
  737. widestringmanager.Ansi2UnicodeMoveProc(Buf,S,Len);
  738. end;
  739. {$ifndef FPUNONE}
  740. Function fpc_Val_Real_UnicodeStr(Const S : UnicodeString; out Code : ValSInt): ValReal; [public, alias:'FPC_VAL_REAL_UNICODESTR']; compilerproc;
  741. Var
  742. SS : String;
  743. begin
  744. fpc_Val_Real_UnicodeStr := 0;
  745. if length(S) > 255 then
  746. code := 256
  747. else
  748. begin
  749. SS := S;
  750. Val(SS,fpc_Val_Real_UnicodeStr,code);
  751. end;
  752. end;
  753. {$endif}
  754. function fpc_val_enum_unicodestr(str2ordindex:pointer;const s:unicodestring;out code:valsint):longint;compilerproc;
  755. var ss:shortstring;
  756. begin
  757. if length(s)>255 then
  758. code:=256
  759. else
  760. begin
  761. ss:=s;
  762. val(ss,fpc_val_enum_unicodestr,code);
  763. end;
  764. end;
  765. Function fpc_Val_Currency_UnicodeStr(Const S : UnicodeString; out Code : ValSInt): Currency; [public, alias:'FPC_VAL_CURRENCY_UNICODESTR']; compilerproc;
  766. Var
  767. SS : String;
  768. begin
  769. if length(S) > 255 then
  770. begin
  771. fpc_Val_Currency_UnicodeStr:=0;
  772. code := 256;
  773. end
  774. else
  775. begin
  776. SS := S;
  777. Val(SS,fpc_Val_Currency_UnicodeStr,code);
  778. end;
  779. end;
  780. Function fpc_Val_UInt_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): ValUInt; [public, alias:'FPC_VAL_UINT_UNICODESTR']; compilerproc;
  781. Var
  782. SS : ShortString;
  783. begin
  784. fpc_Val_UInt_UnicodeStr := 0;
  785. if length(S) > 255 then
  786. code := 256
  787. else
  788. begin
  789. SS := S;
  790. Val(SS,fpc_Val_UInt_UnicodeStr,code);
  791. end;
  792. end;
  793. Function fpc_Val_SInt_UnicodeStr (DestSize: SizeInt; Const S : UnicodeString; out Code : ValSInt): ValSInt; [public, alias:'FPC_VAL_SINT_UNICODESTR']; compilerproc;
  794. Var
  795. SS : ShortString;
  796. begin
  797. fpc_Val_SInt_UnicodeStr:=0;
  798. if length(S)>255 then
  799. code:=256
  800. else
  801. begin
  802. SS := S;
  803. fpc_Val_SInt_UnicodeStr := int_Val_SInt_ShortStr(DestSize,SS,Code);
  804. end;
  805. end;
  806. {$ifndef CPU64}
  807. Function fpc_Val_qword_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): qword; [public, alias:'FPC_VAL_QWORD_UNICODESTR']; compilerproc;
  808. Var
  809. SS : ShortString;
  810. begin
  811. fpc_Val_qword_UnicodeStr:=0;
  812. if length(S)>255 then
  813. code:=256
  814. else
  815. begin
  816. SS := S;
  817. Val(SS,fpc_Val_qword_UnicodeStr,Code);
  818. end;
  819. end;
  820. Function fpc_Val_int64_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): Int64; [public, alias:'FPC_VAL_INT64_UNICODESTR']; compilerproc;
  821. Var
  822. SS : ShortString;
  823. begin
  824. fpc_Val_int64_UnicodeStr:=0;
  825. if length(S)>255 then
  826. code:=256
  827. else
  828. begin
  829. SS := S;
  830. Val(SS,fpc_Val_int64_UnicodeStr,Code);
  831. end;
  832. end;
  833. {$endif CPU64}
  834. {$ifndef FPUNONE}
  835. procedure fpc_UnicodeStr_Float(d : ValReal;len,fr,rt : SizeInt;out s : UnicodeString);compilerproc;
  836. var
  837. ss : shortstring;
  838. begin
  839. str_real(len,fr,d,treal_type(rt),ss);
  840. s:=ss;
  841. end;
  842. {$endif}
  843. procedure fpc_unicodestr_enum(ordinal,len:sizeint;typinfo,ord2strindex:pointer;out s:unicodestring);compilerproc;
  844. var ss:shortstring;
  845. begin
  846. fpc_shortstr_enum(ordinal,len,typinfo,ord2strindex,ss);
  847. s:=ss;
  848. end;
  849. procedure fpc_unicodestr_bool(b : boolean;len:sizeint;out s:unicodestring);compilerproc;
  850. var ss:shortstring;
  851. begin
  852. fpc_shortstr_bool(b,len,ss);
  853. s:=ss;
  854. end;
  855. {$ifdef FPC_HAS_STR_CURRENCY}
  856. procedure fpc_UnicodeStr_Currency(c : Currency;len,fr : SizeInt;out s : UnicodeString);compilerproc;
  857. var
  858. ss : shortstring;
  859. begin
  860. str(c:len:fr,ss);
  861. s:=ss;
  862. end;
  863. {$endif FPC_HAS_STR_CURRENCY}
  864. Procedure fpc_UnicodeStr_SInt(v : ValSint; Len : SizeInt; out S : UnicodeString);compilerproc;
  865. Var
  866. SS : ShortString;
  867. begin
  868. Str (v:Len,SS);
  869. S:=SS;
  870. end;
  871. Procedure fpc_UnicodeStr_UInt(v : ValUInt;Len : SizeInt; out S : UnicodeString);compilerproc;
  872. Var
  873. SS : ShortString;
  874. begin
  875. str(v:Len,SS);
  876. S:=SS;
  877. end;
  878. {$ifndef CPU64}
  879. Procedure fpc_UnicodeStr_Int64(v : Int64; Len : SizeInt; out S : UnicodeString);compilerproc;
  880. Var
  881. SS : ShortString;
  882. begin
  883. Str (v:Len,SS);
  884. S:=SS;
  885. end;
  886. Procedure fpc_UnicodeStr_Qword(v : Qword;Len : SizeInt; out S : UnicodeString);compilerproc;
  887. Var
  888. SS : ShortString;
  889. begin
  890. str(v:Len,SS);
  891. S:=SS;
  892. end;
  893. {$endif CPU64}
  894. *)
  895. (*
  896. { converts an utf-16 code point or surrogate pair to utf-32 }
  897. function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
  898. var
  899. w: unicodechar;
  900. begin
  901. { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
  902. { are the same in UTF-32 }
  903. w:=s[index];
  904. if (w<=#$d7ff) or
  905. (w>=#$e000) then
  906. begin
  907. result:=UCS4Char(w);
  908. len:=1;
  909. end
  910. { valid surrogate pair? }
  911. else if (w<=#$dbff) and
  912. { w>=#$d7ff check not needed, checked above }
  913. (index<length(s)) and
  914. (s[index+1]>=#$dc00) and
  915. (s[index+1]<=#$dfff) then
  916. { convert the surrogate pair to UTF-32 }
  917. begin
  918. result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
  919. len:=2;
  920. end
  921. else
  922. { invalid surrogate -> do nothing }
  923. begin
  924. result:=UCS4Char(w);
  925. len:=1;
  926. end;
  927. end;
  928. function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  929. begin
  930. if assigned(Source) then
  931. Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
  932. else
  933. Result:=0;
  934. end;
  935. function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar; SourceChars: SizeUInt): SizeUInt;
  936. var
  937. i,j : SizeUInt;
  938. w : word;
  939. lw : longword;
  940. len : longint;
  941. begin
  942. result:=0;
  943. if source=nil then
  944. exit;
  945. i:=0;
  946. j:=0;
  947. if assigned(Dest) then
  948. begin
  949. while (i<SourceChars) and (j<MaxDestBytes) do
  950. begin
  951. w:=word(Source[i]);
  952. case w of
  953. 0..$7f:
  954. begin
  955. Dest[j]:=char(w);
  956. inc(j);
  957. end;
  958. $80..$7ff:
  959. begin
  960. if j+1>=MaxDestBytes then
  961. break;
  962. Dest[j]:=char($c0 or (w shr 6));
  963. Dest[j+1]:=char($80 or (w and $3f));
  964. inc(j,2);
  965. end;
  966. $800..$d7ff,$e000..$ffff:
  967. begin
  968. if j+2>=MaxDestBytes then
  969. break;
  970. Dest[j]:=char($e0 or (w shr 12));
  971. Dest[j+1]:=char($80 or ((w shr 6) and $3f));
  972. Dest[j+2]:=char($80 or (w and $3f));
  973. inc(j,3);
  974. end;
  975. $d800..$dbff:
  976. {High Surrogates}
  977. begin
  978. if j+3>=MaxDestBytes then
  979. break;
  980. if (i<sourcechars-1) and
  981. (word(Source[i+1]) >= $dc00) and
  982. (word(Source[i+1]) <= $dfff) then
  983. begin
  984. lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
  985. Dest[j]:=char($f0 or (lw shr 18));
  986. Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
  987. Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
  988. Dest[j+3]:=char($80 or (lw and $3f));
  989. inc(j,4);
  990. inc(i);
  991. end;
  992. end;
  993. end;
  994. inc(i);
  995. end;
  996. if j>SizeUInt(MaxDestBytes-1) then
  997. j:=MaxDestBytes-1;
  998. Dest[j]:=#0;
  999. end
  1000. else
  1001. begin
  1002. while i<SourceChars do
  1003. begin
  1004. case word(Source[i]) of
  1005. $0..$7f:
  1006. inc(j);
  1007. $80..$7ff:
  1008. inc(j,2);
  1009. $800..$d7ff,$e000..$ffff:
  1010. inc(j,3);
  1011. $d800..$dbff:
  1012. begin
  1013. if (i<sourcechars-1) and
  1014. (word(Source[i+1]) >= $dc00) and
  1015. (word(Source[i+1]) <= $dfff) then
  1016. begin
  1017. inc(j,4);
  1018. inc(i);
  1019. end;
  1020. end;
  1021. end;
  1022. inc(i);
  1023. end;
  1024. end;
  1025. result:=j+1;
  1026. end;
  1027. function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  1028. begin
  1029. if assigned(Source) then
  1030. Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
  1031. else
  1032. Result:=0;
  1033. end;
  1034. function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
  1035. const
  1036. UNICODE_INVALID=63;
  1037. var
  1038. InputUTF8: SizeUInt;
  1039. IBYTE: BYTE;
  1040. OutputUnicode: SizeUInt;
  1041. PRECHAR: SizeUInt;
  1042. TempBYTE: BYTE;
  1043. CharLen: SizeUint;
  1044. LookAhead: SizeUInt;
  1045. UC: SizeUInt;
  1046. begin
  1047. if not assigned(Source) then
  1048. begin
  1049. result:=0;
  1050. exit;
  1051. end;
  1052. result:=SizeUInt(-1);
  1053. InputUTF8:=0;
  1054. OutputUnicode:=0;
  1055. PreChar:=0;
  1056. if Assigned(Dest) Then
  1057. begin
  1058. while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
  1059. begin
  1060. IBYTE:=byte(Source[InputUTF8]);
  1061. if (IBYTE and $80) = 0 then
  1062. begin
  1063. //One character US-ASCII, convert it to unicode
  1064. if IBYTE = 10 then
  1065. begin
  1066. If (PreChar<>13) and FALSE then
  1067. begin
  1068. //Expand to crlf, conform UTF-8.
  1069. //This procedure will break the memory alocation by
  1070. //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
  1071. if OutputUnicode+1<MaxDestChars then
  1072. begin
  1073. Dest[OutputUnicode]:=WideChar(13);
  1074. inc(OutputUnicode);
  1075. Dest[OutputUnicode]:=WideChar(10);
  1076. inc(OutputUnicode);
  1077. PreChar:=10;
  1078. end
  1079. else
  1080. begin
  1081. Dest[OutputUnicode]:=WideChar(13);
  1082. inc(OutputUnicode);
  1083. end;
  1084. end
  1085. else
  1086. begin
  1087. Dest[OutputUnicode]:=WideChar(IBYTE);
  1088. inc(OutputUnicode);
  1089. PreChar:=IBYTE;
  1090. end;
  1091. end
  1092. else
  1093. begin
  1094. Dest[OutputUnicode]:=WideChar(IBYTE);
  1095. inc(OutputUnicode);
  1096. PreChar:=IBYTE;
  1097. end;
  1098. inc(InputUTF8);
  1099. end
  1100. else
  1101. begin
  1102. TempByte:=IBYTE;
  1103. CharLen:=0;
  1104. while (TempBYTE and $80)<>0 do
  1105. begin
  1106. TempBYTE:=(TempBYTE shl 1) and $FE;
  1107. inc(CharLen);
  1108. end;
  1109. //Test for the "CharLen" conforms UTF-8 string
  1110. //This means the 10xxxxxx pattern.
  1111. if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
  1112. begin
  1113. //Insuficient chars in string to decode
  1114. //UTF-8 array. Fallback to single char.
  1115. CharLen:= 1;
  1116. end;
  1117. for LookAhead := 1 to CharLen-1 do
  1118. begin
  1119. if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
  1120. ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
  1121. begin
  1122. //Invalid UTF-8 sequence, fallback.
  1123. CharLen:= LookAhead;
  1124. break;
  1125. end;
  1126. end;
  1127. UC:=$FFFF;
  1128. case CharLen of
  1129. 1: begin
  1130. //Not valid UTF-8 sequence
  1131. UC:=UNICODE_INVALID;
  1132. end;
  1133. 2: begin
  1134. //Two bytes UTF, convert it
  1135. UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
  1136. UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
  1137. if UC <= $7F then
  1138. begin
  1139. //Invalid UTF sequence.
  1140. UC:=UNICODE_INVALID;
  1141. end;
  1142. end;
  1143. 3: begin
  1144. //Three bytes, convert it to unicode
  1145. UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
  1146. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
  1147. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
  1148. if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
  1149. begin
  1150. //Invalid UTF-8 sequence
  1151. UC:= UNICODE_INVALID;
  1152. End;
  1153. end;
  1154. 4: begin
  1155. //Four bytes, convert it to two unicode characters
  1156. UC:= (byte(Source[InputUTF8]) and $07) shl 18;
  1157. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
  1158. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
  1159. UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
  1160. if (UC < $10000) or (UC > $10FFFF) then
  1161. begin
  1162. UC:= UNICODE_INVALID;
  1163. end
  1164. else
  1165. begin
  1166. { only store pair if room }
  1167. dec(UC,$10000);
  1168. if (OutputUnicode<MaxDestChars-1) then
  1169. begin
  1170. Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
  1171. inc(OutputUnicode);
  1172. UC:=(UC and $3ff) + $DC00;
  1173. end
  1174. else
  1175. begin
  1176. InputUTF8:= InputUTF8 + CharLen;
  1177. { don't store anything }
  1178. CharLen:=0;
  1179. end;
  1180. end;
  1181. end;
  1182. 5,6,7: begin
  1183. //Invalid UTF8 to unicode conversion,
  1184. //mask it as invalid UNICODE too.
  1185. UC:=UNICODE_INVALID;
  1186. end;
  1187. end;
  1188. if CharLen > 0 then
  1189. begin
  1190. PreChar:=UC;
  1191. Dest[OutputUnicode]:=WideChar(UC);
  1192. inc(OutputUnicode);
  1193. end;
  1194. InputUTF8:= InputUTF8 + CharLen;
  1195. end;
  1196. end;
  1197. Result:=OutputUnicode+1;
  1198. end
  1199. else
  1200. begin
  1201. while (InputUTF8<SourceBytes) do
  1202. begin
  1203. IBYTE:=byte(Source[InputUTF8]);
  1204. if (IBYTE and $80) = 0 then
  1205. begin
  1206. //One character US-ASCII, convert it to unicode
  1207. if IBYTE = 10 then
  1208. begin
  1209. if (PreChar<>13) and FALSE then
  1210. begin
  1211. //Expand to crlf, conform UTF-8.
  1212. //This procedure will break the memory alocation by
  1213. //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
  1214. inc(OutputUnicode,2);
  1215. PreChar:=10;
  1216. end
  1217. else
  1218. begin
  1219. inc(OutputUnicode);
  1220. PreChar:=IBYTE;
  1221. end;
  1222. end
  1223. else
  1224. begin
  1225. inc(OutputUnicode);
  1226. PreChar:=IBYTE;
  1227. end;
  1228. inc(InputUTF8);
  1229. end
  1230. else
  1231. begin
  1232. TempByte:=IBYTE;
  1233. CharLen:=0;
  1234. while (TempBYTE and $80)<>0 do
  1235. begin
  1236. TempBYTE:=(TempBYTE shl 1) and $FE;
  1237. inc(CharLen);
  1238. end;
  1239. //Test for the "CharLen" conforms UTF-8 string
  1240. //This means the 10xxxxxx pattern.
  1241. if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
  1242. begin
  1243. //Insuficient chars in string to decode
  1244. //UTF-8 array. Fallback to single char.
  1245. CharLen:= 1;
  1246. end;
  1247. for LookAhead := 1 to CharLen-1 do
  1248. begin
  1249. if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
  1250. ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
  1251. begin
  1252. //Invalid UTF-8 sequence, fallback.
  1253. CharLen:= LookAhead;
  1254. break;
  1255. end;
  1256. end;
  1257. UC:=$FFFF;
  1258. case CharLen of
  1259. 1: begin
  1260. //Not valid UTF-8 sequence
  1261. UC:=UNICODE_INVALID;
  1262. end;
  1263. 2: begin
  1264. //Two bytes UTF, convert it
  1265. UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
  1266. UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
  1267. if UC <= $7F then
  1268. begin
  1269. //Invalid UTF sequence.
  1270. UC:=UNICODE_INVALID;
  1271. end;
  1272. end;
  1273. 3: begin
  1274. //Three bytes, convert it to unicode
  1275. UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
  1276. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
  1277. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
  1278. If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
  1279. begin
  1280. //Invalid UTF-8 sequence
  1281. UC:= UNICODE_INVALID;
  1282. end;
  1283. end;
  1284. 4: begin
  1285. //Four bytes, convert it to two unicode characters
  1286. UC:= (byte(Source[InputUTF8]) and $07) shl 18;
  1287. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
  1288. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
  1289. UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
  1290. if (UC < $10000) or (UC > $10FFFF) then
  1291. UC:= UNICODE_INVALID
  1292. else
  1293. { extra character character }
  1294. inc(OutputUnicode);
  1295. end;
  1296. 5,6,7: begin
  1297. //Invalid UTF8 to unicode conversion,
  1298. //mask it as invalid UNICODE too.
  1299. UC:=UNICODE_INVALID;
  1300. end;
  1301. end;
  1302. if CharLen > 0 then
  1303. begin
  1304. PreChar:=UC;
  1305. inc(OutputUnicode);
  1306. end;
  1307. InputUTF8:= InputUTF8 + CharLen;
  1308. end;
  1309. end;
  1310. Result:=OutputUnicode+1;
  1311. end;
  1312. end;
  1313. function UTF8Encode(const s : Ansistring) : UTF8String; inline;
  1314. begin
  1315. Result:=UTF8Encode(UnicodeString(s));
  1316. end;
  1317. function UTF8Encode(const s : UnicodeString) : UTF8String;
  1318. var
  1319. i : SizeInt;
  1320. hs : UTF8String;
  1321. begin
  1322. result:='';
  1323. if s='' then
  1324. exit;
  1325. SetLength(hs,length(s)*3);
  1326. i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PUnicodeChar(s),length(s));
  1327. if i>0 then
  1328. begin
  1329. SetLength(hs,i-1);
  1330. result:=hs;
  1331. end;
  1332. end;
  1333. function UTF8Decode(const s : UTF8String): UnicodeString;
  1334. var
  1335. i : SizeInt;
  1336. hs : UnicodeString;
  1337. begin
  1338. result:='';
  1339. if s='' then
  1340. exit;
  1341. SetLength(hs,length(s));
  1342. i:=Utf8ToUnicode(PUnicodeChar(hs),length(hs)+1,pchar(s),length(s));
  1343. if i>0 then
  1344. begin
  1345. SetLength(hs,i-1);
  1346. result:=hs;
  1347. end;
  1348. end;
  1349. function AnsiToUtf8(const s : ansistring): UTF8String;{$ifdef SYSTEMINLINE}inline;{$endif}
  1350. begin
  1351. Result:=Utf8Encode(s);
  1352. end;
  1353. function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inline;{$endif}
  1354. begin
  1355. Result:=Utf8Decode(s);
  1356. end;
  1357. function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;
  1358. var
  1359. i, slen,
  1360. destindex : SizeInt;
  1361. len : longint;
  1362. begin
  1363. slen:=length(s);
  1364. setlength(result,slen+1);
  1365. i:=1;
  1366. destindex:=0;
  1367. while (i<=slen) do
  1368. begin
  1369. result[destindex]:=utf16toutf32(s,i,len);
  1370. inc(destindex);
  1371. inc(i,len);
  1372. end;
  1373. { destindex <= slen (surrogate pairs may have been merged) }
  1374. { destindex+1 for terminating #0 (dynamic arrays are }
  1375. { implicitely filled with zero) }
  1376. setlength(result,destindex+1);
  1377. end;
  1378. { concatenates an utf-32 char to a unicodestring. S *must* be unique when entering. }
  1379. procedure ConcatUTF32ToUnicodeStr(const nc: UCS4Char; var S: UnicodeString; var index: SizeInt);
  1380. var
  1381. p : PUnicodeChar;
  1382. begin
  1383. { if nc > $ffff, we need two places }
  1384. if (index+ord(nc > $ffff)>length(s)) then
  1385. if (length(s) < 10*256) then
  1386. setlength(s,length(s)+10)
  1387. else
  1388. setlength(s,length(s)+length(s) shr 8);
  1389. { we know that s is unique -> avoid uniquestring calls}
  1390. p:=@s[index];
  1391. if (nc<$ffff) then
  1392. begin
  1393. p^:=unicodechar(nc);
  1394. inc(index);
  1395. end
  1396. else if (dword(nc)<=$10ffff) then
  1397. begin
  1398. p^:=unicodechar((nc - $10000) shr 10 + $d800);
  1399. (p+1)^:=unicodechar((nc - $10000) and $3ff + $dc00);
  1400. inc(index,2);
  1401. end
  1402. else
  1403. { invalid code point }
  1404. begin
  1405. p^:='?';
  1406. inc(index);
  1407. end;
  1408. end;
  1409. function UCS4StringToUnicodeString(const s : UCS4String) : UnicodeString;
  1410. var
  1411. i : SizeInt;
  1412. resindex : SizeInt;
  1413. begin
  1414. { skip terminating #0 }
  1415. SetLength(result,length(s)-1);
  1416. resindex:=1;
  1417. for i:=0 to high(s)-1 do
  1418. ConcatUTF32ToUnicodeStr(s[i],result,resindex);
  1419. { adjust result length (may be too big due to growing }
  1420. { for surrogate pairs) }
  1421. setlength(result,resindex-1);
  1422. end;
  1423. function WideStringToUCS4String(const s : WideString) : UCS4String;
  1424. var
  1425. i, slen,
  1426. destindex : SizeInt;
  1427. len : longint;
  1428. begin
  1429. slen:=length(s);
  1430. setlength(result,slen+1);
  1431. i:=1;
  1432. destindex:=0;
  1433. while (i<=slen) do
  1434. begin
  1435. result[destindex]:=utf16toutf32(s,i,len);
  1436. inc(destindex);
  1437. inc(i,len);
  1438. end;
  1439. { destindex <= slen (surrogate pairs may have been merged) }
  1440. { destindex+1 for terminating #0 (dynamic arrays are }
  1441. { implicitely filled with zero) }
  1442. setlength(result,destindex+1);
  1443. end;
  1444. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  1445. procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
  1446. var
  1447. p : PWideChar;
  1448. begin
  1449. { if nc > $ffff, we need two places }
  1450. if (index+ord(nc > $ffff)>length(s)) then
  1451. if (length(s) < 10*256) then
  1452. setlength(s,length(s)+10)
  1453. else
  1454. setlength(s,length(s)+length(s) shr 8);
  1455. { we know that s is unique -> avoid uniquestring calls}
  1456. p:=@s[index];
  1457. if (nc<$ffff) then
  1458. begin
  1459. p^:=widechar(nc);
  1460. inc(index);
  1461. end
  1462. else if (dword(nc)<=$10ffff) then
  1463. begin
  1464. p^:=widechar((nc - $10000) shr 10 + $d800);
  1465. (p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
  1466. inc(index,2);
  1467. end
  1468. else
  1469. { invalid code point }
  1470. begin
  1471. p^:='?';
  1472. inc(index);
  1473. end;
  1474. end;
  1475. function UCS4StringToWideString(const s : UCS4String) : WideString;
  1476. var
  1477. i : SizeInt;
  1478. resindex : SizeInt;
  1479. begin
  1480. { skip terminating #0 }
  1481. SetLength(result,length(s)-1);
  1482. resindex:=1;
  1483. for i:=0 to high(s)-1 do
  1484. ConcatUTF32ToWideStr(s[i],result,resindex);
  1485. { adjust result length (may be too big due to growing }
  1486. { for surrogate pairs) }
  1487. setlength(result,resindex-1);
  1488. end;
  1489. const
  1490. SNoUnicodestrings = 'This binary has no unicodestrings support compiled in.';
  1491. SRecompileWithUnicodestrings = 'Recompile the application with a unicodestrings-manager in the program uses clause.';
  1492. *)
  1493. function CompareUnicodeString(const s1, s2 : UnicodeString) : PtrInt;
  1494. begin
  1495. widestringmanager.collator.setStrength(JTCollator.IDENTICAL);
  1496. result:=widestringmanager.collator.compare(s1,s2);
  1497. end;
  1498. function CompareTextUnicodeString(const s1, s2 : UnicodeString): PtrInt;
  1499. begin
  1500. widestringmanager.collator.setStrength(JTCollator.TERTIARY);
  1501. result:=widestringmanager.collator.compare(s1,s2);
  1502. end;
  1503. constructor TUnicodeStringManager.create;
  1504. begin
  1505. end;
  1506. procedure initunicodestringmanager;
  1507. begin
  1508. widestringmanager:=TUnicodeStringManager.create;
  1509. widestringmanager.collator:=JTCollator.getInstance;
  1510. end;