ustrings.inc 44 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 1999-2005 by Florian Klaempfl,
  4. Copyright (c) 2011 by Jonas Maebe,
  5. members of the Free Pascal development team.
  6. This file implements support routines for UTF-8 strings with FPC
  7. See the file COPYING.FPC, included in this distribution,
  8. for details about the copyright.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  12. **********************************************************************}
  13. {$i wustrings.inc}
  14. {
  15. This file contains the implementation of the UnicodeString type,
  16. which on the Java platforms is an alias for java.lang.String
  17. }
  18. Function NewUnicodeString(Len : SizeInt) : JLString;
  19. {
  20. Allocate a new UnicodeString on the heap.
  21. initialize it to zero length and reference count 1.
  22. }
  23. var
  24. data: array of jchar;
  25. begin
  26. setlength(data,len);
  27. result:=JLString.create(data);
  28. end;
  29. procedure fpc_UnicodeStr_To_ShortStr (out res: ShortString;const S2 : UnicodeString); [Public, alias: 'FPC_UNICODESTR_TO_SHORTSTR'];compilerproc;
  30. {
  31. Converts a UnicodeString to a ShortString;
  32. }
  33. Var
  34. Size : SizeInt;
  35. temp : ansistring;
  36. begin
  37. res:='';
  38. Size:=Length(S2);
  39. if Size>0 then
  40. begin
  41. temp:=s2;
  42. res:=temp;
  43. end;
  44. end;
  45. Function fpc_ShortStr_To_UnicodeStr (Const S2 : ShortString): UnicodeString;compilerproc;
  46. {
  47. Converts a ShortString to a UnicodeString;
  48. }
  49. Var
  50. Size : SizeInt;
  51. begin
  52. result:='';
  53. Size:=Length(S2);
  54. if Size>0 then
  55. result:=unicodestring(JLString.Create(TJByteArray(ShortstringClass(@S2).fdata),0,Size));
  56. end;
  57. Function fpc_UnicodeStr_To_AnsiStr (const S2 : UnicodeString): AnsiString; compilerproc;
  58. {
  59. Converts a UnicodeString to an AnsiString
  60. }
  61. Var
  62. Size : SizeInt;
  63. begin
  64. result:=Ansistring(AnsistringClass.Create(s2));
  65. end;
  66. Function fpc_AnsiStr_To_UnicodeStr (Const S2 : AnsiString): UnicodeString; compilerproc;
  67. {
  68. Converts an AnsiString to a UnicodeString;
  69. }
  70. Var
  71. Size : SizeInt;
  72. begin
  73. if length(s2)=0 then
  74. result:=''
  75. else
  76. result:=AnsistringClass(S2).toString;
  77. end;
  78. Function fpc_UnicodeStr_To_WideStr (const S2 : UnicodeString): WideString; compilerproc;
  79. begin
  80. result:=s2;
  81. end;
  82. Function fpc_WideStr_To_UnicodeStr (Const S2 : WideString): UnicodeString; compilerproc;
  83. begin
  84. result:=s2;
  85. end;
  86. function fpc_UnicodeStr_Concat (const S1,S2 : UnicodeString): UnicodeString; compilerproc;
  87. Var
  88. sb: JLStringBuilder;
  89. begin
  90. { only assign if s1 or s2 is empty }
  91. if (length(S1)=0) then
  92. begin
  93. result:=s2;
  94. exit;
  95. end;
  96. if (length(S2)=0) then
  97. begin
  98. result:=s1;
  99. exit;
  100. end;
  101. sb:=JLStringBuilder.create(S1);
  102. sb.append(s2);
  103. result:=sb.toString;
  104. end;
  105. function fpc_UnicodeStr_Concat_multi (const sarr:array of Unicodestring): unicodestring; compilerproc;
  106. Var
  107. i : Longint;
  108. Size,NewSize : SizeInt;
  109. sb: JLStringBuilder;
  110. begin
  111. { First calculate size of the result so we can allocate a StringBuilder of
  112. the right size }
  113. NewSize:=0;
  114. for i:=low(sarr) to high(sarr) do
  115. inc(Newsize,length(sarr[i]));
  116. sb:=JLStringBuilder.create(NewSize);
  117. for i:=low(sarr) to high(sarr) do
  118. begin
  119. if length(sarr[i])>0 then
  120. sb.append(sarr[i]);
  121. end;
  122. result:=sb.toString;
  123. end;
  124. Function fpc_Char_To_UChar(const c : AnsiChar): UnicodeChar; compilerproc;
  125. var
  126. str: JLString;
  127. arr: array of jbyte;
  128. begin
  129. setlength(arr,1);
  130. arr[0]:=ord(c);
  131. result:=JLString.create(arr,0,1).charAt(0);
  132. end;
  133. Function fpc_Char_To_UnicodeStr(const c : AnsiChar): UnicodeString; compilerproc;
  134. {
  135. Converts a AnsiChar to a UnicodeString;
  136. }
  137. var
  138. str: JLString;
  139. arr: array of jbyte;
  140. begin
  141. setlength(arr,1);
  142. arr[0]:=ord(c);
  143. result:=JLString.create(arr,0,1);
  144. end;
  145. Function fpc_UChar_To_Char(const c : UnicodeChar): AnsiChar; compilerproc;
  146. {
  147. Converts a UnicodeChar to a AnsiChar;
  148. }
  149. var
  150. arrb: array of jbyte;
  151. arrw: array of jchar;
  152. str: JLString;
  153. begin
  154. setlength(arrw,1);
  155. arrw[0]:=c;
  156. str:=JLString.create(arrw);
  157. arrb:=str.getbytes();
  158. result:=chr(arrb[0]);
  159. end;
  160. Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc;
  161. {
  162. Converts a WideChar to a UnicodeString;
  163. }
  164. var
  165. arrw: array of jchar;
  166. begin
  167. setlength(arrw,1);
  168. arrw[0]:=c;
  169. result:=JLString.create(arrw);
  170. end;
  171. Function fpc_Char_To_WChar(const c : AnsiChar): WideChar; compilerproc;
  172. {
  173. Converts a AnsiChar to a WideChar;
  174. }
  175. var
  176. str: JLString;
  177. arr: array of jbyte;
  178. begin
  179. setlength(arr,1);
  180. arr[0]:=ord(c);
  181. result:=JLString.create(arr,0,1).charAt(0);
  182. end;
  183. Function fpc_WChar_To_Char(const c : WideChar): AnsiChar; compilerproc;
  184. {
  185. Converts a WideChar to a AnsiChar;
  186. }
  187. var
  188. arrb: array of jbyte;
  189. arrw: array of jchar;
  190. begin
  191. setlength(arrw,1);
  192. arrw[0]:=c;
  193. arrb:=JLString.create(arrw).getbytes();
  194. result:=chr(arrb[0]);
  195. end;
  196. procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc;
  197. {
  198. Converts a WideChar to a ShortString;
  199. }
  200. var
  201. u: unicodestring;
  202. begin
  203. u:=c;
  204. res:=u;
  205. end;
  206. Function fpc_UChar_To_UnicodeStr(const c : UnicodeChar): UnicodeString; compilerproc;
  207. {
  208. Converts a UnicodeChar to a UnicodeString;
  209. }
  210. var
  211. arr: array[0..0] of UnicodeChar;
  212. begin
  213. arr[0]:=c;
  214. result:=JLString.create(arr);
  215. end;
  216. Function fpc_UChar_To_AnsiStr(const c : UnicodeChar): AnsiString; compilerproc;
  217. {
  218. Converts a UnicodeChar to a AnsiString;
  219. }
  220. var
  221. u: unicodestring;
  222. begin
  223. u:=c;
  224. result:=u;
  225. end;
  226. (*
  227. Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc;
  228. Var
  229. L : SizeInt;
  230. begin
  231. if (not assigned(p)) or (p[0]=#0) Then
  232. begin
  233. fpc_pchar_to_unicodestr := '';
  234. exit;
  235. end;
  236. l:=IndexChar(p^,-1,#0);
  237. widestringmanager.Ansi2UnicodeMoveProc(P,fpc_PChar_To_UnicodeStr,l);
  238. end;
  239. *)
  240. Function fpc_CharArray_To_UnicodeStr(const arr: array of ansichar; zerobased: boolean = true): UnicodeString; compilerproc;
  241. var
  242. i,j : SizeInt;
  243. localarr: array of jbyte;
  244. foundnull: boolean;
  245. begin
  246. if (zerobased) then
  247. begin
  248. if (arr[0]=#0) Then
  249. begin
  250. fpc_chararray_to_unicodestr := '';
  251. exit;
  252. end;
  253. foundnull:=false;
  254. for i:=low(arr) to high(arr) do
  255. if arr[i]=#0 then
  256. begin
  257. foundnull:=true;
  258. break;
  259. end;
  260. if not foundnull then
  261. i := high(arr)+1;
  262. end
  263. else
  264. i := high(arr)+1;
  265. setlength(localarr,i);
  266. for j:=0 to i-1 do
  267. localarr[j]:=ord(arr[j]);
  268. result:=JLString.create(localarr,0,i);
  269. end;
  270. (*
  271. function fpc_UnicodeCharArray_To_ShortStr(const arr: array of unicodechar; zerobased: boolean = true): shortstring;[public,alias:'FPC_UNICODECHARARRAY_TO_SHORTSTR']; compilerproc;
  272. var
  273. l: longint;
  274. index: longint;
  275. len: byte;
  276. temp: ansistring;
  277. foundnull: boolean;
  278. begin
  279. l := high(arr)+1;
  280. if l>=256 then
  281. l:=255
  282. else if l<0 then
  283. l:=0;
  284. if zerobased then
  285. begin
  286. foundnull:=false;
  287. for index:=low(arr) to l-1 do
  288. if arr[index]=#0 then
  289. begin
  290. foundnull:=true;
  291. break;
  292. end;
  293. if not foundnull then
  294. len := l
  295. else
  296. len := index;
  297. end
  298. else
  299. len := l;
  300. result:=JLString.create(arr,0,l);
  301. end;
  302. Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; zerobased: boolean = true): AnsiString; compilerproc;
  303. var
  304. i : SizeInt;
  305. begin
  306. if (zerobased) then
  307. begin
  308. i:=IndexWord(arr,high(arr)+1,0);
  309. if i = -1 then
  310. i := high(arr)+1;
  311. end
  312. else
  313. i := high(arr)+1;
  314. SetLength(fpc_UnicodeCharArray_To_AnsiStr,i);
  315. widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),fpc_UnicodeCharArray_To_AnsiStr,i);
  316. end;
  317. *)
  318. Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc;
  319. var
  320. i : SizeInt;
  321. foundnull : boolean;
  322. begin
  323. if (zerobased) then
  324. begin
  325. foundnull:=false;
  326. for i:=low(arr) to high(arr) do
  327. if arr[i]=#0 then
  328. begin
  329. foundnull:=true;
  330. break;
  331. end;
  332. if not foundnull then
  333. i := high(arr)+1;
  334. end
  335. else
  336. i := high(arr)+1;
  337. result:=JLString.create(arr,0,i);
  338. end;
  339. Function real_widechararray_to_unicodestr(const arr: array of widechar; zerobased: boolean): Unicodestring;
  340. var
  341. i : SizeInt;
  342. foundnull : boolean;
  343. begin
  344. if (zerobased) then
  345. begin
  346. foundnull:=false;
  347. for i:=low(arr) to high(arr) do
  348. if arr[i]=#0 then
  349. begin
  350. foundnull:=true;
  351. break;
  352. end;
  353. if not foundnull then
  354. i := high(arr)+1;
  355. end
  356. else
  357. i := high(arr)+1;
  358. result:=JLString.create(arr,0,i);
  359. end;
  360. Function fpc_WideCharArray_To_UnicodeStr(const arr: array of widechar; zerobased: boolean = true): UnicodeString; compilerproc;
  361. begin
  362. result:=real_widechararray_to_unicodestr(arr,zerobased);
  363. end;
  364. { due to their names, the following procedures should be in wstrings.inc,
  365. however, the compiler generates code using this functions on all platforms }
  366. procedure fpc_WideCharArray_To_ShortStr(out res : shortstring;const arr: array of widechar; zerobased: boolean = true);[public,alias:'FPC_WIDECHARARRAY_TO_SHORTSTR']; compilerproc;
  367. begin
  368. res:=real_widechararray_to_unicodestr(arr,zerobased);
  369. end;
  370. Function fpc_WideCharArray_To_AnsiStr(const arr: array of widechar; zerobased: boolean = true): AnsiString; compilerproc;
  371. begin
  372. result:=real_widechararray_to_unicodestr(arr,zerobased);
  373. end;
  374. procedure fpc_unicodestr_to_chararray(out res: array of AnsiChar; const src: UnicodeString); compilerproc;
  375. var
  376. len: longint;
  377. temp: array of jbyte;
  378. begin
  379. len:=length(src);
  380. { make sure we don't dereference src if it can be nil (JM) }
  381. if len>0 then
  382. begin
  383. temp:=JLString(src).getBytes;
  384. len:=length(temp);
  385. if len>length(res) then
  386. len:=length(res);
  387. JLSystem.ArrayCopy(JLObject(temp),0,JLObject(@res),0,len);
  388. end;
  389. if len<=high(res) then
  390. JUArrays.fill(TJByteArray(@res),len,high(res),0);
  391. end;
  392. procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc;
  393. var
  394. len: SizeInt;
  395. begin
  396. len := length(src);
  397. { make sure we don't dereference src if it can be nil (JM) }
  398. if len > 0 then
  399. begin
  400. if len > high(res)+1 then
  401. len := high(res)+1;
  402. JLString(src).getChars(0,len,res,0);
  403. end;
  404. end;
  405. function fpc_unicodestr_setchar(const s: UnicodeString; const index: longint; const ch: unicodechar): UnicodeString; compilerproc;
  406. var
  407. sb: JLStringBuilder;
  408. begin
  409. sb:=JLStringBuilder.create(s);
  410. { string indexes are 1-based in Pascal, 0-based in Java }
  411. sb.setCharAt(index-1,ch);
  412. result:=sb.toString();
  413. end;
  414. procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc;
  415. var
  416. len: SizeInt;
  417. temp: unicodestring;
  418. begin
  419. len := length(src);
  420. { make sure we don't dereference src if it can be nil (JM) }
  421. if len > 0 then
  422. temp:=src;
  423. len := length(temp);
  424. if len > length(res) then
  425. len := length(res);
  426. JLString(temp).getChars(0,len,res,0);
  427. JUArrays.fill(res,len,high(res),#0);
  428. end;
  429. (*
  430. procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc;
  431. var
  432. len: longint;
  433. temp : unicodestring;
  434. begin
  435. len := length(src);
  436. { temp is initialized with an empty string, so no need to convert src in case
  437. it's also empty}
  438. if len > 0 then
  439. temp:=src;
  440. len := length(temp);
  441. if len > high(res)+1 then
  442. len := high(res)+1;
  443. JLString(temp).getChars(0,len,res,0);
  444. JUArrays.fill(res,len,high(res),#0);
  445. end;
  446. *)
  447. procedure fpc_ansistr_to_widechararray(out res: array of widechar; const src: AnsiString); compilerproc;
  448. var
  449. len: SizeInt;
  450. temp: widestring;
  451. begin
  452. len := length(src);
  453. { make sure we don't dereference src if it can be nil (JM) }
  454. if len > 0 then
  455. temp:=src;
  456. len := length(temp);
  457. if len > high(res)+1 then
  458. len := high(res)+1;
  459. JLString(temp).getChars(0,len,res,0);
  460. JUArrays.fill(res,len,high(res),#0);
  461. end;
  462. procedure fpc_shortstr_to_widechararray(out res: array of widechar; const src: ShortString); compilerproc;
  463. var
  464. len: longint;
  465. temp : unicodestring;
  466. begin
  467. len := length(src);
  468. { temp is initialized with an empty string, so no need to convert src in case
  469. it's also empty}
  470. if len > 0 then
  471. temp:=src;
  472. len := length(temp);
  473. if len > high(res)+1 then
  474. len := high(res)+1;
  475. JLString(temp).getChars(0,len,res,0);
  476. JUArrays.fill(res,len,high(res),#0);
  477. end;
  478. procedure fpc_unicodestr_to_widechararray(out res: array of widechar; const src: UnicodeString); compilerproc;
  479. var
  480. i, len: SizeInt;
  481. begin
  482. len := length(src);
  483. if len > length(res) then
  484. len := length(res);
  485. JLString(src).getChars(0,len,res,0);
  486. end;
  487. Function fpc_UnicodeStr_Compare(const S1,S2 : UnicodeString): SizeInt; compilerproc;
  488. {
  489. Compares 2 UnicodeStrings;
  490. The result is
  491. <0 if S1<S2
  492. 0 if S1=S2
  493. >0 if S1>S2
  494. }
  495. Var
  496. MaxI,Temp : SizeInt;
  497. begin
  498. if JLObject(S1)=JLObject(S2) then
  499. begin
  500. result:=0;
  501. exit;
  502. end;
  503. result:=JLString(S1).compareTo(S2);
  504. end;
  505. Function fpc_UnicodeStr_Compare_Equal(const S1,S2 : UnicodeString): SizeInt; compilerproc;
  506. {
  507. Compares 2 UnicodeStrings for equality only;
  508. The result is
  509. 0 if S1=S2
  510. <>0 if S1<>S2
  511. }
  512. Var
  513. MaxI : SizeInt;
  514. begin
  515. result:=ord(not JLString(S1).equals(JLString(S2)));
  516. end;
  517. function fpc_UnicodeStr_SetLength(const S : UnicodeString; l : SizeInt): UnicodeString; compilerproc;
  518. {
  519. Sets The length of string S to L.
  520. Makes sure S is unique, and contains enough room.
  521. Returns new val
  522. }
  523. Var
  524. movelen: SizeInt;
  525. chars: array of widechar;
  526. strlen: SizeInt;
  527. begin
  528. if (l>0) then
  529. begin
  530. if JLObject(S)=nil then
  531. begin
  532. { Need a completely new string...}
  533. result:=NewUnicodeString(l);
  534. end
  535. { no need to create a new string, since Java strings are immutable }
  536. else
  537. begin
  538. strlen:=length(s);
  539. if l=strlen then
  540. result:=s
  541. else if (l<strlen) then
  542. result:=JLString(s).substring(0,l)
  543. else
  544. begin
  545. setlength(chars,l);
  546. JLString(s).getChars(0,strlen,chars,0);
  547. result:=JLString.create(chars,0,l)
  548. end;
  549. end
  550. end
  551. else
  552. begin
  553. result:='';
  554. end;
  555. end;
  556. {*****************************************************************************
  557. Public functions, In interface.
  558. *****************************************************************************}
  559. (*
  560. function UnicodeCharToString(S : PUnicodeChar) : AnsiString;
  561. begin
  562. result:=UnicodeCharLenToString(s,Length(UnicodeString(s)));
  563. end;
  564. function StringToUnicodeChar(const Src : AnsiString;Dest : PUnicodeChar;DestSize : SizeInt) : PUnicodeChar;
  565. var
  566. temp:unicodestring;
  567. begin
  568. widestringmanager.Ansi2UnicodeMoveProc(PChar(Src),temp,Length(Src));
  569. if Length(temp)<DestSize then
  570. move(temp[1],Dest^,Length(temp)*SizeOf(UnicodeChar))
  571. else
  572. move(temp[1],Dest^,(DestSize-1)*SizeOf(UnicodeChar));
  573. Dest[DestSize-1]:=#0;
  574. result:=Dest;
  575. end;
  576. function WideCharToString(S : PWideChar) : AnsiString;
  577. begin
  578. result:=WideCharLenToString(s,Length(WideString(s)));
  579. end;
  580. function StringToWideChar(const Src : AnsiString;Dest : PWideChar;DestSize : SizeInt) : PWideChar;
  581. var
  582. temp:widestring;
  583. begin
  584. widestringmanager.Ansi2WideMoveProc(PChar(Src),temp,Length(Src));
  585. if Length(temp)<DestSize then
  586. move(temp[1],Dest^,Length(temp)*SizeOf(WideChar))
  587. else
  588. move(temp[1],Dest^,(DestSize-1)*SizeOf(WideChar));
  589. Dest[DestSize-1]:=#0;
  590. result:=Dest;
  591. end;
  592. function UnicodeCharLenToString(S : PUnicodeChar;Len : SizeInt) : AnsiString;
  593. begin
  594. //SetLength(result,Len);
  595. widestringmanager.Unicode2AnsiMoveproc(S,result,Len);
  596. end;
  597. procedure UnicodeCharLenToStrVar(Src : PUnicodeChar;Len : SizeInt;out Dest : AnsiString);
  598. begin
  599. Dest:=UnicodeCharLenToString(Src,Len);
  600. end;
  601. procedure UnicodeCharToStrVar(S : PUnicodeChar;out Dest : AnsiString);
  602. begin
  603. Dest:=UnicodeCharToString(S);
  604. end;
  605. function WideCharLenToString(S : PWideChar;Len : SizeInt) : AnsiString;
  606. begin
  607. //SetLength(result,Len);
  608. widestringmanager.Wide2AnsiMoveproc(S,result,Len);
  609. end;
  610. procedure WideCharLenToStrVar(Src : PWideChar;Len : SizeInt;out Dest : AnsiString);
  611. begin
  612. Dest:=WideCharLenToString(Src,Len);
  613. end;
  614. procedure WideCharToStrVar(S : PWideChar;out Dest : AnsiString);
  615. begin
  616. Dest:=WideCharToString(S);
  617. end;
  618. *)
  619. Function fpc_unicodestr_Unique(const S : JLObject): JLObject; compilerproc;
  620. begin
  621. result:=s;
  622. end;
  623. Function Fpc_UnicodeStr_Copy (Const S : UnicodeString; Index,Size : SizeInt) : UnicodeString;compilerproc;
  624. begin
  625. dec(index);
  626. if Index < 0 then
  627. Index := 0;
  628. { Check Size. Accounts for Zero-length S, the double check is needed because
  629. Size can be maxint and will get <0 when adding index }
  630. if (Size>Length(S)) or
  631. (Index+Size>Length(S)) then
  632. Size:=Length(S)-Index;
  633. If Size>0 then
  634. result:=JLString(s).subString(Index,Size)
  635. else
  636. result:='';
  637. end;
  638. Function Pos (Const Substr : UnicodeString; Const Source : UnicodeString) : SizeInt;
  639. begin
  640. Pos:=0;
  641. if Length(SubStr)>0 then
  642. Pos:=JLString(Source).indexOf(SubStr)+1;
  643. end;
  644. { Faster version for a unicodechar alone }
  645. Function Pos (c : UnicodeChar; Const s : UnicodeString) : SizeInt;
  646. begin
  647. Pos:=0;
  648. if length(S)>0 then
  649. Pos:=JLString(s).indexOf(ord(c))+1;
  650. end;
  651. (*
  652. Function Pos (c : AnsiString; Const s : UnicodeString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  653. begin
  654. result:=Pos(UnicodeString(c),s);
  655. end;
  656. Function Pos (c : ShortString; Const s : UnicodeString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  657. begin
  658. result:=Pos(UnicodeString(c),s);
  659. end;
  660. Function Pos (c : UnicodeString; Const s : AnsiString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  661. begin
  662. result:=Pos(c,UnicodeString(s));
  663. end;
  664. *)
  665. { Faster version for a char alone. Must be implemented because }
  666. { pos(c: char; const s: shortstring) also exists, so otherwise }
  667. { using pos(char,pchar) will always call the shortstring version }
  668. { (exact match for first argument), also with $h+ (JM) }
  669. Function Pos (c : AnsiChar; Const s : UnicodeString) : SizeInt;
  670. var
  671. i: SizeInt;
  672. wc : unicodechar;
  673. begin
  674. wc:=c;
  675. result:=Pos(wc,s);
  676. end;
  677. (*
  678. Procedure Delete (Var S : UnicodeString; Index,Size: SizeInt);
  679. Var
  680. LS : SizeInt;
  681. sb: JLStringBuilder;
  682. begin
  683. LS:=Length(S);
  684. if (Index>LS) or (Index<=0) or (Size<=0) then
  685. exit;
  686. { (Size+Index) will overflow if Size=MaxInt. }
  687. if Size>LS-Index then
  688. Size:=LS-Index+1;
  689. if Size<=LS-Index then
  690. begin
  691. Dec(Index);
  692. sb:=JLStringBuilder.Create(s);
  693. sb.delete(index,size);
  694. s:=sb.toString;
  695. end
  696. else
  697. s:=JLString(s).substring(0,index-1);
  698. end;
  699. Procedure Insert (Const Source : UnicodeString; Var S : UnicodeString; Index : SizeInt);
  700. var
  701. Temp : UnicodeString;
  702. LS : SizeInt;
  703. sb : JLStringBuilder;
  704. begin
  705. If Length(Source)=0 then
  706. exit;
  707. if index <= 0 then
  708. index := 1;
  709. Ls:=Length(S);
  710. if index > LS then
  711. index := LS+1;
  712. Dec(Index);
  713. sb:=JLStringBuilder.Create(S);
  714. sb.insert(Index,Source);
  715. S:=sb.toString;
  716. end;
  717. *)
  718. Function UpCase(c:UnicodeChar):UnicodeChar;
  719. begin
  720. result:=JLCharacter.toUpperCase(c);
  721. end;
  722. function UpCase(const s : UnicodeString) : UnicodeString;
  723. begin
  724. result:=JLString(s).toUpperCase;
  725. end;
  726. (*
  727. Procedure SetString (Out S : UnicodeString; Buf : PUnicodeChar; Len : SizeInt);
  728. begin
  729. SetLength(S,Len);
  730. If (Buf<>Nil) and (Len>0) then
  731. Move (Buf[0],S[1],Len*sizeof(UnicodeChar));
  732. end;
  733. Procedure SetString (Out S : UnicodeString; Buf : PChar; Len : SizeInt);
  734. var
  735. BufLen: SizeInt;
  736. begin
  737. SetLength(S,Len);
  738. If (Buf<>Nil) and (Len>0) then
  739. widestringmanager.Ansi2UnicodeMoveProc(Buf,S,Len);
  740. end;
  741. {$ifndef FPUNONE}
  742. Function fpc_Val_Real_UnicodeStr(Const S : UnicodeString; out Code : ValSInt): ValReal; [public, alias:'FPC_VAL_REAL_UNICODESTR']; compilerproc;
  743. Var
  744. SS : String;
  745. begin
  746. fpc_Val_Real_UnicodeStr := 0;
  747. if length(S) > 255 then
  748. code := 256
  749. else
  750. begin
  751. SS := S;
  752. Val(SS,fpc_Val_Real_UnicodeStr,code);
  753. end;
  754. end;
  755. {$endif}
  756. function fpc_val_enum_unicodestr(str2ordindex:pointer;const s:unicodestring;out code:valsint):longint;compilerproc;
  757. var ss:shortstring;
  758. begin
  759. if length(s)>255 then
  760. code:=256
  761. else
  762. begin
  763. ss:=s;
  764. val(ss,fpc_val_enum_unicodestr,code);
  765. end;
  766. end;
  767. Function fpc_Val_Currency_UnicodeStr(Const S : UnicodeString; out Code : ValSInt): Currency; [public, alias:'FPC_VAL_CURRENCY_UNICODESTR']; compilerproc;
  768. Var
  769. SS : String;
  770. begin
  771. if length(S) > 255 then
  772. begin
  773. fpc_Val_Currency_UnicodeStr:=0;
  774. code := 256;
  775. end
  776. else
  777. begin
  778. SS := S;
  779. Val(SS,fpc_Val_Currency_UnicodeStr,code);
  780. end;
  781. end;
  782. Function fpc_Val_UInt_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): ValUInt; [public, alias:'FPC_VAL_UINT_UNICODESTR']; compilerproc;
  783. Var
  784. SS : ShortString;
  785. begin
  786. fpc_Val_UInt_UnicodeStr := 0;
  787. if length(S) > 255 then
  788. code := 256
  789. else
  790. begin
  791. SS := S;
  792. Val(SS,fpc_Val_UInt_UnicodeStr,code);
  793. end;
  794. end;
  795. Function fpc_Val_SInt_UnicodeStr (DestSize: SizeInt; Const S : UnicodeString; out Code : ValSInt): ValSInt; [public, alias:'FPC_VAL_SINT_UNICODESTR']; compilerproc;
  796. Var
  797. SS : ShortString;
  798. begin
  799. fpc_Val_SInt_UnicodeStr:=0;
  800. if length(S)>255 then
  801. code:=256
  802. else
  803. begin
  804. SS := S;
  805. fpc_Val_SInt_UnicodeStr := int_Val_SInt_ShortStr(DestSize,SS,Code);
  806. end;
  807. end;
  808. {$ifndef CPU64}
  809. Function fpc_Val_qword_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): qword; [public, alias:'FPC_VAL_QWORD_UNICODESTR']; compilerproc;
  810. Var
  811. SS : ShortString;
  812. begin
  813. fpc_Val_qword_UnicodeStr:=0;
  814. if length(S)>255 then
  815. code:=256
  816. else
  817. begin
  818. SS := S;
  819. Val(SS,fpc_Val_qword_UnicodeStr,Code);
  820. end;
  821. end;
  822. Function fpc_Val_int64_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): Int64; [public, alias:'FPC_VAL_INT64_UNICODESTR']; compilerproc;
  823. Var
  824. SS : ShortString;
  825. begin
  826. fpc_Val_int64_UnicodeStr:=0;
  827. if length(S)>255 then
  828. code:=256
  829. else
  830. begin
  831. SS := S;
  832. Val(SS,fpc_Val_int64_UnicodeStr,Code);
  833. end;
  834. end;
  835. {$endif CPU64}
  836. {$ifndef FPUNONE}
  837. procedure fpc_UnicodeStr_Float(d : ValReal;len,fr,rt : SizeInt;out s : UnicodeString);compilerproc;
  838. var
  839. ss : shortstring;
  840. begin
  841. str_real(len,fr,d,treal_type(rt),ss);
  842. s:=ss;
  843. end;
  844. {$endif}
  845. procedure fpc_unicodestr_enum(ordinal,len:sizeint;typinfo,ord2strindex:pointer;out s:unicodestring);compilerproc;
  846. var ss:shortstring;
  847. begin
  848. fpc_shortstr_enum(ordinal,len,typinfo,ord2strindex,ss);
  849. s:=ss;
  850. end;
  851. procedure fpc_unicodestr_bool(b : boolean;len:sizeint;out s:unicodestring);compilerproc;
  852. var ss:shortstring;
  853. begin
  854. fpc_shortstr_bool(b,len,ss);
  855. s:=ss;
  856. end;
  857. {$ifdef FPC_HAS_STR_CURRENCY}
  858. procedure fpc_UnicodeStr_Currency(c : Currency;len,fr : SizeInt;out s : UnicodeString);compilerproc;
  859. var
  860. ss : shortstring;
  861. begin
  862. str(c:len:fr,ss);
  863. s:=ss;
  864. end;
  865. {$endif FPC_HAS_STR_CURRENCY}
  866. Procedure fpc_UnicodeStr_SInt(v : ValSint; Len : SizeInt; out S : UnicodeString);compilerproc;
  867. Var
  868. SS : ShortString;
  869. begin
  870. Str (v:Len,SS);
  871. S:=SS;
  872. end;
  873. Procedure fpc_UnicodeStr_UInt(v : ValUInt;Len : SizeInt; out S : UnicodeString);compilerproc;
  874. Var
  875. SS : ShortString;
  876. begin
  877. str(v:Len,SS);
  878. S:=SS;
  879. end;
  880. {$ifndef CPU64}
  881. Procedure fpc_UnicodeStr_Int64(v : Int64; Len : SizeInt; out S : UnicodeString);compilerproc;
  882. Var
  883. SS : ShortString;
  884. begin
  885. Str (v:Len,SS);
  886. S:=SS;
  887. end;
  888. Procedure fpc_UnicodeStr_Qword(v : Qword;Len : SizeInt; out S : UnicodeString);compilerproc;
  889. Var
  890. SS : ShortString;
  891. begin
  892. str(v:Len,SS);
  893. S:=SS;
  894. end;
  895. {$endif CPU64}
  896. *)
  897. (*
  898. { converts an utf-16 code point or surrogate pair to utf-32 }
  899. function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
  900. var
  901. w: unicodechar;
  902. begin
  903. { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
  904. { are the same in UTF-32 }
  905. w:=s[index];
  906. if (w<=#$d7ff) or
  907. (w>=#$e000) then
  908. begin
  909. result:=UCS4Char(w);
  910. len:=1;
  911. end
  912. { valid surrogate pair? }
  913. else if (w<=#$dbff) and
  914. { w>=#$d7ff check not needed, checked above }
  915. (index<length(s)) and
  916. (s[index+1]>=#$dc00) and
  917. (s[index+1]<=#$dfff) then
  918. { convert the surrogate pair to UTF-32 }
  919. begin
  920. result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
  921. len:=2;
  922. end
  923. else
  924. { invalid surrogate -> do nothing }
  925. begin
  926. result:=UCS4Char(w);
  927. len:=1;
  928. end;
  929. end;
  930. function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  931. begin
  932. if assigned(Source) then
  933. Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
  934. else
  935. Result:=0;
  936. end;
  937. function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar; SourceChars: SizeUInt): SizeUInt;
  938. var
  939. i,j : SizeUInt;
  940. w : word;
  941. lw : longword;
  942. len : longint;
  943. begin
  944. result:=0;
  945. if source=nil then
  946. exit;
  947. i:=0;
  948. j:=0;
  949. if assigned(Dest) then
  950. begin
  951. while (i<SourceChars) and (j<MaxDestBytes) do
  952. begin
  953. w:=word(Source[i]);
  954. case w of
  955. 0..$7f:
  956. begin
  957. Dest[j]:=char(w);
  958. inc(j);
  959. end;
  960. $80..$7ff:
  961. begin
  962. if j+1>=MaxDestBytes then
  963. break;
  964. Dest[j]:=char($c0 or (w shr 6));
  965. Dest[j+1]:=char($80 or (w and $3f));
  966. inc(j,2);
  967. end;
  968. $800..$d7ff,$e000..$ffff:
  969. begin
  970. if j+2>=MaxDestBytes then
  971. break;
  972. Dest[j]:=char($e0 or (w shr 12));
  973. Dest[j+1]:=char($80 or ((w shr 6) and $3f));
  974. Dest[j+2]:=char($80 or (w and $3f));
  975. inc(j,3);
  976. end;
  977. $d800..$dbff:
  978. {High Surrogates}
  979. begin
  980. if j+3>=MaxDestBytes then
  981. break;
  982. if (i<sourcechars-1) and
  983. (word(Source[i+1]) >= $dc00) and
  984. (word(Source[i+1]) <= $dfff) then
  985. begin
  986. lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
  987. Dest[j]:=char($f0 or (lw shr 18));
  988. Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
  989. Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
  990. Dest[j+3]:=char($80 or (lw and $3f));
  991. inc(j,4);
  992. inc(i);
  993. end;
  994. end;
  995. end;
  996. inc(i);
  997. end;
  998. if j>SizeUInt(MaxDestBytes-1) then
  999. j:=MaxDestBytes-1;
  1000. Dest[j]:=#0;
  1001. end
  1002. else
  1003. begin
  1004. while i<SourceChars do
  1005. begin
  1006. case word(Source[i]) of
  1007. $0..$7f:
  1008. inc(j);
  1009. $80..$7ff:
  1010. inc(j,2);
  1011. $800..$d7ff,$e000..$ffff:
  1012. inc(j,3);
  1013. $d800..$dbff:
  1014. begin
  1015. if (i<sourcechars-1) and
  1016. (word(Source[i+1]) >= $dc00) and
  1017. (word(Source[i+1]) <= $dfff) then
  1018. begin
  1019. inc(j,4);
  1020. inc(i);
  1021. end;
  1022. end;
  1023. end;
  1024. inc(i);
  1025. end;
  1026. end;
  1027. result:=j+1;
  1028. end;
  1029. function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  1030. begin
  1031. if assigned(Source) then
  1032. Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
  1033. else
  1034. Result:=0;
  1035. end;
  1036. function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
  1037. const
  1038. UNICODE_INVALID=63;
  1039. var
  1040. InputUTF8: SizeUInt;
  1041. IBYTE: BYTE;
  1042. OutputUnicode: SizeUInt;
  1043. PRECHAR: SizeUInt;
  1044. TempBYTE: BYTE;
  1045. CharLen: SizeUint;
  1046. LookAhead: SizeUInt;
  1047. UC: SizeUInt;
  1048. begin
  1049. if not assigned(Source) then
  1050. begin
  1051. result:=0;
  1052. exit;
  1053. end;
  1054. result:=SizeUInt(-1);
  1055. InputUTF8:=0;
  1056. OutputUnicode:=0;
  1057. PreChar:=0;
  1058. if Assigned(Dest) Then
  1059. begin
  1060. while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
  1061. begin
  1062. IBYTE:=byte(Source[InputUTF8]);
  1063. if (IBYTE and $80) = 0 then
  1064. begin
  1065. //One character US-ASCII, convert it to unicode
  1066. if IBYTE = 10 then
  1067. begin
  1068. If (PreChar<>13) and FALSE then
  1069. begin
  1070. //Expand to crlf, conform UTF-8.
  1071. //This procedure will break the memory alocation by
  1072. //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
  1073. if OutputUnicode+1<MaxDestChars then
  1074. begin
  1075. Dest[OutputUnicode]:=WideChar(13);
  1076. inc(OutputUnicode);
  1077. Dest[OutputUnicode]:=WideChar(10);
  1078. inc(OutputUnicode);
  1079. PreChar:=10;
  1080. end
  1081. else
  1082. begin
  1083. Dest[OutputUnicode]:=WideChar(13);
  1084. inc(OutputUnicode);
  1085. end;
  1086. end
  1087. else
  1088. begin
  1089. Dest[OutputUnicode]:=WideChar(IBYTE);
  1090. inc(OutputUnicode);
  1091. PreChar:=IBYTE;
  1092. end;
  1093. end
  1094. else
  1095. begin
  1096. Dest[OutputUnicode]:=WideChar(IBYTE);
  1097. inc(OutputUnicode);
  1098. PreChar:=IBYTE;
  1099. end;
  1100. inc(InputUTF8);
  1101. end
  1102. else
  1103. begin
  1104. TempByte:=IBYTE;
  1105. CharLen:=0;
  1106. while (TempBYTE and $80)<>0 do
  1107. begin
  1108. TempBYTE:=(TempBYTE shl 1) and $FE;
  1109. inc(CharLen);
  1110. end;
  1111. //Test for the "CharLen" conforms UTF-8 string
  1112. //This means the 10xxxxxx pattern.
  1113. if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
  1114. begin
  1115. //Insuficient chars in string to decode
  1116. //UTF-8 array. Fallback to single char.
  1117. CharLen:= 1;
  1118. end;
  1119. for LookAhead := 1 to CharLen-1 do
  1120. begin
  1121. if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
  1122. ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
  1123. begin
  1124. //Invalid UTF-8 sequence, fallback.
  1125. CharLen:= LookAhead;
  1126. break;
  1127. end;
  1128. end;
  1129. UC:=$FFFF;
  1130. case CharLen of
  1131. 1: begin
  1132. //Not valid UTF-8 sequence
  1133. UC:=UNICODE_INVALID;
  1134. end;
  1135. 2: begin
  1136. //Two bytes UTF, convert it
  1137. UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
  1138. UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
  1139. if UC <= $7F then
  1140. begin
  1141. //Invalid UTF sequence.
  1142. UC:=UNICODE_INVALID;
  1143. end;
  1144. end;
  1145. 3: begin
  1146. //Three bytes, convert it to unicode
  1147. UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
  1148. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
  1149. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
  1150. if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
  1151. begin
  1152. //Invalid UTF-8 sequence
  1153. UC:= UNICODE_INVALID;
  1154. End;
  1155. end;
  1156. 4: begin
  1157. //Four bytes, convert it to two unicode characters
  1158. UC:= (byte(Source[InputUTF8]) and $07) shl 18;
  1159. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
  1160. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
  1161. UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
  1162. if (UC < $10000) or (UC > $10FFFF) then
  1163. begin
  1164. UC:= UNICODE_INVALID;
  1165. end
  1166. else
  1167. begin
  1168. { only store pair if room }
  1169. dec(UC,$10000);
  1170. if (OutputUnicode<MaxDestChars-1) then
  1171. begin
  1172. Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
  1173. inc(OutputUnicode);
  1174. UC:=(UC and $3ff) + $DC00;
  1175. end
  1176. else
  1177. begin
  1178. InputUTF8:= InputUTF8 + CharLen;
  1179. { don't store anything }
  1180. CharLen:=0;
  1181. end;
  1182. end;
  1183. end;
  1184. 5,6,7: begin
  1185. //Invalid UTF8 to unicode conversion,
  1186. //mask it as invalid UNICODE too.
  1187. UC:=UNICODE_INVALID;
  1188. end;
  1189. end;
  1190. if CharLen > 0 then
  1191. begin
  1192. PreChar:=UC;
  1193. Dest[OutputUnicode]:=WideChar(UC);
  1194. inc(OutputUnicode);
  1195. end;
  1196. InputUTF8:= InputUTF8 + CharLen;
  1197. end;
  1198. end;
  1199. Result:=OutputUnicode+1;
  1200. end
  1201. else
  1202. begin
  1203. while (InputUTF8<SourceBytes) do
  1204. begin
  1205. IBYTE:=byte(Source[InputUTF8]);
  1206. if (IBYTE and $80) = 0 then
  1207. begin
  1208. //One character US-ASCII, convert it to unicode
  1209. if IBYTE = 10 then
  1210. begin
  1211. if (PreChar<>13) and FALSE then
  1212. begin
  1213. //Expand to crlf, conform UTF-8.
  1214. //This procedure will break the memory alocation by
  1215. //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
  1216. inc(OutputUnicode,2);
  1217. PreChar:=10;
  1218. end
  1219. else
  1220. begin
  1221. inc(OutputUnicode);
  1222. PreChar:=IBYTE;
  1223. end;
  1224. end
  1225. else
  1226. begin
  1227. inc(OutputUnicode);
  1228. PreChar:=IBYTE;
  1229. end;
  1230. inc(InputUTF8);
  1231. end
  1232. else
  1233. begin
  1234. TempByte:=IBYTE;
  1235. CharLen:=0;
  1236. while (TempBYTE and $80)<>0 do
  1237. begin
  1238. TempBYTE:=(TempBYTE shl 1) and $FE;
  1239. inc(CharLen);
  1240. end;
  1241. //Test for the "CharLen" conforms UTF-8 string
  1242. //This means the 10xxxxxx pattern.
  1243. if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
  1244. begin
  1245. //Insuficient chars in string to decode
  1246. //UTF-8 array. Fallback to single char.
  1247. CharLen:= 1;
  1248. end;
  1249. for LookAhead := 1 to CharLen-1 do
  1250. begin
  1251. if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
  1252. ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
  1253. begin
  1254. //Invalid UTF-8 sequence, fallback.
  1255. CharLen:= LookAhead;
  1256. break;
  1257. end;
  1258. end;
  1259. UC:=$FFFF;
  1260. case CharLen of
  1261. 1: begin
  1262. //Not valid UTF-8 sequence
  1263. UC:=UNICODE_INVALID;
  1264. end;
  1265. 2: begin
  1266. //Two bytes UTF, convert it
  1267. UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
  1268. UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
  1269. if UC <= $7F then
  1270. begin
  1271. //Invalid UTF sequence.
  1272. UC:=UNICODE_INVALID;
  1273. end;
  1274. end;
  1275. 3: begin
  1276. //Three bytes, convert it to unicode
  1277. UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
  1278. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
  1279. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
  1280. If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
  1281. begin
  1282. //Invalid UTF-8 sequence
  1283. UC:= UNICODE_INVALID;
  1284. end;
  1285. end;
  1286. 4: begin
  1287. //Four bytes, convert it to two unicode characters
  1288. UC:= (byte(Source[InputUTF8]) and $07) shl 18;
  1289. UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
  1290. UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
  1291. UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
  1292. if (UC < $10000) or (UC > $10FFFF) then
  1293. UC:= UNICODE_INVALID
  1294. else
  1295. { extra character character }
  1296. inc(OutputUnicode);
  1297. end;
  1298. 5,6,7: begin
  1299. //Invalid UTF8 to unicode conversion,
  1300. //mask it as invalid UNICODE too.
  1301. UC:=UNICODE_INVALID;
  1302. end;
  1303. end;
  1304. if CharLen > 0 then
  1305. begin
  1306. PreChar:=UC;
  1307. inc(OutputUnicode);
  1308. end;
  1309. InputUTF8:= InputUTF8 + CharLen;
  1310. end;
  1311. end;
  1312. Result:=OutputUnicode+1;
  1313. end;
  1314. end;
  1315. function UTF8Encode(const s : Ansistring) : UTF8String; inline;
  1316. begin
  1317. Result:=UTF8Encode(UnicodeString(s));
  1318. end;
  1319. function UTF8Encode(const s : UnicodeString) : UTF8String;
  1320. var
  1321. i : SizeInt;
  1322. hs : UTF8String;
  1323. begin
  1324. result:='';
  1325. if s='' then
  1326. exit;
  1327. SetLength(hs,length(s)*3);
  1328. i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PUnicodeChar(s),length(s));
  1329. if i>0 then
  1330. begin
  1331. SetLength(hs,i-1);
  1332. result:=hs;
  1333. end;
  1334. end;
  1335. function UTF8Decode(const s : UTF8String): UnicodeString;
  1336. var
  1337. i : SizeInt;
  1338. hs : UnicodeString;
  1339. begin
  1340. result:='';
  1341. if s='' then
  1342. exit;
  1343. SetLength(hs,length(s));
  1344. i:=Utf8ToUnicode(PUnicodeChar(hs),length(hs)+1,pchar(s),length(s));
  1345. if i>0 then
  1346. begin
  1347. SetLength(hs,i-1);
  1348. result:=hs;
  1349. end;
  1350. end;
  1351. function AnsiToUtf8(const s : ansistring): UTF8String;{$ifdef SYSTEMINLINE}inline;{$endif}
  1352. begin
  1353. Result:=Utf8Encode(s);
  1354. end;
  1355. function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inline;{$endif}
  1356. begin
  1357. Result:=Utf8Decode(s);
  1358. end;
  1359. function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;
  1360. var
  1361. i, slen,
  1362. destindex : SizeInt;
  1363. len : longint;
  1364. begin
  1365. slen:=length(s);
  1366. setlength(result,slen+1);
  1367. i:=1;
  1368. destindex:=0;
  1369. while (i<=slen) do
  1370. begin
  1371. result[destindex]:=utf16toutf32(s,i,len);
  1372. inc(destindex);
  1373. inc(i,len);
  1374. end;
  1375. { destindex <= slen (surrogate pairs may have been merged) }
  1376. { destindex+1 for terminating #0 (dynamic arrays are }
  1377. { implicitely filled with zero) }
  1378. setlength(result,destindex+1);
  1379. end;
  1380. { concatenates an utf-32 char to a unicodestring. S *must* be unique when entering. }
  1381. procedure ConcatUTF32ToUnicodeStr(const nc: UCS4Char; var S: UnicodeString; var index: SizeInt);
  1382. var
  1383. p : PUnicodeChar;
  1384. begin
  1385. { if nc > $ffff, we need two places }
  1386. if (index+ord(nc > $ffff)>length(s)) then
  1387. if (length(s) < 10*256) then
  1388. setlength(s,length(s)+10)
  1389. else
  1390. setlength(s,length(s)+length(s) shr 8);
  1391. { we know that s is unique -> avoid uniquestring calls}
  1392. p:=@s[index];
  1393. if (nc<$ffff) then
  1394. begin
  1395. p^:=unicodechar(nc);
  1396. inc(index);
  1397. end
  1398. else if (dword(nc)<=$10ffff) then
  1399. begin
  1400. p^:=unicodechar((nc - $10000) shr 10 + $d800);
  1401. (p+1)^:=unicodechar((nc - $10000) and $3ff + $dc00);
  1402. inc(index,2);
  1403. end
  1404. else
  1405. { invalid code point }
  1406. begin
  1407. p^:='?';
  1408. inc(index);
  1409. end;
  1410. end;
  1411. function UCS4StringToUnicodeString(const s : UCS4String) : UnicodeString;
  1412. var
  1413. i : SizeInt;
  1414. resindex : SizeInt;
  1415. begin
  1416. { skip terminating #0 }
  1417. SetLength(result,length(s)-1);
  1418. resindex:=1;
  1419. for i:=0 to high(s)-1 do
  1420. ConcatUTF32ToUnicodeStr(s[i],result,resindex);
  1421. { adjust result length (may be too big due to growing }
  1422. { for surrogate pairs) }
  1423. setlength(result,resindex-1);
  1424. end;
  1425. function WideStringToUCS4String(const s : WideString) : UCS4String;
  1426. var
  1427. i, slen,
  1428. destindex : SizeInt;
  1429. len : longint;
  1430. begin
  1431. slen:=length(s);
  1432. setlength(result,slen+1);
  1433. i:=1;
  1434. destindex:=0;
  1435. while (i<=slen) do
  1436. begin
  1437. result[destindex]:=utf16toutf32(s,i,len);
  1438. inc(destindex);
  1439. inc(i,len);
  1440. end;
  1441. { destindex <= slen (surrogate pairs may have been merged) }
  1442. { destindex+1 for terminating #0 (dynamic arrays are }
  1443. { implicitely filled with zero) }
  1444. setlength(result,destindex+1);
  1445. end;
  1446. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  1447. procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
  1448. var
  1449. p : PWideChar;
  1450. begin
  1451. { if nc > $ffff, we need two places }
  1452. if (index+ord(nc > $ffff)>length(s)) then
  1453. if (length(s) < 10*256) then
  1454. setlength(s,length(s)+10)
  1455. else
  1456. setlength(s,length(s)+length(s) shr 8);
  1457. { we know that s is unique -> avoid uniquestring calls}
  1458. p:=@s[index];
  1459. if (nc<$ffff) then
  1460. begin
  1461. p^:=widechar(nc);
  1462. inc(index);
  1463. end
  1464. else if (dword(nc)<=$10ffff) then
  1465. begin
  1466. p^:=widechar((nc - $10000) shr 10 + $d800);
  1467. (p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
  1468. inc(index,2);
  1469. end
  1470. else
  1471. { invalid code point }
  1472. begin
  1473. p^:='?';
  1474. inc(index);
  1475. end;
  1476. end;
  1477. function UCS4StringToWideString(const s : UCS4String) : WideString;
  1478. var
  1479. i : SizeInt;
  1480. resindex : SizeInt;
  1481. begin
  1482. { skip terminating #0 }
  1483. SetLength(result,length(s)-1);
  1484. resindex:=1;
  1485. for i:=0 to high(s)-1 do
  1486. ConcatUTF32ToWideStr(s[i],result,resindex);
  1487. { adjust result length (may be too big due to growing }
  1488. { for surrogate pairs) }
  1489. setlength(result,resindex-1);
  1490. end;
  1491. const
  1492. SNoUnicodestrings = 'This binary has no unicodestrings support compiled in.';
  1493. SRecompileWithUnicodestrings = 'Recompile the application with a unicodestrings-manager in the program uses clause.';
  1494. *)
  1495. function CompareUnicodeString(const s1, s2 : UnicodeString) : PtrInt;
  1496. begin
  1497. widestringmanager.collator.setStrength(JTCollator.IDENTICAL);
  1498. result:=widestringmanager.collator.compare(s1,s2);
  1499. end;
  1500. function CompareTextUnicodeString(const s1, s2 : UnicodeString): PtrInt;
  1501. begin
  1502. widestringmanager.collator.setStrength(JTCollator.TERTIARY);
  1503. result:=widestringmanager.collator.compare(s1,s2);
  1504. end;
  1505. constructor TUnicodeStringManager.create;
  1506. begin
  1507. end;
  1508. procedure initunicodestringmanager;
  1509. begin
  1510. widestringmanager:=TUnicodeStringManager.create;
  1511. widestringmanager.collator:=JTCollator.getInstance;
  1512. end;