tw13075.pp 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. {$mode objfpc}{$H+}
  2. uses
  3. {$ifdef unix}
  4. cwstring,
  5. {$endif}
  6. Classes, SysUtils;
  7. function localUnicodeToUTF8(u: cardinal; Buf: PChar): integer;
  8. procedure RaiseInvalidUnicode;
  9. begin
  10. raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(u));
  11. end;
  12. begin
  13. case u of
  14. 0..$7f:
  15. begin
  16. Result:=1;
  17. Buf[0]:=char(byte(u));
  18. end;
  19. $80..$7ff:
  20. begin
  21. Result:=2;
  22. Buf[0]:=char(byte($c0 or (u shr 6)));
  23. Buf[1]:=char(byte($80 or (u and $3f)));
  24. end;
  25. $800..$ffff:
  26. begin
  27. Result:=3;
  28. Buf[0]:=char(byte($e0 or (u shr 12)));
  29. Buf[1]:=char(byte((u shr 6) and $3f) or $80);
  30. Buf[2]:=char(byte(u and $3f) or $80);
  31. end;
  32. $10000..$10ffff:
  33. begin
  34. Result:=4;
  35. Buf[0]:=char(byte($f0 or (u shr 18)));
  36. Buf[1]:=char(byte((u shr 12) and $3f) or $80);
  37. Buf[2]:=char(byte((u shr 6) and $3f) or $80);
  38. Buf[3]:=char(byte(u and $3f) or $80);
  39. end;
  40. else
  41. RaiseInvalidUnicode;
  42. end;
  43. end;
  44. function localUnicodeToUTF8(u: cardinal): shortstring;
  45. begin
  46. Result[0]:=chr(localUnicodeToUTF8(u,@Result[1]));
  47. end;
  48. function localUnicodeToUTF16(u: cardinal): widestring;
  49. begin
  50. // u should be <= $10FFFF to fit into UTF-16
  51. if u < $10000 then
  52. // Note: codepoints $D800 - $DFFF are reserved
  53. Result:=widechar(u)
  54. else
  55. Result:=widechar($D800+((u - $10000) shr 10))+widechar($DC00+((u - $10000) and $3ff));
  56. end;
  57. function UnicodeToCESU8(u: cardinal; Buf: PChar): integer;
  58. procedure RaiseInvalidUnicode;
  59. begin
  60. raise Exception.Create('UnicodeToCESU8: invalid unicode: '+IntToStr(u));
  61. end;
  62. var
  63. st: widestring;
  64. begin
  65. case u of
  66. 0..$ffff:
  67. begin
  68. Result:=localUnicodeToUTF8(u,Buf);
  69. end;
  70. $10000..$10ffff:
  71. begin
  72. st := localUnicodeToUTF16(u);
  73. Result:=6;
  74. Buf[0]:=char(byte($e0 or (ord(st[1]) shr 12)));
  75. Buf[1]:=char(byte((ord(st[1]) shr 6) and $3f) or $80);
  76. Buf[2]:=char(byte(ord(st[1]) and $3f) or $80);
  77. Buf[3]:=char(byte($e0 or (ord(st[2]) shr 12)));
  78. Buf[4]:=char(byte((ord(st[2]) shr 6) and $3f) or $80);
  79. Buf[5]:=char(byte(ord(st[2]) and $3f) or $80);
  80. end;
  81. else
  82. RaiseInvalidUnicode;
  83. end;
  84. end;
  85. function UnicodeToCESU8(u: cardinal): utf8string;
  86. begin
  87. setlength(result,1000);
  88. setlength(result,UnicodeToCESU8(u,@Result[1]));
  89. end;
  90. procedure dotest;
  91. var
  92. s1,s2: utf8string;
  93. w1,w2: unicodestring;
  94. s3,s4: utf8string;
  95. i: longint;
  96. begin
  97. s1 := localUnicodeToUTF8 ($10300);
  98. s2 := UnicodeToCESU8 ($10300);
  99. setlength(w1,20);
  100. setlength(w2,20);
  101. // -1 because UTF8ToUnicode returns a null-terminated string
  102. setlength(w1,UTF8ToUnicode(punicodechar(@w1[1]),length(w1),pchar(s1),Length(s1))-1);
  103. setlength(w2,UTF8ToUnicode(punicodechar(@w2[1]),length(w2),pchar(s2),Length(s2))-1);
  104. (*
  105. writeln('len: ',length(w1),' - "',w1,'"');
  106. write(' ');
  107. for i:= 1 to length(w1) do
  108. write('#$',hexstr(ord(w1[i]),4));
  109. writeln;
  110. writeln('len: ',length(w2),' - "',w2,'"');
  111. write(' ');
  112. for i:= 1 to length(w2) do
  113. write('#$',hexstr(ord(w2[i]),4));
  114. writeln;
  115. writeln;
  116. *)
  117. setlength(s3,20);
  118. setlength(s4,20);
  119. // -1 because UnicodeToUTF8 returns a null-terminated string
  120. setlength(s3,UnicodeToUTF8(@s3[1],length(s3),punicodechar(@w1[1]),length(w1))-1);
  121. setlength(s4,UnicodeToUTF8(@s4[1],length(s4),punicodechar(@w2[1]),length(w2))-1);
  122. if (s3<>s1) or
  123. { invalid: CESU-8 }
  124. (w2<>'??') or
  125. (s4<>'??') then
  126. begin
  127. writeln('len: ',length(s1),' - "',s1,'"');
  128. write(' ');
  129. for i:= 1 to length(s1) do
  130. write('#$',hexstr(ord(s1[i]),2));
  131. writeln;
  132. writeln('len: ',length(s2),' - "',s2,'"');
  133. write(' ');
  134. for i:= 1 to length(s2) do
  135. write('#$',hexstr(ord(s2[i]),2));
  136. writeln;
  137. writeln('len: ',length(s3),' - "',s3,'"');
  138. write(' ');
  139. for i:= 1 to length(s3) do
  140. write('#$',hexstr(ord(s3[i]),2));
  141. writeln;
  142. writeln('len: ',length(s4),' - "',s4,'"');
  143. write(' ');
  144. for i:= 1 to length(s4) do
  145. write('#$',hexstr(ord(s4[i]),2));
  146. writeln;
  147. halt(1);
  148. end;
  149. end;
  150. begin
  151. dotest;
  152. end.