testnfd.lpr 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. program testnfd;
  2. {
  3. This program tests the "NormalizeNFD" with the Unicode provided test file.
  4. The test file "NormalizationTest.txt" is to find in the Unicode Character
  5. Database.
  6. }
  7. {$mode objfpc}{$H+}
  8. uses
  9. SysUtils, Classes, Math, unicodedata;
  10. type
  11. TDataPartLine = record
  12. c1, c2, c3, c4, c5 : UCS4String;
  13. end;
  14. PDataPartLine = ^TDataPartLine;
  15. TDataPart = record
  16. Part : AnsiString;
  17. Lines : array of TDataPartLine;
  18. ActualLength : Integer;
  19. end;
  20. PDataPart = ^TDataPart;
  21. const
  22. LINE_LENGTH = 1024;
  23. DEFAULT_DATA_LINE_LENGTH = 25000;
  24. var
  25. p : PAnsiChar;
  26. bufferLength, bufferPos, lineLength, linePos : Integer;
  27. line : ansistring;
  28. totalErrorCount : Integer = 0;
  29. lineCount, errorCount : Integer;
  30. stream : TMemoryStream;
  31. part : ansistring;
  32. c1, c2, c3, c4, c5 : UCS4String;
  33. s1, s2, s3, s4, s5 : UnicodeString;
  34. dataList : array of TDataPart;
  35. dataListActualLength : Integer;
  36. pp, part1 : PDataPart;
  37. function NextLine() : Boolean;
  38. var
  39. locOldPos : Integer;
  40. locOldPointer : PAnsiChar;
  41. begin
  42. Result := False;
  43. locOldPointer := p;
  44. locOldPos := bufferPos;
  45. while (bufferPos < bufferLength) and (p^ <> #10) do begin
  46. Inc(p);
  47. Inc(bufferPos);
  48. end;
  49. if (locOldPos = bufferPos) and (p^ = #10) then begin
  50. lineLength := 0;
  51. Inc(p);
  52. Inc(bufferPos);
  53. linePos := 1;
  54. Result := True;
  55. end else if (locOldPos < bufferPos) then begin
  56. lineLength := (bufferPos - locOldPos) + 1;
  57. Move(locOldPointer^,line[1],lineLength);
  58. if (p^ = #10) then begin
  59. Dec(lineLength);
  60. Inc(p);
  61. Inc(bufferPos);
  62. end;
  63. linePos := 1;
  64. Result := True;
  65. end;
  66. if Result then
  67. Inc(lineCount);
  68. end;
  69. procedure SkipSpace();
  70. begin
  71. while (linePos < lineLength) and (line[linePos] in [' ',#9]) do
  72. Inc(linePos);
  73. end;
  74. function NextToken() : ansistring;
  75. const C_SEPARATORS = [';','#','.','[',']','*','@'];
  76. var
  77. k : Integer;
  78. begin
  79. SkipSpace();
  80. k := linePos;
  81. if (linePos <= lineLength) and (line[linePos] in C_SEPARATORS) then begin
  82. Result := line[linePos];
  83. Inc(linePos);
  84. exit;
  85. end;
  86. while (linePos <= lineLength) and not(line[linePos] in (C_SEPARATORS+[' '])) do
  87. Inc(linePos);
  88. if (linePos > k) then begin
  89. if (line[Min(linePos,lineLength)] in C_SEPARATORS) then
  90. Result := Copy(line,k,(linePos-k))
  91. else
  92. Result := Copy(line,k,(linePos-k+1));
  93. Result := Trim(Result);
  94. end else begin
  95. Result := '';
  96. end;
  97. end;
  98. function ParseLine() : Integer;
  99. var
  100. t : UCS4String;
  101. r : array[0..23] of UCS4String;
  102. rc, k : Integer;
  103. s : ansistring;
  104. begin
  105. rc := 0;
  106. SetLength(c1,0);
  107. SetLength(c2,0);
  108. SetLength(c3,0);
  109. SetLength(c4,0);
  110. SetLength(c5,0);
  111. SetLength(t,0);
  112. while (rc < Length(r)) do begin
  113. s := NextToken();
  114. if (s = '#') then
  115. break;
  116. if (s = '@') then begin
  117. part := NextToken();
  118. rc := 0;
  119. continue;
  120. end;
  121. if (s = '') or (s[1] = '#') then
  122. Break;
  123. if (s <> ';') then begin
  124. k := Length(t);
  125. SetLength(t,(k+1));
  126. t[k] := StrToInt('$' + s);
  127. end else if (s = ';') then begin
  128. k := Length(t);
  129. SetLength(t,(k+1));
  130. t[k] := 0;
  131. r[rc] := Copy(t);
  132. SetLength(t,0);
  133. Inc(rc);
  134. end;
  135. end;
  136. c1 := r[0]; s1 := UCS4StringToUnicodeString(c1);
  137. c2 := r[1]; s2 := UCS4StringToUnicodeString(c2);
  138. c3 := r[2]; s3 := UCS4StringToUnicodeString(c3);
  139. c4 := r[3]; s4 := UCS4StringToUnicodeString(c4);
  140. c5 := r[4]; s5 := UCS4StringToUnicodeString(c5);
  141. Result := rc;
  142. end;
  143. procedure AddDataLine();
  144. var
  145. k : Integer;
  146. p : PDataPart;
  147. pline : PDataPartLine;
  148. begin
  149. p := nil;
  150. for k := Low(dataList) to High(dataList) do begin
  151. if (dataList[k].Part = part) then begin
  152. p := @dataList[k];
  153. break;
  154. end;
  155. end;
  156. if (p = nil) then begin
  157. k := dataListActualLength;
  158. if (k >= Length(dataList)) then
  159. SetLength(dataList,(k+5));
  160. dataListActualLength := k+1;
  161. p := @dataList[k];
  162. p^.Part := part;
  163. end;
  164. k := p^.ActualLength;
  165. if (k >= Length(p^.Lines)) then
  166. SetLength(p^.Lines,(k+DEFAULT_DATA_LINE_LENGTH));
  167. pline := @p^.Lines[k];
  168. pline^.c1 := c1;
  169. pline^.c2 := c2;
  170. pline^.c3 := c3;
  171. pline^.c4 := c4;
  172. pline^.c5 := c5;
  173. p^.ActualLength := k+1;
  174. c1 := nil;
  175. c2 := nil;
  176. c3 := nil;
  177. c4 := nil;
  178. c5 := nil;
  179. end;
  180. function IsInPart(ACodePoint : UCS4Char; APart : PDataPart) : boolean;
  181. var
  182. k : Integer;
  183. pline : PDataPartLine;
  184. begin
  185. pline := @APart^.Lines[0];
  186. for k := 0 to APart^.ActualLength-1 do begin
  187. if (Length(pline^.c1) = 2) and (pline^.c1[0] = ACodePoint) then
  188. exit(True);
  189. Inc(pline);
  190. end;
  191. Result := False;
  192. end;
  193. procedure Prepare();
  194. begin
  195. bufferLength := stream.Size;
  196. bufferPos := 0;
  197. p := stream.Memory;
  198. lineLength := 0;
  199. SetLength(line,LINE_LENGTH);
  200. SetLength(dataList,10);
  201. dataListActualLength := 0;
  202. end;
  203. procedure TestLines();
  204. var
  205. lineErrors : Integer;
  206. begin
  207. while NextLine() do begin
  208. if (ParseLine() < 5) then
  209. continue;
  210. AddDataLine();
  211. lineErrors := 0;
  212. //c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
  213. if (NormalizeNFD(s1) <> s3) then
  214. lineErrors := lineErrors+1;
  215. if (NormalizeNFD(s2) <> s3) then
  216. lineErrors := lineErrors+1;
  217. if (NormalizeNFD(s3) <> s3) then
  218. Inc(errorCount);
  219. //c5 == toNFD(c4) == toNFD(c5)
  220. if (NormalizeNFD(s4) <> s5) then
  221. lineErrors := lineErrors+1;
  222. if (NormalizeNFD(s5) <> s5) then
  223. lineErrors := lineErrors+1;
  224. if (lineErrors <> 0) then
  225. errorCount := errorCount+lineErrors;
  226. end;
  227. end;
  228. {$IFDEF ALL_CODE_POINTS}
  229. procedure TestBmpCodePoints();
  230. var
  231. cp : Word;
  232. s : UnicodeString;
  233. pu : PUC_Prop;
  234. begin
  235. SetLength(s,1);
  236. for cp := Low(Word) to High(Word) do begin
  237. pu := GetProps(cp);
  238. if (pu^.Category <> UGC_Unassigned) and (pu^.Category <> UGC_Surrogate) and
  239. not(IsInPart(cp,part1))
  240. then begin
  241. //X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
  242. PWord(@s[1])^ := cp;
  243. if (NormalizeNFD(s) <> s) then
  244. errorCount := errorCount+1;
  245. end;
  246. end;
  247. end;
  248. procedure TestOBmpCodePoints();
  249. var
  250. cp : UCS4Char;
  251. s : UnicodeString;
  252. pu : PUC_Prop;
  253. begin
  254. SetLength(s,2);
  255. s[1] := 'a'; s[2] := 'a';
  256. for cp := High(Word)+1 to MAX_LEGAL_UTF32 do begin
  257. pu := GetProps(cp);
  258. if (pu^.Category <> UGC_Unassigned) and (pu^.Category <> UGC_Surrogate) and
  259. not(IsInPart(cp,part1))
  260. then begin
  261. //X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
  262. FromUCS4(cp,s[1],s[2]);
  263. if (NormalizeNFD(s) <> s) then
  264. errorCount := errorCount+1;
  265. end;
  266. end;
  267. end;
  268. {$ENDIF ALL_CODE_POINTS}
  269. var
  270. i, c : Integer;
  271. begin
  272. errorCount := 0;
  273. lineCount := 0;
  274. stream := TMemoryStream.Create();
  275. try
  276. stream.LoadFromFile('NormalizationTest.txt');
  277. Prepare();
  278. // Direct tests specified in NormalizationTest.txt
  279. TestLines();
  280. part1 := nil;
  281. c := 0;
  282. for i := 0 to dataListActualLength-1 do begin
  283. pp := @dataList[i];
  284. if (Length(pp^.Lines) <> pp^.ActualLength) then
  285. SetLength(pp^.Lines,pp^.ActualLength);
  286. c := c+pp^.ActualLength;
  287. if SameText(pp^.Part,'Part1') then
  288. part1 := pp;
  289. end;
  290. if (part1 = nil) then
  291. raise Exception.Create('"Part1" not found !');
  292. { $DEFINE ALL_CODE_POINTS}
  293. {$IFDEF ALL_CODE_POINTS}
  294. // Tests for BMP Codepoints not is PART1
  295. TestBmpCodePoints();
  296. // Tests for BMP Codepoints not is PART1
  297. TestOBmpCodePoints();
  298. {$ENDIF ALL_CODE_POINTS}
  299. WriteLn('Line Count = ',lineCount);
  300. WriteLn('Actual Test Line Count = ',c);
  301. WriteLn('Error Count = ',errorCount);
  302. Inc(totalErrorCount,errorCount);
  303. finally
  304. stream.Free();
  305. end;
  306. if (totalErrorCount > 0) then begin
  307. WriteLn('Failed.');
  308. Halt(1);
  309. end;
  310. end.