encodingstest.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. #include "unittest.h"
  2. #include "rapidjson/filereadstream.h"
  3. #include "rapidjson/filewritestream.h"
  4. #include "rapidjson/encodedstream.h"
  5. #include "rapidjson/stringbuffer.h"
  6. using namespace rapidjson;
  7. // Verification of encoders/decoders with Hoehrmann's UTF8 decoder
  8. // Unicode block names and ranges as listed in http://www.unicode.org/Public/UNIDATA/Blocks.txt
  // Layout: a flat list of inclusive [first, last] code point pairs. The tests
  // in this file walk the array two entries at a time and stop when the first
  // entry of a pair equals the 0xFFFFFFFF sentinel at the end.
  // Code points that fall between two listed blocks are not exercised.
  9. static const unsigned kCodepointRanges[] = {
  10. 0x0000, 0x007F, // Basic Latin
  11. 0x0080, 0x00FF, // Latin-1 Supplement
  12. 0x0100, 0x017F, // Latin Extended-A
  13. 0x0180, 0x024F, // Latin Extended-B
  14. 0x0250, 0x02AF, // IPA Extensions
  15. 0x02B0, 0x02FF, // Spacing Modifier Letters
  16. 0x0300, 0x036F, // Combining Diacritical Marks
  17. 0x0370, 0x03FF, // Greek and Coptic
  18. 0x0400, 0x04FF, // Cyrillic
  19. 0x0500, 0x052F, // Cyrillic Supplement
  20. 0x0530, 0x058F, // Armenian
  21. 0x0590, 0x05FF, // Hebrew
  22. 0x0600, 0x06FF, // Arabic
  23. 0x0700, 0x074F, // Syriac
  24. 0x0750, 0x077F, // Arabic Supplement
  25. 0x0780, 0x07BF, // Thaana
  26. 0x07C0, 0x07FF, // NKo
  27. 0x0800, 0x083F, // Samaritan
  28. 0x0840, 0x085F, // Mandaic
  29. 0x0900, 0x097F, // Devanagari
  30. 0x0980, 0x09FF, // Bengali
  31. 0x0A00, 0x0A7F, // Gurmukhi
  32. 0x0A80, 0x0AFF, // Gujarati
  33. 0x0B00, 0x0B7F, // Oriya
  34. 0x0B80, 0x0BFF, // Tamil
  35. 0x0C00, 0x0C7F, // Telugu
  36. 0x0C80, 0x0CFF, // Kannada
  37. 0x0D00, 0x0D7F, // Malayalam
  38. 0x0D80, 0x0DFF, // Sinhala
  39. 0x0E00, 0x0E7F, // Thai
  40. 0x0E80, 0x0EFF, // Lao
  41. 0x0F00, 0x0FFF, // Tibetan
  42. 0x1000, 0x109F, // Myanmar
  43. 0x10A0, 0x10FF, // Georgian
  44. 0x1100, 0x11FF, // Hangul Jamo
  45. 0x1200, 0x137F, // Ethiopic
  46. 0x1380, 0x139F, // Ethiopic Supplement
  47. 0x13A0, 0x13FF, // Cherokee
  48. 0x1400, 0x167F, // Unified Canadian Aboriginal Syllabics
  49. 0x1680, 0x169F, // Ogham
  50. 0x16A0, 0x16FF, // Runic
  51. 0x1700, 0x171F, // Tagalog
  52. 0x1720, 0x173F, // Hanunoo
  53. 0x1740, 0x175F, // Buhid
  54. 0x1760, 0x177F, // Tagbanwa
  55. 0x1780, 0x17FF, // Khmer
  56. 0x1800, 0x18AF, // Mongolian
  57. 0x18B0, 0x18FF, // Unified Canadian Aboriginal Syllabics Extended
  58. 0x1900, 0x194F, // Limbu
  59. 0x1950, 0x197F, // Tai Le
  60. 0x1980, 0x19DF, // New Tai Lue
  61. 0x19E0, 0x19FF, // Khmer Symbols
  62. 0x1A00, 0x1A1F, // Buginese
  63. 0x1A20, 0x1AAF, // Tai Tham
  64. 0x1B00, 0x1B7F, // Balinese
  65. 0x1B80, 0x1BBF, // Sundanese
  66. 0x1BC0, 0x1BFF, // Batak
  67. 0x1C00, 0x1C4F, // Lepcha
  68. 0x1C50, 0x1C7F, // Ol Chiki
  69. 0x1CD0, 0x1CFF, // Vedic Extensions
  70. 0x1D00, 0x1D7F, // Phonetic Extensions
  71. 0x1D80, 0x1DBF, // Phonetic Extensions Supplement
  72. 0x1DC0, 0x1DFF, // Combining Diacritical Marks Supplement
  73. 0x1E00, 0x1EFF, // Latin Extended Additional
  74. 0x1F00, 0x1FFF, // Greek Extended
  75. 0x2000, 0x206F, // General Punctuation
  76. 0x2070, 0x209F, // Superscripts and Subscripts
  77. 0x20A0, 0x20CF, // Currency Symbols
  78. 0x20D0, 0x20FF, // Combining Diacritical Marks for Symbols
  79. 0x2100, 0x214F, // Letterlike Symbols
  80. 0x2150, 0x218F, // Number Forms
  81. 0x2190, 0x21FF, // Arrows
  82. 0x2200, 0x22FF, // Mathematical Operators
  83. 0x2300, 0x23FF, // Miscellaneous Technical
  84. 0x2400, 0x243F, // Control Pictures
  85. 0x2440, 0x245F, // Optical Character Recognition
  86. 0x2460, 0x24FF, // Enclosed Alphanumerics
  87. 0x2500, 0x257F, // Box Drawing
  88. 0x2580, 0x259F, // Block Elements
  89. 0x25A0, 0x25FF, // Geometric Shapes
  90. 0x2600, 0x26FF, // Miscellaneous Symbols
  91. 0x2700, 0x27BF, // Dingbats
  92. 0x27C0, 0x27EF, // Miscellaneous Mathematical Symbols-A
  93. 0x27F0, 0x27FF, // Supplemental Arrows-A
  94. 0x2800, 0x28FF, // Braille Patterns
  95. 0x2900, 0x297F, // Supplemental Arrows-B
  96. 0x2980, 0x29FF, // Miscellaneous Mathematical Symbols-B
  97. 0x2A00, 0x2AFF, // Supplemental Mathematical Operators
  98. 0x2B00, 0x2BFF, // Miscellaneous Symbols and Arrows
  99. 0x2C00, 0x2C5F, // Glagolitic
  100. 0x2C60, 0x2C7F, // Latin Extended-C
  101. 0x2C80, 0x2CFF, // Coptic
  102. 0x2D00, 0x2D2F, // Georgian Supplement
  103. 0x2D30, 0x2D7F, // Tifinagh
  104. 0x2D80, 0x2DDF, // Ethiopic Extended
  105. 0x2DE0, 0x2DFF, // Cyrillic Extended-A
  106. 0x2E00, 0x2E7F, // Supplemental Punctuation
  107. 0x2E80, 0x2EFF, // CJK Radicals Supplement
  108. 0x2F00, 0x2FDF, // Kangxi Radicals
  109. 0x2FF0, 0x2FFF, // Ideographic Description Characters
  110. 0x3000, 0x303F, // CJK Symbols and Punctuation
  111. 0x3040, 0x309F, // Hiragana
  112. 0x30A0, 0x30FF, // Katakana
  113. 0x3100, 0x312F, // Bopomofo
  114. 0x3130, 0x318F, // Hangul Compatibility Jamo
  115. 0x3190, 0x319F, // Kanbun
  116. 0x31A0, 0x31BF, // Bopomofo Extended
  117. 0x31C0, 0x31EF, // CJK Strokes
  118. 0x31F0, 0x31FF, // Katakana Phonetic Extensions
  119. 0x3200, 0x32FF, // Enclosed CJK Letters and Months
  120. 0x3300, 0x33FF, // CJK Compatibility
  121. 0x3400, 0x4DBF, // CJK Unified Ideographs Extension A
  122. 0x4DC0, 0x4DFF, // Yijing Hexagram Symbols
  123. 0x4E00, 0x9FFF, // CJK Unified Ideographs
  124. 0xA000, 0xA48F, // Yi Syllables
  125. 0xA490, 0xA4CF, // Yi Radicals
  126. 0xA4D0, 0xA4FF, // Lisu
  127. 0xA500, 0xA63F, // Vai
  128. 0xA640, 0xA69F, // Cyrillic Extended-B
  129. 0xA6A0, 0xA6FF, // Bamum
  130. 0xA700, 0xA71F, // Modifier Tone Letters
  131. 0xA720, 0xA7FF, // Latin Extended-D
  132. 0xA800, 0xA82F, // Syloti Nagri
  133. 0xA830, 0xA83F, // Common Indic Number Forms
  134. 0xA840, 0xA87F, // Phags-pa
  135. 0xA880, 0xA8DF, // Saurashtra
  136. 0xA8E0, 0xA8FF, // Devanagari Extended
  137. 0xA900, 0xA92F, // Kayah Li
  138. 0xA930, 0xA95F, // Rejang
  139. 0xA960, 0xA97F, // Hangul Jamo Extended-A
  140. 0xA980, 0xA9DF, // Javanese
  141. 0xAA00, 0xAA5F, // Cham
  142. 0xAA60, 0xAA7F, // Myanmar Extended-A
  143. 0xAA80, 0xAADF, // Tai Viet
  144. 0xAB00, 0xAB2F, // Ethiopic Extended-A
  145. 0xABC0, 0xABFF, // Meetei Mayek
  146. 0xAC00, 0xD7AF, // Hangul Syllables
  147. 0xD7B0, 0xD7FF, // Hangul Jamo Extended-B
  // The three surrogate ranges below are commented out, so they are not part of the table.
  148. //0xD800, 0xDB7F, // High Surrogates
  149. //0xDB80, 0xDBFF, // High Private Use Surrogates
  150. //0xDC00, 0xDFFF, // Low Surrogates
  151. 0xE000, 0xF8FF, // Private Use Area
  152. 0xF900, 0xFAFF, // CJK Compatibility Ideographs
  153. 0xFB00, 0xFB4F, // Alphabetic Presentation Forms
  154. 0xFB50, 0xFDFF, // Arabic Presentation Forms-A
  155. 0xFE00, 0xFE0F, // Variation Selectors
  156. 0xFE10, 0xFE1F, // Vertical Forms
  157. 0xFE20, 0xFE2F, // Combining Half Marks
  158. 0xFE30, 0xFE4F, // CJK Compatibility Forms
  159. 0xFE50, 0xFE6F, // Small Form Variants
  160. 0xFE70, 0xFEFF, // Arabic Presentation Forms-B
  161. 0xFF00, 0xFFEF, // Halfwidth and Fullwidth Forms
  162. 0xFFF0, 0xFFFF, // Specials
  163. 0x10000, 0x1007F, // Linear B Syllabary
  164. 0x10080, 0x100FF, // Linear B Ideograms
  165. 0x10100, 0x1013F, // Aegean Numbers
  166. 0x10140, 0x1018F, // Ancient Greek Numbers
  167. 0x10190, 0x101CF, // Ancient Symbols
  168. 0x101D0, 0x101FF, // Phaistos Disc
  169. 0x10280, 0x1029F, // Lycian
  170. 0x102A0, 0x102DF, // Carian
  171. 0x10300, 0x1032F, // Old Italic
  172. 0x10330, 0x1034F, // Gothic
  173. 0x10380, 0x1039F, // Ugaritic
  174. 0x103A0, 0x103DF, // Old Persian
  175. 0x10400, 0x1044F, // Deseret
  176. 0x10450, 0x1047F, // Shavian
  177. 0x10480, 0x104AF, // Osmanya
  178. 0x10800, 0x1083F, // Cypriot Syllabary
  179. 0x10840, 0x1085F, // Imperial Aramaic
  180. 0x10900, 0x1091F, // Phoenician
  181. 0x10920, 0x1093F, // Lydian
  182. 0x10A00, 0x10A5F, // Kharoshthi
  183. 0x10A60, 0x10A7F, // Old South Arabian
  184. 0x10B00, 0x10B3F, // Avestan
  185. 0x10B40, 0x10B5F, // Inscriptional Parthian
  186. 0x10B60, 0x10B7F, // Inscriptional Pahlavi
  187. 0x10C00, 0x10C4F, // Old Turkic
  188. 0x10E60, 0x10E7F, // Rumi Numeral Symbols
  189. 0x11000, 0x1107F, // Brahmi
  190. 0x11080, 0x110CF, // Kaithi
  191. 0x12000, 0x123FF, // Cuneiform
  192. 0x12400, 0x1247F, // Cuneiform Numbers and Punctuation
  193. 0x13000, 0x1342F, // Egyptian Hieroglyphs
  194. 0x16800, 0x16A3F, // Bamum Supplement
  195. 0x1B000, 0x1B0FF, // Kana Supplement
  196. 0x1D000, 0x1D0FF, // Byzantine Musical Symbols
  197. 0x1D100, 0x1D1FF, // Musical Symbols
  198. 0x1D200, 0x1D24F, // Ancient Greek Musical Notation
  199. 0x1D300, 0x1D35F, // Tai Xuan Jing Symbols
  200. 0x1D360, 0x1D37F, // Counting Rod Numerals
  201. 0x1D400, 0x1D7FF, // Mathematical Alphanumeric Symbols
  202. 0x1F000, 0x1F02F, // Mahjong Tiles
  203. 0x1F030, 0x1F09F, // Domino Tiles
  204. 0x1F0A0, 0x1F0FF, // Playing Cards
  205. 0x1F100, 0x1F1FF, // Enclosed Alphanumeric Supplement
  206. 0x1F200, 0x1F2FF, // Enclosed Ideographic Supplement
  207. 0x1F300, 0x1F5FF, // Miscellaneous Symbols And Pictographs
  208. 0x1F600, 0x1F64F, // Emoticons
  209. 0x1F680, 0x1F6FF, // Transport And Map Symbols
  210. 0x1F700, 0x1F77F, // Alchemical Symbols
  211. 0x20000, 0x2A6DF, // CJK Unified Ideographs Extension B
  212. 0x2A700, 0x2B73F, // CJK Unified Ideographs Extension C
  213. 0x2B740, 0x2B81F, // CJK Unified Ideographs Extension D
  214. 0x2F800, 0x2FA1F, // CJK Compatibility Ideographs Supplement
  215. 0xE0000, 0xE007F, // Tags
  216. 0xE0100, 0xE01EF, // Variation Selectors Supplement
  217. 0xF0000, 0xFFFFF, // Supplementary Private Use Area-A
  218. 0x100000, 0x10FFFF, // Supplementary Private Use Area-B
  219. 0xFFFFFFFF // Sentinel: iteration stops when the first entry of a pair has this value
  220. };
  221. // Copyright (c) 2008-2010 Bjoern Hoehrmann <[email protected]>
  222. // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  // DFA states reported by decode(): UTF8_ACCEPT means a complete code point
  // has just been consumed, UTF8_REJECT means the input is not valid UTF-8.
  #define UTF8_ACCEPT 0u
  #define UTF8_REJECT 12u

  // Lookup data for decode(), in two consecutive parts:
  //   [0, 256)    maps every possible byte value to a character class (0..11);
  //   [256, 364)  is the transition table, indexed by state + character class.
  // States are stored pre-multiplied by 12 (the number of character classes),
  // which is why the state values are 0, 12, 24, ...
  static const unsigned char utf8d[] = {
      // Part 1: byte -> character class. Grouping bytes into classes keeps the
      // transition table small and lets the class double as a shift count for
      // masking the payload bits of a lead byte.
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,  11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

      // Part 2: (state, character class) -> next state.
       0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,   12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
      12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12,   12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
      12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,   12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
      12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,   12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
      12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
  };

  // Feeds one byte into the UTF-8 decoding automaton.
  //
  //   state  in/out: current automaton state; start from UTF8_ACCEPT.
  //   codep  in/out: code point accumulator; holds the decoded code point
  //                  whenever the returned state is UTF8_ACCEPT.
  //   byte   next input byte; it indexes the first 256 table entries, so it
  //          must be in the range 0..255.
  //
  // Returns the new state, which is also written back to *state.
  static inline unsigned decode(unsigned* state, unsigned* codep, unsigned byte) {
      const unsigned charClass = utf8d[byte];

      if (*state == UTF8_ACCEPT) {
          // First byte of a sequence: keep only its payload bits.
          *codep = (0xff >> charClass) & (byte);
      }
      else {
          // Continuation byte: append its low six bits to the accumulator.
          *codep = (byte & 0x3fu) | (*codep << 6);
      }

      *state = utf8d[256 + *state + charClass];
      return *state;
  }
  252. //static bool IsUTF8(unsigned char* s) {
  253. // unsigned codepoint, state = 0;
  254. //
  255. // while (*s)
  256. // decode(&state, &codepoint, *s++);
  257. //
  258. // return state == UTF8_ACCEPT;
  259. //}
  260. TEST(EncodingsTest, UTF8) {
  261. StringBuffer os, os2;
  262. for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
  263. for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
  264. os.Clear();
  265. UTF8<>::Encode(os, codepoint);
  266. const char* encodedStr = os.GetString();
  267. // Decode with Hoehrmann
  268. {
  269. unsigned decodedCodepoint = 0;
  270. unsigned state = 0;
  271. unsigned decodedCount = 0;
  272. for (const char* s = encodedStr; *s; ++s)
  273. if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) {
  274. EXPECT_EQ(codepoint, decodedCodepoint);
  275. decodedCount++;
  276. }
  277. if (*encodedStr) // This decoder cannot handle U+0000
  278. EXPECT_EQ(1u, decodedCount); // Should only contain one code point
  279. EXPECT_EQ(UTF8_ACCEPT, state);
  280. if (UTF8_ACCEPT != state)
  281. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  282. }
  283. // Decode
  284. {
  285. StringStream is(encodedStr);
  286. unsigned decodedCodepoint;
  287. bool result = UTF8<>::Decode(is, &decodedCodepoint);
  288. EXPECT_TRUE(result);
  289. EXPECT_EQ(codepoint, decodedCodepoint);
  290. if (!result || codepoint != decodedCodepoint)
  291. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  292. }
  293. // Validate
  294. {
  295. StringStream is(encodedStr);
  296. os2.Clear();
  297. bool result = UTF8<>::Validate(is, os2);
  298. EXPECT_TRUE(result);
  299. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  300. }
  301. }
  302. }
  303. }
  304. TEST(EncodingsTest, UTF16) {
  305. GenericStringBuffer<UTF16<> > os, os2;
  306. GenericStringBuffer<UTF8<> > utf8os;
  307. for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
  308. for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
  309. os.Clear();
  310. UTF16<>::Encode(os, codepoint);
  311. const UTF16<>::Ch* encodedStr = os.GetString();
  312. // Encode with Hoehrmann's code
  313. if (codepoint != 0) // cannot handle U+0000
  314. {
  315. // encode with UTF8<> first
  316. utf8os.Clear();
  317. UTF8<>::Encode(utf8os, codepoint);
  318. // transcode from UTF8 to UTF16 with Hoehrmann's code
  319. unsigned decodedCodepoint = 0;
  320. unsigned state = 0;
  321. UTF16<>::Ch buffer[3], *p = &buffer[0];
  322. for (const char* s = utf8os.GetString(); *s; ++s) {
  323. if (!decode(&state, &decodedCodepoint, (unsigned char)*s))
  324. break;
  325. }
  326. if (codepoint <= 0xFFFF)
  327. *p++ = static_cast<UTF16<>::Ch>(decodedCodepoint);
  328. else {
  329. // Encode code points above U+FFFF as surrogate pair.
  330. *p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10));
  331. *p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF));
  332. }
  333. *p++ = '\0';
  334. EXPECT_EQ(0, StrCmp(buffer, encodedStr));
  335. }
  336. // Decode
  337. {
  338. GenericStringStream<UTF16<> > is(encodedStr);
  339. unsigned decodedCodepoint;
  340. bool result = UTF16<>::Decode(is, &decodedCodepoint);
  341. EXPECT_TRUE(result);
  342. EXPECT_EQ(codepoint, decodedCodepoint);
  343. if (!result || codepoint != decodedCodepoint)
  344. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  345. }
  346. // Validate
  347. {
  348. GenericStringStream<UTF16<> > is(encodedStr);
  349. os2.Clear();
  350. bool result = UTF16<>::Validate(is, os2);
  351. EXPECT_TRUE(result);
  352. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  353. }
  354. }
  355. }
  356. }
  357. TEST(EncodingsTest, UTF32) {
  358. GenericStringBuffer<UTF32<> > os, os2;
  359. for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
  360. for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
  361. os.Clear();
  362. UTF32<>::Encode(os, codepoint);
  363. const UTF32<>::Ch* encodedStr = os.GetString();
  364. // Decode
  365. {
  366. GenericStringStream<UTF32<> > is(encodedStr);
  367. unsigned decodedCodepoint;
  368. bool result = UTF32<>::Decode(is, &decodedCodepoint);
  369. EXPECT_TRUE(result);
  370. EXPECT_EQ(codepoint, decodedCodepoint);
  371. if (!result || codepoint != decodedCodepoint)
  372. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  373. }
  374. // Validate
  375. {
  376. GenericStringStream<UTF32<> > is(encodedStr);
  377. os2.Clear();
  378. bool result = UTF32<>::Validate(is, os2);
  379. EXPECT_TRUE(result);
  380. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  381. }
  382. }
  383. }
  384. }