TextEncodingTest.cpp 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. 
  2. #include "../testTools.h"
  3. #include "../../DFPSR/api/stringAPI.h"
// These tests will fail if the source code document or stored files change their encoding of line breaks.
// Expected content when loading Latin1.txt from the resources folder.
// Stored in a plain (non-U) raw string literal, and only uses characters that fit in 8 bits.
String expected_latin1 =
R"QUOTE(Hello my friend
Hej min vän
Halló, vinur minn
Hei ystäväni
Hola mi amigo
Ciao amico
)QUOTE";
// Warning!
//   String literals containing characters above value 255 must be stored explicitly in unicode literals using U"" instead of "".
//     Because string literals do not begin with a byte order mark to say which encoding is being used.
//   Also make sure to save the source code document using a byte order mark so that the C++ compiler receives the correct symbol.
// Multilingual sample text shared by the UTF-8 and UTF-16 expectations, and used for the file round-trip tests.
//   The last line holds a character above U+FFFF, which UTF-16 can only store as a pair of 16-bit code units.
String unicodeContent =
UR"QUOTE(Hello my friend
Hej min vän
Halló, vinur minn
Hei ystäväni
Hola mi amigo
Ciao amico
你好我的朋友
こんにちは、友よ
नमस्ते मेरो साथी
Talofa laʻu uo
Xin chào bạn của tôi
העלא מיין פרייַנד
안녕 내 친구
سلام دوست من
ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
ওহে, বন্ধু আমার
សួស្តី​សម្លាញ់
Γεια σου φίλε μου
Привет, мой друг
здраво пријатељу
Բարեւ իմ ընկեր
ආයුබෝවන් මාගේ යාළුවා
ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
Silav hevalê min
اهلا صديقي
𐐷
)QUOTE";
// Expected content of the stored UTF-8 and UTF-16 resource files.
//   Each one is the shared unicode text followed by a final line naming the file's own encoding.
// Compared against BomUtf8.txt
String expected_utf8 = unicodeContent + U"\nThis is UTF-8";
// Compared against BomUtf16Le.txt
String expected_utf16le = unicodeContent + U"\nThis is UTF-16 Little Endian";
// Compared against BomUtf16Be.txt
String expected_utf16be = unicodeContent + U"\nThis is UTF-16 Big Endian";
  48. void printBinary(uint32_t value, int maxBits) {
  49. for (int i = 0; i < maxBits; i++) {
  50. if (value & (uint32_t)0b1 << (maxBits - 1)) {
  51. printText(U"1");
  52. } else {
  53. printText(U"0");
  54. }
  55. value = value << 1;
  56. }
  57. }
  58. void printBuffer(Buffer buffer) {
  59. int length = buffer_getSize(buffer);
  60. SafePointer<uint8_t> data = buffer_getSafeData<uint8_t>(buffer, "Generic buffer");
  61. printText(U"Buffer of length ", length, U":\n");
  62. for (int i = 0; i < length; i++) {
  63. printBinary(data[i], 8);
  64. printText(U" @", i, U"\n");
  65. }
  66. }
  67. START_TEST(TextEncoding)
  68. String folderPath = file_combinePaths(U".", U"resources");
  69. // Check that we have a valid folder path to the resources.
  70. ASSERT_EQUAL(file_getEntryType(folderPath), EntryType::Folder);
  71. { // Text encodings stored in memory
  72. // Run these tests for all line encodings
  73. for (int l = 0; l <= 1; l++) {
  74. LineEncoding lineEncoding = (l == 0) ? LineEncoding::CrLf : LineEncoding::Lf;
  75. // \r is not saved to files for cross-platform compatibility
  76. // \0 is not saved to files because files have a known size and don't need them
  77. { // Latin-1 up to U+FF excluding \r and \0
  78. String originalLatin1;
  79. string_reserve(originalLatin1, 0xFF);
  80. for (DsrChar c = 0x1; c <= 0xFF; c++) {
  81. if (c != U'\r') {
  82. string_appendChar(originalLatin1, c);
  83. }
  84. }
  85. Buffer encoded = string_saveToMemory(originalLatin1, CharacterEncoding::Raw_Latin1, lineEncoding);
  86. String decodedLatin1 = string_loadFromMemory(encoded);
  87. ASSERT_EQUAL(originalLatin1, decodedLatin1);
  88. }
  89. { // UTF-8 up to U+10FFFF excluding \r and \0
  90. String originalUTF8;
  91. string_reserve(originalUTF8, 0x10FFFF);
  92. for (DsrChar c = 0x1; c <= 0x10FFFF; c++) {
  93. if (c != U'\r') {
  94. string_appendChar(originalUTF8, c);
  95. }
  96. }
  97. Buffer encoded = string_saveToMemory(originalUTF8, CharacterEncoding::BOM_UTF8, lineEncoding);
  98. String decodedUTF8 = string_loadFromMemory(encoded);
  99. ASSERT_EQUAL(originalUTF8, decodedUTF8);
  100. }
  101. // Selected cases for UTF-16
  102. for (int e = 0; e <= 1; e++) {
  103. CharacterEncoding characterEncoding = (e == 0) ? CharacterEncoding::BOM_UTF16BE : CharacterEncoding::BOM_UTF16LE;
  104. String originalUTF16;
  105. // 20-bit test cases
  106. string_appendChar(originalUTF16, 0b00000000000000000001);
  107. string_appendChar(originalUTF16, 0b00000000000000000010);
  108. string_appendChar(originalUTF16, 0b00000000000000000011);
  109. string_appendChar(originalUTF16, 0b00000000000000000100);
  110. string_appendChar(originalUTF16, 0b00000000000000000111);
  111. string_appendChar(originalUTF16, 0b00000000000000001000);
  112. string_appendChar(originalUTF16, 0b00000000000000001111);
  113. string_appendChar(originalUTF16, 0b00000000000000010000);
  114. string_appendChar(originalUTF16, 0b00000000000000011111);
  115. string_appendChar(originalUTF16, 0b00000000000000100000);
  116. string_appendChar(originalUTF16, 0b00000000000000111111);
  117. string_appendChar(originalUTF16, 0b00000000000001000000);
  118. string_appendChar(originalUTF16, 0b00000000000001111111);
  119. string_appendChar(originalUTF16, 0b00000000000010000000);
  120. string_appendChar(originalUTF16, 0b00000000000011111111);
  121. string_appendChar(originalUTF16, 0b00000000000100000000);
  122. string_appendChar(originalUTF16, 0b00000000000111111111);
  123. string_appendChar(originalUTF16, 0b00000000001000000000);
  124. string_appendChar(originalUTF16, 0b00000000001111111111);
  125. string_appendChar(originalUTF16, 0b00000000010000000000);
  126. string_appendChar(originalUTF16, 0b00000000011111111111);
  127. string_appendChar(originalUTF16, 0b00000000100000000000);
  128. string_appendChar(originalUTF16, 0b00000000111111111111);
  129. string_appendChar(originalUTF16, 0b00000001000000000000);
  130. string_appendChar(originalUTF16, 0b00000001111111111111);
  131. string_appendChar(originalUTF16, 0b00000010000000000000);
  132. string_appendChar(originalUTF16, 0b00000011111111111111);
  133. string_appendChar(originalUTF16, 0b00000100000000000000);
  134. string_appendChar(originalUTF16, 0b00000111111111111111);
  135. string_appendChar(originalUTF16, 0b00001000000000000000);
  136. string_appendChar(originalUTF16, 0b00001111111111111111);
  137. string_appendChar(originalUTF16, 0b00010000000000000000);
  138. string_appendChar(originalUTF16, 0b00011111111111111111);
  139. string_appendChar(originalUTF16, 0b00100000000000000000);
  140. string_appendChar(originalUTF16, 0b00111111111111111111);
  141. string_appendChar(originalUTF16, 0b01000000000000000000);
  142. string_appendChar(originalUTF16, 0b01111111111111111111);
  143. string_appendChar(originalUTF16, 0b10000000000000000000);
  144. string_appendChar(originalUTF16, 0b11111111111111111111);
  145. // 21-bit test cases exploiting the high range offset
  146. string_appendChar(originalUTF16, 0x100000); // Using the 21:st bit
  147. string_appendChar(originalUTF16, 0x10FFFF); // Maximum range for UTF
  148. Buffer encoded = string_saveToMemory(originalUTF16, characterEncoding, lineEncoding);
  149. String decoded = string_loadFromMemory(encoded);
  150. ASSERT_EQUAL(originalUTF16, decoded);
  151. }
  152. // All UTF-16 characters excluding \r and \0
  153. for (int e = 0; e <= 1; e++) {
  154. CharacterEncoding characterEncoding = (e == 0) ? CharacterEncoding::BOM_UTF16BE : CharacterEncoding::BOM_UTF16LE;
  155. String original;
  156. string_reserve(original, 0x10FFFF);
  157. for (DsrChar c = 0x1; c <= 0xD7FF; c++) {
  158. if (c != U'\r') {
  159. string_appendChar(original, c);
  160. }
  161. }
  162. // 0xD800 to 0xDFFF is reserved for
  163. for (DsrChar c = 0xE000; c <= 0x10FFFF; c++) {
  164. string_appendChar(original, c);
  165. }
  166. Buffer encoded = string_saveToMemory(original, characterEncoding, lineEncoding);
  167. String decoded = string_loadFromMemory(encoded);
  168. ASSERT_EQUAL(original, decoded);
  169. }
  170. }
  171. }
  172. { // Loading strings of different encodings
  173. String fileLatin1 = string_load(file_combinePaths(folderPath, U"Latin1.txt"), true);
  174. ASSERT_EQUAL(fileLatin1, expected_latin1);
  175. String fileUTF8 = string_load(file_combinePaths(folderPath, U"BomUtf8.txt"), true);
  176. ASSERT_EQUAL(fileUTF8, expected_utf8);
  177. String fileUTF16LE = string_load(file_combinePaths(folderPath, U"BomUtf16Le.txt"), true);
  178. ASSERT_EQUAL(fileUTF16LE, expected_utf16le);
  179. String fileUTF16BE = string_load(file_combinePaths(folderPath, U"BomUtf16Be.txt"), true);
  180. ASSERT_EQUAL(fileUTF16BE, expected_utf16be);
  181. }
  182. { // Saving and loading text to files using every combination of character and line encoding
  183. String originalContent = U"Hello my friend\n你好我的朋友\n𐐷𤭢\n";
  184. String latin1Expected = U"Hello my friend\n??????\n??\n";
  185. String tempPath = folderPath + U"Temporary.txt";
  186. for (int l = 0; l < 2; l++) {
  187. LineEncoding lineEncoding = (l == 0) ? LineEncoding::CrLf : LineEncoding::Lf;
  188. // Latin-1 should store up to 8 bits correctly, and write ? for complex characters
  189. string_save(tempPath, originalContent, CharacterEncoding::Raw_Latin1, lineEncoding);
  190. String latin1Loaded = string_load(tempPath, true);
  191. ASSERT_EQUAL(latin1Loaded, latin1Expected);
  192. // UFT-8 should store up to 21 bits correctly
  193. string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF8, lineEncoding);
  194. ASSERT_EQUAL(string_load(tempPath, true), unicodeContent);
  195. // UFT-16 should store up to 20 bits correctly
  196. string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF16BE, lineEncoding);
  197. ASSERT_EQUAL(string_load(tempPath, true), unicodeContent);
  198. string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF16LE, lineEncoding);
  199. ASSERT_EQUAL(string_load(tempPath, true), unicodeContent);
  200. string_save(tempPath, U"This file is used when testing text encoding.");
  201. }
  202. }
  203. END_TEST