ConvertUTFTest.cpp 61 KB


  1. //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #include "llvm/Support/ConvertUTF.h"
  10. #include "llvm/Support/Format.h"
  11. #include "gtest/gtest.h"
  12. #include <string>
  13. #include <utility>
  14. #include <vector>
  15. using namespace llvm;
  16. TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
  17. // Src is the look of disapproval.
  18. static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  19. ArrayRef<char> Ref(Src, sizeof(Src) - 1);
  20. std::string Result;
  21. bool Success = convertUTF16ToUTF8String(Ref, Result);
  22. EXPECT_TRUE(Success);
  23. std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  24. EXPECT_EQ(Expected, Result);
  25. }
  26. TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
  27. // Src is the look of disapproval.
  28. static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
  29. ArrayRef<char> Ref(Src, sizeof(Src) - 1);
  30. std::string Result;
  31. bool Success = convertUTF16ToUTF8String(Ref, Result);
  32. EXPECT_TRUE(Success);
  33. std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  34. EXPECT_EQ(Expected, Result);
  35. }
  36. TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
  37. // Src is the look of disapproval.
  38. static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  39. StringRef Ref(Src, sizeof(Src) - 1);
  40. SmallVector<UTF16, 5> Result;
  41. bool Success = convertUTF8ToUTF16String(Ref, Result);
  42. EXPECT_TRUE(Success);
  43. static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
  44. ASSERT_EQ(3u, Result.size());
  45. for (int I = 0, E = 3; I != E; ++I)
  46. EXPECT_EQ(Expected[I], Result[I]);
  47. }
  48. TEST(ConvertUTFTest, OddLengthInput) {
  49. std::string Result;
  50. bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
  51. EXPECT_FALSE(Success);
  52. }
  53. TEST(ConvertUTFTest, Empty) {
  54. std::string Result;
  55. bool Success = convertUTF16ToUTF8String(None, Result);
  56. EXPECT_TRUE(Success);
  57. EXPECT_TRUE(Result.empty());
  58. }
  59. TEST(ConvertUTFTest, HasUTF16BOM) {
  60. bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
  61. EXPECT_TRUE(HasBOM);
  62. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
  63. EXPECT_TRUE(HasBOM);
  64. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
  65. EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
  66. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
  67. EXPECT_TRUE(HasBOM);
  68. HasBOM = hasUTF16ByteOrderMark(None);
  69. EXPECT_FALSE(HasBOM);
  70. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
  71. EXPECT_FALSE(HasBOM);
  72. }
  73. struct ConvertUTFResultContainer {
  74. ConversionResult ErrorCode;
  75. std::vector<unsigned> UnicodeScalars;
  76. ConvertUTFResultContainer(ConversionResult ErrorCode)
  77. : ErrorCode(ErrorCode) {}
  78. ConvertUTFResultContainer
  79. withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
  80. unsigned US2 = 0x110000, unsigned US3 = 0x110000,
  81. unsigned US4 = 0x110000, unsigned US5 = 0x110000,
  82. unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
  83. ConvertUTFResultContainer Result(*this);
  84. if (US0 != 0x110000)
  85. Result.UnicodeScalars.push_back(US0);
  86. if (US1 != 0x110000)
  87. Result.UnicodeScalars.push_back(US1);
  88. if (US2 != 0x110000)
  89. Result.UnicodeScalars.push_back(US2);
  90. if (US3 != 0x110000)
  91. Result.UnicodeScalars.push_back(US3);
  92. if (US4 != 0x110000)
  93. Result.UnicodeScalars.push_back(US4);
  94. if (US5 != 0x110000)
  95. Result.UnicodeScalars.push_back(US5);
  96. if (US6 != 0x110000)
  97. Result.UnicodeScalars.push_back(US6);
  98. if (US7 != 0x110000)
  99. Result.UnicodeScalars.push_back(US7);
  100. return Result;
  101. }
  102. };
  103. std::pair<ConversionResult, std::vector<unsigned>>
  104. ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
  105. const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
  106. const UTF8 *SourceNext = SourceStart;
  107. std::vector<UTF32> Decoded(S.size(), 0);
  108. UTF32 *TargetStart = Decoded.data();
  109. auto ErrorCode =
  110. ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
  111. Decoded.data() + Decoded.size(), lenientConversion);
  112. Decoded.resize(TargetStart - Decoded.data());
  113. return std::make_pair(ErrorCode, Decoded);
  114. }
  115. std::pair<ConversionResult, std::vector<unsigned>>
  116. ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
  117. const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
  118. const UTF8 *SourceNext = SourceStart;
  119. std::vector<UTF32> Decoded(S.size(), 0);
  120. UTF32 *TargetStart = Decoded.data();
  121. auto ErrorCode = ConvertUTF8toUTF32Partial(
  122. &SourceNext, SourceStart + S.size(), &TargetStart,
  123. Decoded.data() + Decoded.size(), lenientConversion);
  124. Decoded.resize(TargetStart - Decoded.data());
  125. return std::make_pair(ErrorCode, Decoded);
  126. }
  127. ::testing::AssertionResult
  128. CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
  129. StringRef S, bool Partial = false) {
  130. ConversionResult ErrorCode;
  131. std::vector<unsigned> Decoded;
  132. if (!Partial)
  133. std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
  134. else
  135. std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
  136. if (Expected.ErrorCode != ErrorCode)
  137. return ::testing::AssertionFailure() << "Expected error code "
  138. << Expected.ErrorCode << ", actual "
  139. << ErrorCode;
  140. if (Expected.UnicodeScalars != Decoded)
  141. return ::testing::AssertionFailure()
  142. << "Expected lenient decoded result:\n"
  143. << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
  144. << "Actual result:\n" << ::testing::PrintToString(Decoded);
  145. return ::testing::AssertionSuccess();
  146. }
  147. TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
  148. //
  149. // 1-byte sequences
  150. //
  151. // U+0041 LATIN CAPITAL LETTER A
  152. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  153. ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
  154. //
  155. // 2-byte sequences
  156. //
  157. // U+0283 LATIN SMALL LETTER ESH
  158. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  159. ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
  160. "\xca\x83"));
  161. // U+03BA GREEK SMALL LETTER KAPPA
  162. // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
  163. // U+03C3 GREEK SMALL LETTER SIGMA
  164. // U+03BC GREEK SMALL LETTER MU
  165. // U+03B5 GREEK SMALL LETTER EPSILON
  166. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  167. ConvertUTFResultContainer(conversionOK)
  168. .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
  169. "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
  170. //
  171. // 3-byte sequences
  172. //
  173. // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
  174. // U+6587 CJK UNIFIED IDEOGRAPH-6587
  175. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  176. ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
  177. "\xe4\xbe\x8b\xe6\x96\x87"));
  178. // U+D55C HANGUL SYLLABLE HAN
  179. // U+AE00 HANGUL SYLLABLE GEUL
  180. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  181. ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
  182. "\xed\x95\x9c\xea\xb8\x80"));
  183. // U+1112 HANGUL CHOSEONG HIEUH
  184. // U+1161 HANGUL JUNGSEONG A
  185. // U+11AB HANGUL JONGSEONG NIEUN
  186. // U+1100 HANGUL CHOSEONG KIYEOK
  187. // U+1173 HANGUL JUNGSEONG EU
  188. // U+11AF HANGUL JONGSEONG RIEUL
  189. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  190. ConvertUTFResultContainer(conversionOK)
  191. .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
  192. "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
  193. "\xe1\x86\xaf"));
  194. //
  195. // 4-byte sequences
  196. //
  197. // U+E0100 VARIATION SELECTOR-17
  198. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  199. ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
  200. "\xf3\xa0\x84\x80"));
  201. //
  202. // First possible sequence of a certain length
  203. //
  204. // U+0000 NULL
  205. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  206. ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
  207. StringRef("\x00", 1)));
  208. // U+0080 PADDING CHARACTER
  209. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  210. ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
  211. "\xc2\x80"));
  212. // U+0800 SAMARITAN LETTER ALAF
  213. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  214. ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
  215. "\xe0\xa0\x80"));
  216. // U+10000 LINEAR B SYLLABLE B008 A
  217. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  218. ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
  219. "\xf0\x90\x80\x80"));
  220. // U+200000 (invalid)
  221. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  222. ConvertUTFResultContainer(sourceIllegal)
  223. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  224. "\xf8\x88\x80\x80\x80"));
  225. // U+4000000 (invalid)
  226. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  227. ConvertUTFResultContainer(sourceIllegal)
  228. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  229. "\xfc\x84\x80\x80\x80\x80"));
  230. //
  231. // Last possible sequence of a certain length
  232. //
  233. // U+007F DELETE
  234. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  235. ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
  236. // U+07FF (unassigned)
  237. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  238. ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
  239. "\xdf\xbf"));
  240. // U+FFFF (noncharacter)
  241. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  242. ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
  243. "\xef\xbf\xbf"));
  244. // U+1FFFFF (invalid)
  245. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  246. ConvertUTFResultContainer(sourceIllegal)
  247. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  248. "\xf7\xbf\xbf\xbf"));
  249. // U+3FFFFFF (invalid)
  250. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  251. ConvertUTFResultContainer(sourceIllegal)
  252. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  253. "\xfb\xbf\xbf\xbf\xbf"));
  254. // U+7FFFFFFF (invalid)
  255. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  256. ConvertUTFResultContainer(sourceIllegal)
  257. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  258. "\xfd\xbf\xbf\xbf\xbf\xbf"));
  259. //
  260. // Other boundary conditions
  261. //
  262. // U+D7FF (unassigned)
  263. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  264. ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
  265. "\xed\x9f\xbf"));
  266. // U+E000 (private use)
  267. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  268. ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
  269. "\xee\x80\x80"));
  270. // U+FFFD REPLACEMENT CHARACTER
  271. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  272. ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
  273. "\xef\xbf\xbd"));
  274. // U+10FFFF (noncharacter)
  275. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  276. ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
  277. "\xf4\x8f\xbf\xbf"));
  278. // U+110000 (invalid)
  279. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  280. ConvertUTFResultContainer(sourceIllegal)
  281. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  282. "\xf4\x90\x80\x80"));
  283. //
  284. // Unexpected continuation bytes
  285. //
  286. // A sequence of unexpected continuation bytes that don't follow a first
  287. // byte, every byte is a maximal subpart.
  288. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  289. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
  290. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  291. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
  292. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  293. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  294. "\x80\x80"));
  295. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  296. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  297. "\x80\xbf"));
  298. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  299. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  300. "\xbf\x80"));
  301. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  302. ConvertUTFResultContainer(sourceIllegal)
  303. .withScalars(0xfffd, 0xfffd, 0xfffd),
  304. "\x80\xbf\x80"));
  305. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  306. ConvertUTFResultContainer(sourceIllegal)
  307. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  308. "\x80\xbf\x80\xbf"));
  309. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  310. ConvertUTFResultContainer(sourceIllegal)
  311. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  312. "\x80\xbf\x82\xbf\xaa"));
  313. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  314. ConvertUTFResultContainer(sourceIllegal)
  315. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  316. "\xaa\xb0\xbb\xbf\xaa\xa0"));
  317. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  318. ConvertUTFResultContainer(sourceIllegal)
  319. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  320. "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
  321. // All continuation bytes (0x80--0xbf).
  322. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  323. ConvertUTFResultContainer(sourceIllegal)
  324. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  325. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  326. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  327. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  328. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  329. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  330. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  331. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  332. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  333. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  334. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  335. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  336. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  337. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  338. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  339. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  340. "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
  341. "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
  342. "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
  343. "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
  344. //
  345. // Lonely start bytes
  346. //
  347. // Start bytes of 2-byte sequences (0xc0--0xdf).
  348. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  349. ConvertUTFResultContainer(sourceIllegal)
  350. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  351. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  352. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  353. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  354. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  355. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  356. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  357. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  358. "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
  359. "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
  360. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  361. ConvertUTFResultContainer(sourceIllegal)
  362. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  363. 0xfffd, 0x0020, 0xfffd, 0x0020)
  364. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  365. 0xfffd, 0x0020, 0xfffd, 0x0020)
  366. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  367. 0xfffd, 0x0020, 0xfffd, 0x0020)
  368. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  369. 0xfffd, 0x0020, 0xfffd, 0x0020)
  370. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  371. 0xfffd, 0x0020, 0xfffd, 0x0020)
  372. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  373. 0xfffd, 0x0020, 0xfffd, 0x0020)
  374. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  375. 0xfffd, 0x0020, 0xfffd, 0x0020)
  376. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  377. 0xfffd, 0x0020, 0xfffd, 0x0020),
  378. "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
  379. "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
  380. "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
  381. "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
  382. // Start bytes of 3-byte sequences (0xe0--0xef).
  383. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  384. ConvertUTFResultContainer(sourceIllegal)
  385. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  386. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  387. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  388. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  389. "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
  390. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  391. ConvertUTFResultContainer(sourceIllegal)
  392. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  393. 0xfffd, 0x0020, 0xfffd, 0x0020)
  394. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  395. 0xfffd, 0x0020, 0xfffd, 0x0020)
  396. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  397. 0xfffd, 0x0020, 0xfffd, 0x0020)
  398. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  399. 0xfffd, 0x0020, 0xfffd, 0x0020),
  400. "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
  401. "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
  402. // Start bytes of 4-byte sequences (0xf0--0xf7).
  403. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  404. ConvertUTFResultContainer(sourceIllegal)
  405. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  406. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  407. "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
  408. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  409. ConvertUTFResultContainer(sourceIllegal)
  410. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  411. 0xfffd, 0x0020, 0xfffd, 0x0020)
  412. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  413. 0xfffd, 0x0020, 0xfffd, 0x0020),
  414. "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
  415. // Start bytes of 5-byte sequences (0xf8--0xfb).
  416. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  417. ConvertUTFResultContainer(sourceIllegal)
  418. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  419. "\xf8\xf9\xfa\xfb"));
  420. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  421. ConvertUTFResultContainer(sourceIllegal)
  422. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  423. 0xfffd, 0x0020, 0xfffd, 0x0020),
  424. "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
  425. // Start bytes of 6-byte sequences (0xfc--0xfd).
  426. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  427. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  428. "\xfc\xfd"));
  429. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  430. ConvertUTFResultContainer(sourceIllegal)
  431. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
  432. "\xfc\x20\xfd\x20"));
  433. //
  434. // Other bytes (0xc0--0xc1, 0xfe--0xff).
  435. //
  436. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  437. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
  438. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  439. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
  440. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  441. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
  442. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  443. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
  444. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  445. ConvertUTFResultContainer(sourceIllegal)
  446. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  447. "\xc0\xc1\xfe\xff"));
  448. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  449. ConvertUTFResultContainer(sourceIllegal)
  450. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  451. "\xfe\xfe\xff\xff"));
  452. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  453. ConvertUTFResultContainer(sourceIllegal)
  454. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  455. "\xfe\x80\x80\x80\x80\x80"));
  456. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  457. ConvertUTFResultContainer(sourceIllegal)
  458. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  459. "\xff\x80\x80\x80\x80\x80"));
  460. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  461. ConvertUTFResultContainer(sourceIllegal)
  462. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  463. 0xfffd, 0x0020, 0xfffd, 0x0020),
  464. "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
  465. //
  466. // Sequences with one continuation byte missing
  467. //
  468. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  469. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
  470. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  471. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
  472. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  473. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  474. "\xe0\xa0"));
  475. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  476. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  477. "\xe0\xbf"));
  478. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  479. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  480. "\xe1\x80"));
  481. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  482. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  483. "\xec\xbf"));
  484. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  485. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  486. "\xed\x80"));
  487. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  488. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  489. "\xed\x9f"));
  490. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  491. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  492. "\xee\x80"));
  493. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  494. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  495. "\xef\xbf"));
  496. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  497. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  498. "\xf0\x90\x80"));
  499. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  500. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  501. "\xf0\xbf\xbf"));
  502. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  503. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  504. "\xf1\x80\x80"));
  505. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  506. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  507. "\xf3\xbf\xbf"));
  508. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  509. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  510. "\xf4\x80\x80"));
  511. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  512. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  513. "\xf4\x8f\xbf"));
  514. // Overlong sequences with one trailing byte missing.
  515. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  516. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  517. "\xc0"));
  518. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  519. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  520. "\xc1"));
  521. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  522. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  523. "\xe0\x80"));
  524. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  525. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  526. "\xe0\x9f"));
  527. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  528. ConvertUTFResultContainer(sourceIllegal)
  529. .withScalars(0xfffd, 0xfffd, 0xfffd),
  530. "\xf0\x80\x80"));
  531. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  532. ConvertUTFResultContainer(sourceIllegal)
  533. .withScalars(0xfffd, 0xfffd, 0xfffd),
  534. "\xf0\x8f\x80"));
  535. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  536. ConvertUTFResultContainer(sourceIllegal)
  537. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  538. "\xf8\x80\x80\x80"));
  539. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  540. ConvertUTFResultContainer(sourceIllegal)
  541. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  542. "\xfc\x80\x80\x80\x80"));
  543. // Sequences that represent surrogates with one trailing byte missing.
  544. // High surrogates
  545. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  546. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  547. "\xed\xa0"));
  548. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  549. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  550. "\xed\xac"));
  551. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  552. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  553. "\xed\xaf"));
  554. // Low surrogates
  555. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  556. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  557. "\xed\xb0"));
  558. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  559. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  560. "\xed\xb4"));
  561. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  562. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  563. "\xed\xbf"));
  564. // Ill-formed 4-byte sequences.
  565. // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
  566. // U+1100xx (invalid)
  567. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  568. ConvertUTFResultContainer(sourceIllegal)
  569. .withScalars(0xfffd, 0xfffd, 0xfffd),
  570. "\xf4\x90\x80"));
  571. // U+13FBxx (invalid)
  572. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  573. ConvertUTFResultContainer(sourceIllegal)
  574. .withScalars(0xfffd, 0xfffd, 0xfffd),
  575. "\xf4\xbf\xbf"));
  576. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  577. ConvertUTFResultContainer(sourceIllegal)
  578. .withScalars(0xfffd, 0xfffd, 0xfffd),
  579. "\xf5\x80\x80"));
  580. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  581. ConvertUTFResultContainer(sourceIllegal)
  582. .withScalars(0xfffd, 0xfffd, 0xfffd),
  583. "\xf6\x80\x80"));
  584. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  585. ConvertUTFResultContainer(sourceIllegal)
  586. .withScalars(0xfffd, 0xfffd, 0xfffd),
  587. "\xf7\x80\x80"));
  588. // U+1FFBxx (invalid)
  589. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  590. ConvertUTFResultContainer(sourceIllegal)
  591. .withScalars(0xfffd, 0xfffd, 0xfffd),
  592. "\xf7\xbf\xbf"));
  593. // Ill-formed 5-byte sequences.
  594. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  595. // U+2000xx (invalid)
  596. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  597. ConvertUTFResultContainer(sourceIllegal)
  598. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  599. "\xf8\x88\x80\x80"));
  600. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  601. ConvertUTFResultContainer(sourceIllegal)
  602. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  603. "\xf8\xbf\xbf\xbf"));
  604. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  605. ConvertUTFResultContainer(sourceIllegal)
  606. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  607. "\xf9\x80\x80\x80"));
  608. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  609. ConvertUTFResultContainer(sourceIllegal)
  610. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  611. "\xfa\x80\x80\x80"));
  612. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  613. ConvertUTFResultContainer(sourceIllegal)
  614. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  615. "\xfb\x80\x80\x80"));
  616. // U+3FFFFxx (invalid)
  617. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  618. ConvertUTFResultContainer(sourceIllegal)
  619. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  620. "\xfb\xbf\xbf\xbf"));
  621. // Ill-formed 6-byte sequences.
  622. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  623. // U+40000xx (invalid)
  624. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  625. ConvertUTFResultContainer(sourceIllegal)
  626. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  627. "\xfc\x84\x80\x80\x80"));
  628. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  629. ConvertUTFResultContainer(sourceIllegal)
  630. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  631. "\xfc\xbf\xbf\xbf\xbf"));
  632. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  633. ConvertUTFResultContainer(sourceIllegal)
  634. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  635. "\xfd\x80\x80\x80\x80"));
  636. // U+7FFFFFxx (invalid)
  637. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  638. ConvertUTFResultContainer(sourceIllegal)
  639. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  640. "\xfd\xbf\xbf\xbf\xbf"));
  641. //
  642. // Sequences with two continuation bytes missing
  643. //
  644. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  645. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  646. "\xf0\x90"));
  647. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  648. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  649. "\xf0\xbf"));
  650. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  651. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  652. "\xf1\x80"));
  653. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  654. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  655. "\xf3\xbf"));
  656. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  657. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  658. "\xf4\x80"));
  659. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  660. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  661. "\xf4\x8f"));
  662. // Overlong sequences with two trailing bytes missing.
  663. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  664. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
  665. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  666. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  667. "\xf0\x80"));
  668. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  669. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  670. "\xf0\x8f"));
  671. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  672. ConvertUTFResultContainer(sourceIllegal)
  673. .withScalars(0xfffd, 0xfffd, 0xfffd),
  674. "\xf8\x80\x80"));
  675. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  676. ConvertUTFResultContainer(sourceIllegal)
  677. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  678. "\xfc\x80\x80\x80"));
  679. // Sequences that represent surrogates with two trailing bytes missing.
  680. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  681. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
  682. // Ill-formed 4-byte sequences.
  683. // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
  684. // U+110yxx (invalid)
  685. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  686. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  687. "\xf4\x90"));
  688. // U+13Fyxx (invalid)
  689. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  690. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  691. "\xf4\xbf"));
  692. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  693. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  694. "\xf5\x80"));
  695. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  696. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  697. "\xf6\x80"));
  698. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  699. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  700. "\xf7\x80"));
  701. // U+1FFyxx (invalid)
  702. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  703. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  704. "\xf7\xbf"));
  705. // Ill-formed 5-byte sequences.
  706. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  707. // U+200yxx (invalid)
  708. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  709. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  710. "\xf8\x88\x80"));
  711. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  712. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  713. "\xf8\xbf\xbf"));
  714. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  715. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  716. "\xf9\x80\x80"));
  717. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  718. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  719. "\xfa\x80\x80"));
  720. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  721. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  722. "\xfb\x80\x80"));
  723. // U+3FFFyxx (invalid)
  724. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  725. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  726. "\xfb\xbf\xbf"));
  727. // Ill-formed 6-byte sequences.
  728. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  729. // U+4000yxx (invalid)
  730. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  731. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  732. "\xfc\x84\x80\x80"));
  733. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  734. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  735. "\xfc\xbf\xbf\xbf"));
  736. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  737. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  738. "\xfd\x80\x80\x80"));
  739. // U+7FFFFyxx (invalid)
  740. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  741. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  742. "\xfd\xbf\xbf\xbf"));
  743. //
  744. // Sequences with three continuation bytes missing
  745. //
  746. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  747. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
  748. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  749. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
  750. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  751. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
  752. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  753. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
  754. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  755. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
  756. // Broken overlong sequences.
  757. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  758. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
  759. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  760. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  761. "\xf8\x80"));
  762. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  763. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  764. "\xfc\x80\x80"));
  765. // Ill-formed 4-byte sequences.
  766. // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
  767. // U+14yyxx (invalid)
  768. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  769. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
  770. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  771. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
  772. // U+1Cyyxx (invalid)
  773. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  774. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
  775. // Ill-formed 5-byte sequences.
  776. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  777. // U+20yyxx (invalid)
  778. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  779. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  780. "\xf8\x88"));
  781. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  782. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  783. "\xf8\xbf"));
  784. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  785. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  786. "\xf9\x80"));
  787. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  788. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  789. "\xfa\x80"));
  790. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  791. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  792. "\xfb\x80"));
  793. // U+3FCyyxx (invalid)
  794. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  795. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  796. "\xfb\xbf"));
  797. // Ill-formed 6-byte sequences.
  798. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  799. // U+400yyxx (invalid)
  800. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  801. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  802. "\xfc\x84\x80"));
  803. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  804. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  805. "\xfc\xbf\xbf"));
  806. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  807. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  808. "\xfd\x80\x80"));
  809. // U+7FFCyyxx (invalid)
  810. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  811. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  812. "\xfd\xbf\xbf"));
  813. //
  814. // Sequences with four continuation bytes missing
  815. //
  816. // Ill-formed 5-byte sequences.
  817. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  818. // U+uzyyxx (invalid)
  819. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  820. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
  821. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  822. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
  823. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  824. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
  825. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  826. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
  827. // U+3zyyxx (invalid)
  828. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  829. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
  830. // Broken overlong sequences.
  831. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  832. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
  833. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  834. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  835. "\xfc\x80"));
  836. // Ill-formed 6-byte sequences.
  837. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  838. // U+uzzyyxx (invalid)
  839. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  840. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  841. "\xfc\x84"));
  842. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  843. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  844. "\xfc\xbf"));
  845. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  846. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  847. "\xfd\x80"));
  848. // U+7Fzzyyxx (invalid)
  849. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  850. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  851. "\xfd\xbf"));
  852. //
  853. // Sequences with five continuation bytes missing
  854. //
  855. // Ill-formed 6-byte sequences.
  856. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  857. // U+uzzyyxx (invalid)
  858. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  859. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
  860. // U+uuzzyyxx (invalid)
  861. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  862. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
  863. //
  864. // Consecutive sequences with trailing bytes missing
  865. //
  866. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  867. ConvertUTFResultContainer(sourceIllegal)
  868. .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
  869. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
  870. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  871. .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
  872. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
  873. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  874. "\xc0" "\xe0\x80" "\xf0\x80\x80"
  875. "\xf8\x80\x80\x80"
  876. "\xfc\x80\x80\x80\x80"
  877. "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
  878. "\xfb\xbf\xbf\xbf"
  879. "\xfd\xbf\xbf\xbf\xbf"));
  880. //
  881. // Overlong UTF-8 sequences
  882. //
  883. // U+002F SOLIDUS
  884. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  885. ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
  886. // Overlong sequences of the above.
  887. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  888. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  889. "\xc0\xaf"));
  890. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  891. ConvertUTFResultContainer(sourceIllegal)
  892. .withScalars(0xfffd, 0xfffd, 0xfffd),
  893. "\xe0\x80\xaf"));
  894. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  895. ConvertUTFResultContainer(sourceIllegal)
  896. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  897. "\xf0\x80\x80\xaf"));
  898. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  899. ConvertUTFResultContainer(sourceIllegal)
  900. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  901. "\xf8\x80\x80\x80\xaf"));
  902. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  903. ConvertUTFResultContainer(sourceIllegal)
  904. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  905. "\xfc\x80\x80\x80\x80\xaf"));
  906. // U+0000 NULL
  907. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  908. ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
  909. StringRef("\x00", 1)));
  910. // Overlong sequences of the above.
  911. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  912. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  913. "\xc0\x80"));
  914. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  915. ConvertUTFResultContainer(sourceIllegal)
  916. .withScalars(0xfffd, 0xfffd, 0xfffd),
  917. "\xe0\x80\x80"));
  918. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  919. ConvertUTFResultContainer(sourceIllegal)
  920. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  921. "\xf0\x80\x80\x80"));
  922. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  923. ConvertUTFResultContainer(sourceIllegal)
  924. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  925. "\xf8\x80\x80\x80\x80"));
  926. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  927. ConvertUTFResultContainer(sourceIllegal)
  928. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  929. "\xfc\x80\x80\x80\x80\x80"));
  930. // Other overlong sequences.
  931. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  932. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  933. "\xc0\xbf"));
  934. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  935. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  936. "\xc1\x80"));
  937. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  938. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  939. "\xc1\xbf"));
  940. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  941. ConvertUTFResultContainer(sourceIllegal)
  942. .withScalars(0xfffd, 0xfffd, 0xfffd),
  943. "\xe0\x9f\xbf"));
  944. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  945. ConvertUTFResultContainer(sourceIllegal)
  946. .withScalars(0xfffd, 0xfffd, 0xfffd),
  947. "\xed\xa0\x80"));
  948. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  949. ConvertUTFResultContainer(sourceIllegal)
  950. .withScalars(0xfffd, 0xfffd, 0xfffd),
  951. "\xed\xbf\xbf"));
  952. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  953. ConvertUTFResultContainer(sourceIllegal)
  954. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  955. "\xf0\x8f\x80\x80"));
  956. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  957. ConvertUTFResultContainer(sourceIllegal)
  958. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  959. "\xf0\x8f\xbf\xbf"));
  960. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  961. ConvertUTFResultContainer(sourceIllegal)
  962. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  963. "\xf8\x87\xbf\xbf\xbf"));
  964. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  965. ConvertUTFResultContainer(sourceIllegal)
  966. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  967. "\xfc\x83\xbf\xbf\xbf\xbf"));
  968. //
  969. // Isolated surrogates
  970. //
  971. // Unicode 6.3.0:
  972. //
  973. // D71. High-surrogate code point: A Unicode code point in the range
  974. // U+D800 to U+DBFF.
  975. //
  976. // D73. Low-surrogate code point: A Unicode code point in the range
  977. // U+DC00 to U+DFFF.
  978. // Note: U+E0100 is <DB40 DD00> in UTF16.
  979. // High surrogates
  980. // U+D800
  981. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  982. ConvertUTFResultContainer(sourceIllegal)
  983. .withScalars(0xfffd, 0xfffd, 0xfffd),
  984. "\xed\xa0\x80"));
  985. // U+DB40
  986. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  987. ConvertUTFResultContainer(sourceIllegal)
  988. .withScalars(0xfffd, 0xfffd, 0xfffd),
  989. "\xed\xac\xa0"));
  990. // U+DBFF
  991. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  992. ConvertUTFResultContainer(sourceIllegal)
  993. .withScalars(0xfffd, 0xfffd, 0xfffd),
  994. "\xed\xaf\xbf"));
  995. // Low surrogates
  996. // U+DC00
  997. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  998. ConvertUTFResultContainer(sourceIllegal)
  999. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1000. "\xed\xb0\x80"));
  1001. // U+DD00
  1002. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1003. ConvertUTFResultContainer(sourceIllegal)
  1004. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1005. "\xed\xb4\x80"));
  1006. // U+DFFF
  1007. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1008. ConvertUTFResultContainer(sourceIllegal)
  1009. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1010. "\xed\xbf\xbf"));
  1011. // Surrogate pairs
  1012. // U+D800 U+DC00
  1013. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1014. ConvertUTFResultContainer(sourceIllegal)
  1015. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1016. "\xed\xa0\x80\xed\xb0\x80"));
  1017. // U+D800 U+DD00
  1018. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1019. ConvertUTFResultContainer(sourceIllegal)
  1020. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1021. "\xed\xa0\x80\xed\xb4\x80"));
  1022. // U+D800 U+DFFF
  1023. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1024. ConvertUTFResultContainer(sourceIllegal)
  1025. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1026. "\xed\xa0\x80\xed\xbf\xbf"));
  1027. // U+DB40 U+DC00
  1028. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1029. ConvertUTFResultContainer(sourceIllegal)
  1030. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1031. "\xed\xac\xa0\xed\xb0\x80"));
  1032. // U+DB40 U+DD00
  1033. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1034. ConvertUTFResultContainer(sourceIllegal)
  1035. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1036. "\xed\xac\xa0\xed\xb4\x80"));
  1037. // U+DB40 U+DFFF
  1038. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1039. ConvertUTFResultContainer(sourceIllegal)
  1040. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1041. "\xed\xac\xa0\xed\xbf\xbf"));
  1042. // U+DBFF U+DC00
  1043. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1044. ConvertUTFResultContainer(sourceIllegal)
  1045. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1046. "\xed\xaf\xbf\xed\xb0\x80"));
  1047. // U+DBFF U+DD00
  1048. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1049. ConvertUTFResultContainer(sourceIllegal)
  1050. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1051. "\xed\xaf\xbf\xed\xb4\x80"));
  1052. // U+DBFF U+DFFF
  1053. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1054. ConvertUTFResultContainer(sourceIllegal)
  1055. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1056. "\xed\xaf\xbf\xed\xbf\xbf"));
  1057. //
  1058. // Noncharacters
  1059. //
  1060. // Unicode 6.3.0:
  1061. //
  1062. // D14. Noncharacter: A code point that is permanently reserved for
  1063. // internal use and that should never be interchanged. Noncharacters
  1064. // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
  1065. // and the values U+FDD0..U+FDEF.
  1066. // U+FFFE
  1067. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1068. ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
  1069. "\xef\xbf\xbe"));
  1070. // U+FFFF
  1071. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1072. ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
  1073. "\xef\xbf\xbf"));
  1074. // U+1FFFE
  1075. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1076. ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
  1077. "\xf0\x9f\xbf\xbe"));
  1078. // U+1FFFF
  1079. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1080. ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
  1081. "\xf0\x9f\xbf\xbf"));
  1082. // U+2FFFE
  1083. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1084. ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
  1085. "\xf0\xaf\xbf\xbe"));
  1086. // U+2FFFF
  1087. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1088. ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
  1089. "\xf0\xaf\xbf\xbf"));
  1090. // U+3FFFE
  1091. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1092. ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
  1093. "\xf0\xbf\xbf\xbe"));
  1094. // U+3FFFF
  1095. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1096. ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
  1097. "\xf0\xbf\xbf\xbf"));
  1098. // U+4FFFE
  1099. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1100. ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
  1101. "\xf1\x8f\xbf\xbe"));
  1102. // U+4FFFF
  1103. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1104. ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
  1105. "\xf1\x8f\xbf\xbf"));
  1106. // U+5FFFE
  1107. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1108. ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
  1109. "\xf1\x9f\xbf\xbe"));
  1110. // U+5FFFF
  1111. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1112. ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
  1113. "\xf1\x9f\xbf\xbf"));
  1114. // U+6FFFE
  1115. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1116. ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
  1117. "\xf1\xaf\xbf\xbe"));
  1118. // U+6FFFF
  1119. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1120. ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
  1121. "\xf1\xaf\xbf\xbf"));
  1122. // U+7FFFE
  1123. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1124. ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
  1125. "\xf1\xbf\xbf\xbe"));
  1126. // U+7FFFF
  1127. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1128. ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
  1129. "\xf1\xbf\xbf\xbf"));
  1130. // U+8FFFE
  1131. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1132. ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
  1133. "\xf2\x8f\xbf\xbe"));
  1134. // U+8FFFF
  1135. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1136. ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
  1137. "\xf2\x8f\xbf\xbf"));
  1138. // U+9FFFE
  1139. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1140. ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
  1141. "\xf2\x9f\xbf\xbe"));
  1142. // U+9FFFF
  1143. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1144. ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
  1145. "\xf2\x9f\xbf\xbf"));
  1146. // U+AFFFE
  1147. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1148. ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
  1149. "\xf2\xaf\xbf\xbe"));
  1150. // U+AFFFF
  1151. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1152. ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
  1153. "\xf2\xaf\xbf\xbf"));
  1154. // U+BFFFE
  1155. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1156. ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
  1157. "\xf2\xbf\xbf\xbe"));
  1158. // U+BFFFF
  1159. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1160. ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
  1161. "\xf2\xbf\xbf\xbf"));
  1162. // U+CFFFE
  1163. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1164. ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
  1165. "\xf3\x8f\xbf\xbe"));
  1166. // U+CFFFF
  1167. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1168. ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
  1169. "\xf3\x8f\xbf\xbf"));
  1170. // U+DFFFE
  1171. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1172. ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
  1173. "\xf3\x9f\xbf\xbe"));
  1174. // U+DFFFF
  1175. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1176. ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
  1177. "\xf3\x9f\xbf\xbf"));
  1178. // U+EFFFE
  1179. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1180. ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
  1181. "\xf3\xaf\xbf\xbe"));
  1182. // U+EFFFF
  1183. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1184. ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
  1185. "\xf3\xaf\xbf\xbf"));
  1186. // U+FFFFE
  1187. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1188. ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
  1189. "\xf3\xbf\xbf\xbe"));
  1190. // U+FFFFF
  1191. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1192. ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
  1193. "\xf3\xbf\xbf\xbf"));
  1194. // U+10FFFE
  1195. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1196. ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
  1197. "\xf4\x8f\xbf\xbe"));
  1198. // U+10FFFF
  1199. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1200. ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
  1201. "\xf4\x8f\xbf\xbf"));
  1202. // U+FDD0
  1203. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1204. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
  1205. "\xef\xb7\x90"));
  1206. // U+FDD1
  1207. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1208. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
  1209. "\xef\xb7\x91"));
  1210. // U+FDD2
  1211. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1212. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
  1213. "\xef\xb7\x92"));
  1214. // U+FDD3
  1215. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1216. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
  1217. "\xef\xb7\x93"));
  1218. // U+FDD4
  1219. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1220. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
  1221. "\xef\xb7\x94"));
  1222. // U+FDD5
  1223. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1224. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
  1225. "\xef\xb7\x95"));
  1226. // U+FDD6
  1227. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1228. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
  1229. "\xef\xb7\x96"));
  1230. // U+FDD7
  1231. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1232. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
  1233. "\xef\xb7\x97"));
  1234. // U+FDD8
  1235. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1236. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
  1237. "\xef\xb7\x98"));
  1238. // U+FDD9
  1239. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1240. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
  1241. "\xef\xb7\x99"));
  1242. // U+FDDA
  1243. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1244. ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
  1245. "\xef\xb7\x9a"));
  1246. // U+FDDB
  1247. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1248. ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
  1249. "\xef\xb7\x9b"));
  1250. // U+FDDC
  1251. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1252. ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
  1253. "\xef\xb7\x9c"));
  1254. // U+FDDD
  1255. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1256. ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
  1257. "\xef\xb7\x9d"));
  1258. // U+FDDE
  1259. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1260. ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
  1261. "\xef\xb7\x9e"));
  1262. // U+FDDF
  1263. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1264. ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
  1265. "\xef\xb7\x9f"));
  1266. // U+FDE0
  1267. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1268. ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
  1269. "\xef\xb7\xa0"));
  1270. // U+FDE1
  1271. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1272. ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
  1273. "\xef\xb7\xa1"));
  1274. // U+FDE2
  1275. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1276. ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
  1277. "\xef\xb7\xa2"));
  1278. // U+FDE3
  1279. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1280. ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
  1281. "\xef\xb7\xa3"));
  1282. // U+FDE4
  1283. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1284. ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
  1285. "\xef\xb7\xa4"));
  1286. // U+FDE5
  1287. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1288. ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
  1289. "\xef\xb7\xa5"));
  1290. // U+FDE6
  1291. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1292. ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
  1293. "\xef\xb7\xa6"));
  1294. // U+FDE7
  1295. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1296. ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
  1297. "\xef\xb7\xa7"));
  1298. // U+FDE8
  1299. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1300. ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
  1301. "\xef\xb7\xa8"));
  1302. // U+FDE9
  1303. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1304. ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
  1305. "\xef\xb7\xa9"));
  1306. // U+FDEA
  1307. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1308. ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
  1309. "\xef\xb7\xaa"));
  1310. // U+FDEB
  1311. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1312. ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
  1313. "\xef\xb7\xab"));
  1314. // U+FDEC
  1315. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1316. ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
  1317. "\xef\xb7\xac"));
  1318. // U+FDED
  1319. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1320. ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
  1321. "\xef\xb7\xad"));
  1322. // U+FDEE
  1323. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1324. ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
  1325. "\xef\xb7\xae"));
  1326. // U+FDEF
  1327. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1328. ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
  1329. "\xef\xb7\xaf"));
  1330. // U+FDF0
  1331. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1332. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
  1333. "\xef\xb7\xb0"));
  1334. // U+FDF1
  1335. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1336. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
  1337. "\xef\xb7\xb1"));
  1338. // U+FDF2
  1339. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1340. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
  1341. "\xef\xb7\xb2"));
  1342. // U+FDF3
  1343. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1344. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
  1345. "\xef\xb7\xb3"));
  1346. // U+FDF4
  1347. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1348. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
  1349. "\xef\xb7\xb4"));
  1350. // U+FDF5
  1351. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1352. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
  1353. "\xef\xb7\xb5"));
  1354. // U+FDF6
  1355. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1356. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
  1357. "\xef\xb7\xb6"));
  1358. // U+FDF7
  1359. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1360. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
  1361. "\xef\xb7\xb7"));
  1362. // U+FDF8
  1363. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1364. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
  1365. "\xef\xb7\xb8"));
  1366. // U+FDF9
  1367. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1368. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
  1369. "\xef\xb7\xb9"));
  1370. // U+FDFA
  1371. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1372. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
  1373. "\xef\xb7\xba"));
  1374. // U+FDFB
  1375. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1376. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
  1377. "\xef\xb7\xbb"));
  1378. // U+FDFC
  1379. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1380. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
  1381. "\xef\xb7\xbc"));
  1382. // U+FDFD
  1383. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1384. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
  1385. "\xef\xb7\xbd"));
  1386. // U+FDFE
  1387. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1388. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
  1389. "\xef\xb7\xbe"));
  1390. // U+FDFF
  1391. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1392. ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
  1393. "\xef\xb7\xbf"));
  1394. }
  1395. TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
  1396. // U+0041 LATIN CAPITAL LETTER A
  1397. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1398. ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
  1399. "\x41", true));
  1400. //
  1401. // Sequences with one continuation byte missing
  1402. //
  1403. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1404. ConvertUTFResultContainer(sourceExhausted),
  1405. "\xc2", true));
  1406. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1407. ConvertUTFResultContainer(sourceExhausted),
  1408. "\xdf", true));
  1409. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1410. ConvertUTFResultContainer(sourceExhausted),
  1411. "\xe0\xa0", true));
  1412. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1413. ConvertUTFResultContainer(sourceExhausted),
  1414. "\xe0\xbf", true));
  1415. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1416. ConvertUTFResultContainer(sourceExhausted),
  1417. "\xe1\x80", true));
  1418. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1419. ConvertUTFResultContainer(sourceExhausted),
  1420. "\xec\xbf", true));
  1421. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1422. ConvertUTFResultContainer(sourceExhausted),
  1423. "\xed\x80", true));
  1424. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1425. ConvertUTFResultContainer(sourceExhausted),
  1426. "\xed\x9f", true));
  1427. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1428. ConvertUTFResultContainer(sourceExhausted),
  1429. "\xee\x80", true));
  1430. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1431. ConvertUTFResultContainer(sourceExhausted),
  1432. "\xef\xbf", true));
  1433. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1434. ConvertUTFResultContainer(sourceExhausted),
  1435. "\xf0\x90\x80", true));
  1436. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1437. ConvertUTFResultContainer(sourceExhausted),
  1438. "\xf0\xbf\xbf", true));
  1439. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1440. ConvertUTFResultContainer(sourceExhausted),
  1441. "\xf1\x80\x80", true));
  1442. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1443. ConvertUTFResultContainer(sourceExhausted),
  1444. "\xf3\xbf\xbf", true));
  1445. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1446. ConvertUTFResultContainer(sourceExhausted),
  1447. "\xf4\x80\x80", true));
  1448. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1449. ConvertUTFResultContainer(sourceExhausted),
  1450. "\xf4\x8f\xbf", true));
  1451. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1452. ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
  1453. "\x41\xc2", true));
  1454. }