2
0

ConvertUTF.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709
  1. /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
  2. *
  3. * The LLVM Compiler Infrastructure
  4. *
  5. * This file is distributed under the University of Illinois Open Source
  6. * License. See LICENSE.TXT for details.
  7. *
  8. *===------------------------------------------------------------------------=*/
  9. /*
  10. * Copyright 2001-2004 Unicode, Inc.
  11. *
  12. * Disclaimer
  13. *
  14. * This source code is provided as is by Unicode, Inc. No claims are
  15. * made as to fitness for any particular purpose. No warranties of any
  16. * kind are expressed or implied. The recipient agrees to determine
  17. * applicability of information provided. If this file has been
  18. * purchased on magnetic or optical media from Unicode, Inc., the
  19. * sole remedy for any claim will be exchange of defective media
  20. * within 90 days of receipt.
  21. *
  22. * Limitations on Rights to Redistribute This Code
  23. *
  24. * Unicode, Inc. hereby grants the right to freely use the information
  25. * supplied in this file in the creation of products supporting the
  26. * Unicode Standard, and to make copies of this file in any form
  27. * for internal or external distribution as long as this notice
  28. * remains attached.
  29. */
  30. /* ---------------------------------------------------------------------
  31. Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  32. Author: Mark E. Davis, 1994.
  33. Rev History: Rick McGowan, fixes & updates May 2001.
  34. Sept 2001: fixed const & error conditions per
  35. mods suggested by S. Parent & A. Lillich.
  36. June 2002: Tim Dodd added detection and handling of incomplete
  37. source sequences, enhanced error detection, added casts
  38. to eliminate compiler warnings.
  39. July 2003: slight mods to back out aggressive FFFE detection.
  40. Jan 2004: updated switches in from-UTF8 conversions.
  41. Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  42. See the header file "ConvertUTF.h" for complete documentation.
  43. ------------------------------------------------------------------------ */
  44. #include "llvm/Support/ConvertUTF.h"
  45. #ifdef CVTUTF_DEBUG
  46. #include <stdio.h>
  47. #endif
  48. #include <assert.h>
  49. static const int halfShift = 10; /* used for shifting by 10 bits */
  50. static const UTF32 halfBase = 0x0010000UL;
  51. static const UTF32 halfMask = 0x3FFUL;
  52. #define UNI_SUR_HIGH_START (UTF32)0xD800
  53. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  54. #define UNI_SUR_LOW_START (UTF32)0xDC00
  55. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  56. #define false 0
  57. #define true 1
  58. /* --------------------------------------------------------------------- */
  /*
   * Number of trailing bytes expected after a given UTF-8 lead byte; index
   * the table with the first byte of a sequence.
   *
   * Lead bytes 0xF8..0xFF map to 4 or 5 trailing bytes. Such sequences are
   * not *legal* UTF-8; the entries are kept for anyone who wants to handle
   * the longer forms allowed by earlier algorithms.
   */
  static const char trailingBytesForUTF8[256] = {
      /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  };
  76. /*
  77. * Magic values subtracted from a buffer value during UTF8 conversion.
  78. * This table contains as many values as there might be trailing bytes
  79. * in a UTF-8 sequence.
  80. */
  81. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  82. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  83. /*
  84. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  85. * into the first byte, depending on how many bytes follow. There are
  86. * as many entries in this table as there are UTF-8 sequence types.
  87. * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
  88. * for *legal* UTF-8 will be 4 or fewer bytes total.
  89. */
  90. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  91. /* --------------------------------------------------------------------- */
  92. /* The interface converts a whole buffer to avoid function-call overhead.
  93. * Constants have been gathered. Loops & conditionals have been removed as
  94. * much as possible for efficiency, in favor of drop-through switches.
  95. * (See "Note A" at the bottom of the file for equivalent code.)
  96. * If your compiler supports it, the "isLegalUTF8" call can be turned
  97. * into an inline function.
  98. */
  99. /* --------------------------------------------------------------------- */
  100. ConversionResult ConvertUTF32toUTF16 (
  101. const UTF32** sourceStart, const UTF32* sourceEnd,
  102. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  103. ConversionResult result = conversionOK;
  104. const UTF32* source = *sourceStart;
  105. UTF16* target = *targetStart;
  106. while (source < sourceEnd) {
  107. UTF32 ch;
  108. if (target >= targetEnd) {
  109. result = targetExhausted; break;
  110. }
  111. ch = *source++;
  112. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  113. /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  114. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  115. if (flags == strictConversion) {
  116. --source; /* return to the illegal value itself */
  117. result = sourceIllegal;
  118. break;
  119. } else {
  120. *target++ = UNI_REPLACEMENT_CHAR;
  121. }
  122. } else {
  123. *target++ = (UTF16)ch; /* normal case */
  124. }
  125. } else if (ch > UNI_MAX_LEGAL_UTF32) {
  126. if (flags == strictConversion) {
  127. result = sourceIllegal;
  128. } else {
  129. *target++ = UNI_REPLACEMENT_CHAR;
  130. }
  131. } else {
  132. /* target is a character in range 0xFFFF - 0x10FFFF. */
  133. if (target + 1 >= targetEnd) {
  134. --source; /* Back up source pointer! */
  135. result = targetExhausted; break;
  136. }
  137. ch -= halfBase;
  138. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  139. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  140. }
  141. }
  142. *sourceStart = source;
  143. *targetStart = target;
  144. return result;
  145. }
  146. /* --------------------------------------------------------------------- */
  147. ConversionResult ConvertUTF16toUTF32 (
  148. const UTF16** sourceStart, const UTF16* sourceEnd,
  149. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  150. ConversionResult result = conversionOK;
  151. const UTF16* source = *sourceStart;
  152. UTF32* target = *targetStart;
  153. UTF32 ch, ch2;
  154. while (source < sourceEnd) {
  155. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  156. ch = *source++;
  157. /* If we have a surrogate pair, convert to UTF32 first. */
  158. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  159. /* If the 16 bits following the high surrogate are in the source buffer... */
  160. if (source < sourceEnd) {
  161. ch2 = *source;
  162. /* If it's a low surrogate, convert to UTF32. */
  163. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  164. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  165. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  166. ++source;
  167. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  168. --source; /* return to the illegal value itself */
  169. result = sourceIllegal;
  170. break;
  171. }
  172. } else { /* We don't have the 16 bits following the high surrogate. */
  173. --source; /* return to the high surrogate */
  174. result = sourceExhausted;
  175. break;
  176. }
  177. } else if (flags == strictConversion) {
  178. /* UTF-16 surrogate values are illegal in UTF-32 */
  179. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  180. --source; /* return to the illegal value itself */
  181. result = sourceIllegal;
  182. break;
  183. }
  184. }
  185. if (target >= targetEnd) {
  186. source = oldSource; /* Back up source pointer! */
  187. result = targetExhausted; break;
  188. }
  189. *target++ = ch;
  190. }
  191. *sourceStart = source;
  192. *targetStart = target;
  193. #ifdef CVTUTF_DEBUG
  194. if (result == sourceIllegal) {
  195. fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
  196. fflush(stderr);
  197. }
  198. #endif
  199. return result;
  200. }
  201. ConversionResult ConvertUTF16toUTF8 (
  202. const UTF16** sourceStart, const UTF16* sourceEnd,
  203. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  204. ConversionResult result = conversionOK;
  205. const UTF16* source = *sourceStart;
  206. UTF8* target = *targetStart;
  207. while (source < sourceEnd) {
  208. UTF32 ch;
  209. unsigned short bytesToWrite = 0;
  210. const UTF32 byteMask = 0xBF;
  211. const UTF32 byteMark = 0x80;
  212. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  213. ch = *source++;
  214. /* If we have a surrogate pair, convert to UTF32 first. */
  215. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  216. /* If the 16 bits following the high surrogate are in the source buffer... */
  217. if (source < sourceEnd) {
  218. UTF32 ch2 = *source;
  219. /* If it's a low surrogate, convert to UTF32. */
  220. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  221. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  222. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  223. ++source;
  224. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  225. --source; /* return to the illegal value itself */
  226. result = sourceIllegal;
  227. break;
  228. }
  229. } else { /* We don't have the 16 bits following the high surrogate. */
  230. --source; /* return to the high surrogate */
  231. result = sourceExhausted;
  232. break;
  233. }
  234. } else if (flags == strictConversion) {
  235. /* UTF-16 surrogate values are illegal in UTF-32 */
  236. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  237. --source; /* return to the illegal value itself */
  238. result = sourceIllegal;
  239. break;
  240. }
  241. }
  242. /* Figure out how many bytes the result will require */
  243. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  244. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  245. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  246. } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
  247. } else { bytesToWrite = 3;
  248. ch = UNI_REPLACEMENT_CHAR;
  249. }
  250. target += bytesToWrite;
  251. if (target > targetEnd) {
  252. source = oldSource; /* Back up source pointer! */
  253. target -= bytesToWrite; result = targetExhausted; break;
  254. }
  255. switch (bytesToWrite) { /* note: everything falls through. */
  256. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  257. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  258. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  259. case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
  260. }
  261. target += bytesToWrite;
  262. }
  263. *sourceStart = source;
  264. *targetStart = target;
  265. return result;
  266. }
  267. /* --------------------------------------------------------------------- */
  268. ConversionResult ConvertUTF32toUTF8 (
  269. const UTF32** sourceStart, const UTF32* sourceEnd,
  270. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  271. ConversionResult result = conversionOK;
  272. const UTF32* source = *sourceStart;
  273. UTF8* target = *targetStart;
  274. while (source < sourceEnd) {
  275. UTF32 ch;
  276. unsigned short bytesToWrite = 0;
  277. const UTF32 byteMask = 0xBF;
  278. const UTF32 byteMark = 0x80;
  279. ch = *source++;
  280. if (flags == strictConversion ) {
  281. /* UTF-16 surrogate values are illegal in UTF-32 */
  282. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  283. --source; /* return to the illegal value itself */
  284. result = sourceIllegal;
  285. break;
  286. }
  287. }
  288. /*
  289. * Figure out how many bytes the result will require. Turn any
  290. * illegally large UTF32 things (> Plane 17) into replacement chars.
  291. */
  292. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  293. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  294. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  295. } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
  296. } else { bytesToWrite = 3;
  297. ch = UNI_REPLACEMENT_CHAR;
  298. result = sourceIllegal;
  299. }
  300. target += bytesToWrite;
  301. if (target > targetEnd) {
  302. --source; /* Back up source pointer! */
  303. target -= bytesToWrite; result = targetExhausted; break;
  304. }
  305. switch (bytesToWrite) { /* note: everything falls through. */
  306. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  307. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  308. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  309. case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
  310. }
  311. target += bytesToWrite;
  312. }
  313. *sourceStart = source;
  314. *targetStart = target;
  315. return result;
  316. }
  317. /* --------------------------------------------------------------------- */
  318. /*
  319. * Utility routine to tell whether a sequence of bytes is legal UTF-8.
  320. * This must be called with the length pre-determined by the first byte.
  321. * If not calling this from ConvertUTF8to*, then the length can be set by:
  322. * length = trailingBytesForUTF8[*source]+1;
  323. * and the sequence is illegal right away if there aren't that many bytes
  324. * available.
  325. * If presented with a length > 4, this returns false. The Unicode
  326. * definition of UTF-8 goes up to 4-byte sequences.
  327. */
  328. static Boolean isLegalUTF8(const UTF8 *source, int length) {
  329. UTF8 a;
  330. const UTF8 *srcptr = source+length;
  331. switch (length) {
  332. default: return false;
  333. /* Everything else falls through when "true"... */
  334. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  335. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  336. case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  337. switch (*source) {
  338. /* no fall-through in this inner switch */
  339. case 0xE0: if (a < 0xA0) return false; break;
  340. case 0xED: if (a > 0x9F) return false; break;
  341. case 0xF0: if (a < 0x90) return false; break;
  342. case 0xF4: if (a > 0x8F) return false; break;
  343. default: if (a < 0x80) return false;
  344. }
  345. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  346. }
  347. if (*source > 0xF4) return false;
  348. return true;
  349. }
  350. /* --------------------------------------------------------------------- */
  351. /*
  352. * Exported function to return whether a UTF-8 sequence is legal or not.
  353. * This is not used here; it's just exported.
  354. */
  355. Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
  356. int length = trailingBytesForUTF8[*source]+1;
  357. if (length > sourceEnd - source) {
  358. return false;
  359. }
  360. return isLegalUTF8(source, length);
  361. }
  362. /* --------------------------------------------------------------------- */
  363. static unsigned
  364. findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
  365. const UTF8 *sourceEnd) {
  366. UTF8 b1, b2, b3;
  367. assert(!isLegalUTF8Sequence(source, sourceEnd));
  368. /*
  369. * Unicode 6.3.0, D93b:
  370. *
  371. * Maximal subpart of an ill-formed subsequence: The longest code unit
  372. * subsequence starting at an unconvertible offset that is either:
  373. * a. the initial subsequence of a well-formed code unit sequence, or
  374. * b. a subsequence of length one.
  375. */
  376. if (source == sourceEnd)
  377. return 0;
  378. /*
  379. * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
  380. * Byte Sequences.
  381. */
  382. b1 = *source;
  383. ++source;
  384. if (b1 >= 0xC2 && b1 <= 0xDF) {
  385. /*
  386. * First byte is valid, but we know that this code unit sequence is
  387. * invalid, so the maximal subpart has to end after the first byte.
  388. */
  389. return 1;
  390. }
  391. if (source == sourceEnd)
  392. return 1;
  393. b2 = *source;
  394. ++source;
  395. if (b1 == 0xE0) {
  396. return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
  397. }
  398. if (b1 >= 0xE1 && b1 <= 0xEC) {
  399. return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
  400. }
  401. if (b1 == 0xED) {
  402. return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
  403. }
  404. if (b1 >= 0xEE && b1 <= 0xEF) {
  405. return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
  406. }
  407. if (b1 == 0xF0) {
  408. if (b2 >= 0x90 && b2 <= 0xBF) {
  409. if (source == sourceEnd)
  410. return 2;
  411. b3 = *source;
  412. return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
  413. }
  414. return 1;
  415. }
  416. if (b1 >= 0xF1 && b1 <= 0xF3) {
  417. if (b2 >= 0x80 && b2 <= 0xBF) {
  418. if (source == sourceEnd)
  419. return 2;
  420. b3 = *source;
  421. return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
  422. }
  423. return 1;
  424. }
  425. if (b1 == 0xF4) {
  426. if (b2 >= 0x80 && b2 <= 0x8F) {
  427. if (source == sourceEnd)
  428. return 2;
  429. b3 = *source;
  430. return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
  431. }
  432. return 1;
  433. }
  434. assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
  435. /*
  436. * There are no valid sequences that start with these bytes. Maximal subpart
  437. * is defined to have length 1 in these cases.
  438. */
  439. return 1;
  440. }
  441. /* --------------------------------------------------------------------- */
  442. /*
  443. * Exported function to return the total number of bytes in a codepoint
  444. * represented in UTF-8, given the value of the first byte.
  445. */
  446. unsigned getNumBytesForUTF8(UTF8 first) {
  447. return trailingBytesForUTF8[first] + 1;
  448. }
  449. /* --------------------------------------------------------------------- */
  450. /*
  451. * Exported function to return whether a UTF-8 string is legal or not.
  452. * This is not used here; it's just exported.
  453. */
  454. Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
  455. while (*source != sourceEnd) {
  456. int length = trailingBytesForUTF8[**source] + 1;
  457. if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
  458. return false;
  459. *source += length;
  460. }
  461. return true;
  462. }
  463. /* --------------------------------------------------------------------- */
  464. ConversionResult ConvertUTF8toUTF16 (
  465. const UTF8** sourceStart, const UTF8* sourceEnd,
  466. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  467. ConversionResult result = conversionOK;
  468. const UTF8* source = *sourceStart;
  469. UTF16* target = *targetStart;
  470. while (source < sourceEnd) {
  471. UTF32 ch = 0;
  472. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  473. if (extraBytesToRead >= sourceEnd - source) {
  474. result = sourceExhausted; break;
  475. }
  476. /* Do this check whether lenient or strict */
  477. if (!isLegalUTF8(source, extraBytesToRead+1)) {
  478. result = sourceIllegal;
  479. break;
  480. }
  481. /*
  482. * The cases all fall through. See "Note A" below.
  483. */
  484. switch (extraBytesToRead) {
  485. case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  486. case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  487. case 3: ch += *source++; ch <<= 6;
  488. case 2: ch += *source++; ch <<= 6;
  489. case 1: ch += *source++; ch <<= 6;
  490. case 0: ch += *source++;
  491. }
  492. ch -= offsetsFromUTF8[extraBytesToRead];
  493. if (target >= targetEnd) {
  494. source -= (extraBytesToRead+1); /* Back up source pointer! */
  495. result = targetExhausted; break;
  496. }
  497. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  498. /* UTF-16 surrogate values are illegal in UTF-32 */
  499. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  500. if (flags == strictConversion) {
  501. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  502. result = sourceIllegal;
  503. break;
  504. } else {
  505. *target++ = UNI_REPLACEMENT_CHAR;
  506. }
  507. } else {
  508. *target++ = (UTF16)ch; /* normal case */
  509. }
  510. } else if (ch > UNI_MAX_UTF16) {
  511. if (flags == strictConversion) {
  512. result = sourceIllegal;
  513. source -= (extraBytesToRead+1); /* return to the start */
  514. break; /* Bail out; shouldn't continue */
  515. } else {
  516. *target++ = UNI_REPLACEMENT_CHAR;
  517. }
  518. } else {
  519. /* target is a character in range 0xFFFF - 0x10FFFF. */
  520. if (target + 1 >= targetEnd) {
  521. source -= (extraBytesToRead+1); /* Back up source pointer! */
  522. result = targetExhausted; break;
  523. }
  524. ch -= halfBase;
  525. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  526. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  527. }
  528. }
  529. *sourceStart = source;
  530. *targetStart = target;
  531. return result;
  532. }
  533. /* --------------------------------------------------------------------- */
  534. static ConversionResult ConvertUTF8toUTF32Impl(
  535. const UTF8** sourceStart, const UTF8* sourceEnd,
  536. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
  537. Boolean InputIsPartial) {
  538. ConversionResult result = conversionOK;
  539. const UTF8* source = *sourceStart;
  540. UTF32* target = *targetStart;
  541. while (source < sourceEnd) {
  542. UTF32 ch = 0;
  543. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  544. if (extraBytesToRead >= sourceEnd - source) {
  545. if (flags == strictConversion || InputIsPartial) {
  546. result = sourceExhausted;
  547. break;
  548. } else {
  549. result = sourceIllegal;
  550. /*
  551. * Replace the maximal subpart of ill-formed sequence with
  552. * replacement character.
  553. */
  554. source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
  555. sourceEnd);
  556. *target++ = UNI_REPLACEMENT_CHAR;
  557. continue;
  558. }
  559. }
  560. if (target >= targetEnd) {
  561. result = targetExhausted; break;
  562. }
  563. /* Do this check whether lenient or strict */
  564. if (!isLegalUTF8(source, extraBytesToRead+1)) {
  565. result = sourceIllegal;
  566. if (flags == strictConversion) {
  567. /* Abort conversion. */
  568. break;
  569. } else {
  570. /*
  571. * Replace the maximal subpart of ill-formed sequence with
  572. * replacement character.
  573. */
  574. source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
  575. sourceEnd);
  576. *target++ = UNI_REPLACEMENT_CHAR;
  577. continue;
  578. }
  579. }
  580. /*
  581. * The cases all fall through. See "Note A" below.
  582. */
  583. switch (extraBytesToRead) {
  584. case 5: ch += *source++; ch <<= 6;
  585. case 4: ch += *source++; ch <<= 6;
  586. case 3: ch += *source++; ch <<= 6;
  587. case 2: ch += *source++; ch <<= 6;
  588. case 1: ch += *source++; ch <<= 6;
  589. case 0: ch += *source++;
  590. }
  591. ch -= offsetsFromUTF8[extraBytesToRead];
  592. if (ch <= UNI_MAX_LEGAL_UTF32) {
  593. /*
  594. * UTF-16 surrogate values are illegal in UTF-32, and anything
  595. * over Plane 17 (> 0x10FFFF) is illegal.
  596. */
  597. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  598. if (flags == strictConversion) {
  599. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  600. result = sourceIllegal;
  601. break;
  602. } else {
  603. *target++ = UNI_REPLACEMENT_CHAR;
  604. }
  605. } else {
  606. *target++ = ch;
  607. }
  608. } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
  609. result = sourceIllegal;
  610. *target++ = UNI_REPLACEMENT_CHAR;
  611. }
  612. }
  613. *sourceStart = source;
  614. *targetStart = target;
  615. return result;
  616. }
  617. ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
  618. const UTF8 *sourceEnd,
  619. UTF32 **targetStart,
  620. UTF32 *targetEnd,
  621. ConversionFlags flags) {
  622. return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
  623. flags, /*InputIsPartial=*/true);
  624. }
  625. ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
  626. const UTF8 *sourceEnd, UTF32 **targetStart,
  627. UTF32 *targetEnd, ConversionFlags flags) {
  628. return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
  629. flags, /*InputIsPartial=*/false);
  630. }
  631. /* ---------------------------------------------------------------------
  632. Note A.
  633. The fall-through switches in UTF-8 reading code save a
  634. temp variable, some decrements & conditionals. The switches
  635. are equivalent to the following loop:
  636. {
  637. int tmpBytesToRead = extraBytesToRead+1;
  638. do {
  639. ch += *source++;
  640. --tmpBytesToRead;
  641. if (tmpBytesToRead) ch <<= 6;
  642. } while (tmpBytesToRead > 0);
  643. }
  644. In UTF-8 writing code, the switches on "bytesToWrite" are
  645. similarly unrolled loops.
  646. --------------------------------------------------------------------- */