ConvertUTF.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /*
  2. * Copyright 2001-2004 Unicode, Inc.
  3. *
  4. * Disclaimer
  5. *
  6. * This source code is provided as is by Unicode, Inc. No claims are
  7. * made as to fitness for any particular purpose. No warranties of any
  8. * kind are expressed or implied. The recipient agrees to determine
  9. * applicability of information provided. If this file has been
  10. * purchased on magnetic or optical media from Unicode, Inc., the
  11. * sole remedy for any claim will be exchange of defective media
  12. * within 90 days of receipt.
  13. *
  14. * Limitations on Rights to Redistribute This Code
  15. *
  16. * Unicode, Inc. hereby grants the right to freely use the information
  17. * supplied in this file in the creation of products supporting the
  18. * Unicode Standard, and to make copies of this file in any form
  19. * for internal or external distribution as long as this notice
  20. * remains attached.
  21. */
  22. /* ---------------------------------------------------------------------
  23. Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  24. Author: Mark E. Davis, 1994.
  25. Rev History: Rick McGowan, fixes & updates May 2001.
  26. Sept 2001: fixed const & error conditions per
  27. mods suggested by S. Parent & A. Lillich.
  28. June 2002: Tim Dodd added detection and handling of incomplete
  29. source sequences, enhanced error detection, added casts
  30. to eliminate compiler warnings.
  31. July 2003: slight mods to back out aggressive FFFE detection.
  32. Jan 2004: updated switches in from-UTF8 conversions.
  33. Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  34. See the header file "ConvertUTF.h" for complete documentation.
  35. ------------------------------------------------------------------------ */
  36. #include "ConvertUTF.h"
  37. #ifdef CVTUTF_DEBUG
  38. #include <stdio.h>
  39. #endif
  40. static const int halfShift = 10; /* used for shifting by 10 bits */
  41. static const UTF32 halfBase = 0x0010000UL;
  42. static const UTF32 halfMask = 0x3FFUL;
  43. #define UNI_SUR_HIGH_START (UTF32)0xD800
  44. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  45. #define UNI_SUR_LOW_START (UTF32)0xDC00
  46. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  47. #define false 0
  48. #define true 1
  49. /* --------------------------------------------------------------------- */
  50. ConversionResult ConvertUTF32toUTF16 (
  51. const UTF32** sourceStart, const UTF32* sourceEnd,
  52. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  53. ConversionResult result = conversionOK;
  54. const UTF32* source = *sourceStart;
  55. UTF16* target = *targetStart;
  56. while (source < sourceEnd) {
  57. UTF32 ch;
  58. if (target >= targetEnd) {
  59. result = targetExhausted; break;
  60. }
  61. ch = *source++;
  62. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  63. /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  64. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  65. if (flags == strictConversion) {
  66. --source; /* return to the illegal value itself */
  67. result = sourceIllegal;
  68. break;
  69. } else {
  70. *target++ = UNI_REPLACEMENT_CHAR;
  71. }
  72. } else {
  73. *target++ = (UTF16)ch; /* normal case */
  74. }
  75. } else if (ch > UNI_MAX_LEGAL_UTF32) {
  76. if (flags == strictConversion) {
  77. result = sourceIllegal;
  78. } else {
  79. *target++ = UNI_REPLACEMENT_CHAR;
  80. }
  81. } else {
  82. /* target is a character in range 0xFFFF - 0x10FFFF. */
  83. if (target + 1 >= targetEnd) {
  84. --source; /* Back up source pointer! */
  85. result = targetExhausted; break;
  86. }
  87. ch -= halfBase;
  88. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  89. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  90. }
  91. }
  92. *sourceStart = source;
  93. *targetStart = target;
  94. return result;
  95. }
  96. /* --------------------------------------------------------------------- */
  97. ConversionResult ConvertUTF16toUTF32 (
  98. const UTF16** sourceStart, const UTF16* sourceEnd,
  99. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  100. ConversionResult result = conversionOK;
  101. const UTF16* source = *sourceStart;
  102. UTF32* target = *targetStart;
  103. UTF32 ch, ch2;
  104. while (source < sourceEnd) {
  105. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  106. ch = *source++;
  107. /* If we have a surrogate pair, convert to UTF32 first. */
  108. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  109. /* If the 16 bits following the high surrogate are in the source buffer... */
  110. if (source < sourceEnd) {
  111. ch2 = *source;
  112. /* If it's a low surrogate, convert to UTF32. */
  113. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  114. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  115. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  116. ++source;
  117. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  118. --source; /* return to the illegal value itself */
  119. result = sourceIllegal;
  120. break;
  121. }
  122. } else { /* We don't have the 16 bits following the high surrogate. */
  123. --source; /* return to the high surrogate */
  124. result = sourceExhausted;
  125. break;
  126. }
  127. } else if (flags == strictConversion) {
  128. /* UTF-16 surrogate values are illegal in UTF-32 */
  129. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  130. --source; /* return to the illegal value itself */
  131. result = sourceIllegal;
  132. break;
  133. }
  134. }
  135. if (target >= targetEnd) {
  136. source = oldSource; /* Back up source pointer! */
  137. result = targetExhausted; break;
  138. }
  139. *target++ = ch;
  140. }
  141. *sourceStart = source;
  142. *targetStart = target;
  143. #ifdef CVTUTF_DEBUG
  144. if (result == sourceIllegal) {
  145. fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
  146. fflush(stderr);
  147. }
  148. #endif
  149. return result;
  150. }
  151. /* --------------------------------------------------------------------- */
  152. /*
  153. * Index into the table below with the first byte of a UTF-8 sequence to
  154. * get the number of trailing bytes that are supposed to follow it.
  155. * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
  156. * left as-is for anyone who may want to do such conversion, which was
  157. * allowed in earlier algorithms.
  158. */
  159. static const char trailingBytesForUTF8[256] = {
  160. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  161. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  162. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  163. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  164. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  165. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  166. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  167. 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  168. };
  169. /*
  170. * Magic values subtracted from a buffer value during UTF8 conversion.
  171. * This table contains as many values as there might be trailing bytes
  172. * in a UTF-8 sequence.
  173. */
  174. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  175. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  176. /*
  177. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  178. * into the first byte, depending on how many bytes follow. There are
  179. * as many entries in this table as there are UTF-8 sequence types.
  180. * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
  181. * for *legal* UTF-8 will be 4 or fewer bytes total.
  182. */
  183. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  184. /* --------------------------------------------------------------------- */
  185. /* The interface converts a whole buffer to avoid function-call overhead.
  186. * Constants have been gathered. Loops & conditionals have been removed as
  187. * much as possible for efficiency, in favor of drop-through switches.
  188. * (See "Note A" at the bottom of the file for equivalent code.)
  189. * If your compiler supports it, the "isLegalUTF8" call can be turned
  190. * into an inline function.
  191. */
  192. /* --------------------------------------------------------------------- */
  193. ConversionResult ConvertUTF16toUTF8 (
  194. const UTF16** sourceStart, const UTF16* sourceEnd,
  195. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  196. ConversionResult result = conversionOK;
  197. const UTF16* source = *sourceStart;
  198. UTF8* target = *targetStart;
  199. while (source < sourceEnd) {
  200. UTF32 ch;
  201. unsigned short bytesToWrite = 0;
  202. const UTF32 byteMask = 0xBF;
  203. const UTF32 byteMark = 0x80;
  204. const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
  205. ch = *source++;
  206. /* If we have a surrogate pair, convert to UTF32 first. */
  207. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
  208. /* If the 16 bits following the high surrogate are in the source buffer... */
  209. if (source < sourceEnd) {
  210. UTF32 ch2 = *source;
  211. /* If it's a low surrogate, convert to UTF32. */
  212. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
  213. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  214. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  215. ++source;
  216. } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
  217. --source; /* return to the illegal value itself */
  218. result = sourceIllegal;
  219. break;
  220. }
  221. } else { /* We don't have the 16 bits following the high surrogate. */
  222. --source; /* return to the high surrogate */
  223. result = sourceExhausted;
  224. break;
  225. }
  226. } else if (flags == strictConversion) {
  227. /* UTF-16 surrogate values are illegal in UTF-32 */
  228. if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
  229. --source; /* return to the illegal value itself */
  230. result = sourceIllegal;
  231. break;
  232. }
  233. }
  234. /* Figure out how many bytes the result will require */
  235. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  236. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  237. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  238. } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
  239. } else { bytesToWrite = 3;
  240. ch = UNI_REPLACEMENT_CHAR;
  241. }
  242. target += bytesToWrite;
  243. if (target > targetEnd) {
  244. source = oldSource; /* Back up source pointer! */
  245. target -= bytesToWrite; result = targetExhausted; break;
  246. }
  247. switch (bytesToWrite) { /* note: everything falls through. */
  248. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  249. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  250. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  251. case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
  252. }
  253. target += bytesToWrite;
  254. }
  255. *sourceStart = source;
  256. *targetStart = target;
  257. return result;
  258. }
  259. /* --------------------------------------------------------------------- */
  260. /*
  261. * Utility routine to tell whether a sequence of bytes is legal UTF-8.
  262. * This must be called with the length pre-determined by the first byte.
  263. * If not calling this from ConvertUTF8to*, then the length can be set by:
  264. * length = trailingBytesForUTF8[*source]+1;
  265. * and the sequence is illegal right away if there aren't that many bytes
  266. * available.
  267. * If presented with a length > 4, this returns false. The Unicode
  268. * definition of UTF-8 goes up to 4-byte sequences.
  269. */
  270. static Boolean isLegalUTF8(const UTF8 *source, int length) {
  271. UTF8 a;
  272. const UTF8 *srcptr = source+length;
  273. switch (length) {
  274. default: return false;
  275. /* Everything else falls through when "true"... */
  276. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  277. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  278. case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  279. switch (*source) {
  280. /* no fall-through in this inner switch */
  281. case 0xE0: if (a < 0xA0) return false; break;
  282. case 0xED: if (a > 0x9F) return false; break;
  283. case 0xF0: if (a < 0x90) return false; break;
  284. case 0xF4: if (a > 0x8F) return false; break;
  285. default: if (a < 0x80) return false;
  286. }
  287. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  288. }
  289. if (*source > 0xF4) return false;
  290. return true;
  291. }
  292. /* --------------------------------------------------------------------- */
  293. /*
  294. * Exported function to return whether a UTF-8 sequence is legal or not.
  295. * This is not used here; it's just exported.
  296. */
  297. Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
  298. int length = trailingBytesForUTF8[*source]+1;
  299. if (source+length > sourceEnd) {
  300. return false;
  301. }
  302. return isLegalUTF8(source, length);
  303. }
  304. /* --------------------------------------------------------------------- */
  305. ConversionResult ConvertUTF8toUTF16 (
  306. const UTF8** sourceStart, const UTF8* sourceEnd,
  307. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  308. ConversionResult result = conversionOK;
  309. const UTF8* source = *sourceStart;
  310. UTF16* target = *targetStart;
  311. while (source < sourceEnd) {
  312. UTF32 ch = 0;
  313. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  314. if (source + extraBytesToRead >= sourceEnd) {
  315. result = sourceExhausted; break;
  316. }
  317. /* Do this check whether lenient or strict */
  318. if (! isLegalUTF8(source, extraBytesToRead+1)) {
  319. result = sourceIllegal;
  320. break;
  321. }
  322. /*
  323. * The cases all fall through. See "Note A" below.
  324. */
  325. switch (extraBytesToRead) {
  326. case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  327. case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
  328. case 3: ch += *source++; ch <<= 6;
  329. case 2: ch += *source++; ch <<= 6;
  330. case 1: ch += *source++; ch <<= 6;
  331. case 0: ch += *source++;
  332. }
  333. ch -= offsetsFromUTF8[extraBytesToRead];
  334. if (target >= targetEnd) {
  335. source -= (extraBytesToRead+1); /* Back up source pointer! */
  336. result = targetExhausted; break;
  337. }
  338. if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  339. /* UTF-16 surrogate values are illegal in UTF-32 */
  340. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  341. if (flags == strictConversion) {
  342. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  343. result = sourceIllegal;
  344. break;
  345. } else {
  346. *target++ = UNI_REPLACEMENT_CHAR;
  347. }
  348. } else {
  349. *target++ = (UTF16)ch; /* normal case */
  350. }
  351. } else if (ch > UNI_MAX_UTF16) {
  352. if (flags == strictConversion) {
  353. result = sourceIllegal;
  354. source -= (extraBytesToRead+1); /* return to the start */
  355. break; /* Bail out; shouldn't continue */
  356. } else {
  357. *target++ = UNI_REPLACEMENT_CHAR;
  358. }
  359. } else {
  360. /* target is a character in range 0xFFFF - 0x10FFFF. */
  361. if (target + 1 >= targetEnd) {
  362. source -= (extraBytesToRead+1); /* Back up source pointer! */
  363. result = targetExhausted; break;
  364. }
  365. ch -= halfBase;
  366. *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
  367. *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
  368. }
  369. }
  370. *sourceStart = source;
  371. *targetStart = target;
  372. return result;
  373. }
  374. /* --------------------------------------------------------------------- */
  375. ConversionResult ConvertUTF32toUTF8 (
  376. const UTF32** sourceStart, const UTF32* sourceEnd,
  377. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
  378. ConversionResult result = conversionOK;
  379. const UTF32* source = *sourceStart;
  380. UTF8* target = *targetStart;
  381. while (source < sourceEnd) {
  382. UTF32 ch;
  383. unsigned short bytesToWrite = 0;
  384. const UTF32 byteMask = 0xBF;
  385. const UTF32 byteMark = 0x80;
  386. ch = *source++;
  387. if (flags == strictConversion ) {
  388. /* UTF-16 surrogate values are illegal in UTF-32 */
  389. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  390. --source; /* return to the illegal value itself */
  391. result = sourceIllegal;
  392. break;
  393. }
  394. }
  395. /*
  396. * Figure out how many bytes the result will require. Turn any
  397. * illegally large UTF32 things (> Plane 17) into replacement chars.
  398. */
  399. if (ch < (UTF32)0x80) { bytesToWrite = 1;
  400. } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
  401. } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
  402. } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
  403. } else { bytesToWrite = 3;
  404. ch = UNI_REPLACEMENT_CHAR;
  405. result = sourceIllegal;
  406. }
  407. target += bytesToWrite;
  408. if (target > targetEnd) {
  409. --source; /* Back up source pointer! */
  410. target -= bytesToWrite; result = targetExhausted; break;
  411. }
  412. switch (bytesToWrite) { /* note: everything falls through. */
  413. case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  414. case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  415. case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
  416. case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
  417. }
  418. target += bytesToWrite;
  419. }
  420. *sourceStart = source;
  421. *targetStart = target;
  422. return result;
  423. }
  424. /* --------------------------------------------------------------------- */
  425. ConversionResult ConvertUTF8toUTF32 (
  426. const UTF8** sourceStart, const UTF8* sourceEnd,
  427. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  428. ConversionResult result = conversionOK;
  429. const UTF8* source = *sourceStart;
  430. UTF32* target = *targetStart;
  431. while (source < sourceEnd) {
  432. UTF32 ch = 0;
  433. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  434. if (source + extraBytesToRead >= sourceEnd) {
  435. result = sourceExhausted; break;
  436. }
  437. /* Do this check whether lenient or strict */
  438. if (! isLegalUTF8(source, extraBytesToRead+1)) {
  439. result = sourceIllegal;
  440. break;
  441. }
  442. /*
  443. * The cases all fall through. See "Note A" below.
  444. */
  445. switch (extraBytesToRead) {
  446. case 5: ch += *source++; ch <<= 6;
  447. case 4: ch += *source++; ch <<= 6;
  448. case 3: ch += *source++; ch <<= 6;
  449. case 2: ch += *source++; ch <<= 6;
  450. case 1: ch += *source++; ch <<= 6;
  451. case 0: ch += *source++;
  452. }
  453. ch -= offsetsFromUTF8[extraBytesToRead];
  454. if (target >= targetEnd) {
  455. source -= (extraBytesToRead+1); /* Back up the source pointer! */
  456. result = targetExhausted; break;
  457. }
  458. if (ch <= UNI_MAX_LEGAL_UTF32) {
  459. /*
  460. * UTF-16 surrogate values are illegal in UTF-32, and anything
  461. * over Plane 17 (> 0x10FFFF) is illegal.
  462. */
  463. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  464. if (flags == strictConversion) {
  465. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  466. result = sourceIllegal;
  467. break;
  468. } else {
  469. *target++ = UNI_REPLACEMENT_CHAR;
  470. }
  471. } else {
  472. *target++ = ch;
  473. }
  474. } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
  475. result = sourceIllegal;
  476. *target++ = UNI_REPLACEMENT_CHAR;
  477. }
  478. }
  479. *sourceStart = source;
  480. *targetStart = target;
  481. return result;
  482. }
  483. /* ---------------------------------------------------------------------
  484. Note A.
  485. The fall-through switches in UTF-8 reading code save a
  486. temp variable, some decrements & conditionals. The switches
  487. are equivalent to the following loop:
  488. {
  489. int tmpBytesToRead = extraBytesToRead+1;
  490. do {
  491. ch += *source++;
  492. --tmpBytesToRead;
  493. if (tmpBytesToRead) ch <<= 6;
  494. } while (tmpBytesToRead > 0);
  495. }
  496. In UTF-8 writing code, the switches on "bytesToWrite" are
  497. similarly unrolled loops.
  498. --------------------------------------------------------------------- */