UTF8.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641
  1. #pragma warning(disable:4333)
  2. /*
  3. Basic UTF-8 manipulation routines
  4. by Jeff Bezanson
  5. placed in the public domain Fall 2005
  6. This code is designed to provide the utilities you need to manipulate
  7. UTF-8 as an internal string encoding. These functions do not perform the
  8. error checking normally needed when handling UTF-8 data, so if you happen
  9. to be from the Unicode Consortium you will want to flay me alive.
  10. I do this because error checking can be performed at the boundaries (I/O),
  11. with these routines reserved for higher performance on data known to be
  12. valid.
  13. */
  14. #include <stdlib.h>
  15. #include <stdio.h>
  16. #include <string.h>
  17. #include <stdarg.h>
  18. #ifdef WIN32
  19. #include <malloc.h>
  20. #else
  21. //#include <alloca.h>
  22. #endif
  23. #include "util/UTF8.h"
  24. USING_NS_BF;
  25. static const uint32 offsetsFromUTF8[6] = {
  26. 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  27. 0x03C82080UL, 0xFA082080UL, 0x82082080UL
  28. };
  /* Number of bytes that follow a given first byte of a sequence, indexed by
     that byte's value (0x00..0xFF). One row per high nibble. */
  static const char trailingBytesForUTF8[256] = {
      /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  };
  39. /* returns length of next utf-8 sequence */
  40. int Beefy::u8_seqlen(char *s)
  41. {
  42. return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
  43. }
  44. int Beefy::u8_seqlen(uint32 ch)
  45. {
  46. if (ch < 0x80) {
  47. return 1;
  48. }
  49. else if (ch < 0x800) {
  50. return 2;
  51. }
  52. else if (ch < 0x10000) {
  53. return 3;
  54. }
  55. else if (ch < 0x110000) {
  56. return 4;
  57. }
  58. return 5;
  59. }
  60. /* conversions without error checking
  61. only works for valid UTF-8, i.e. no 5- or 6-byte sequences
  62. srcsz = source size in bytes, or -1 if 0-terminated
  63. sz = dest size in # of wide characters
  64. returns # characters converted
  65. dest will always be L'\0'-terminated, even if there isn't enough room
  66. for all the characters.
  67. if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
  68. */
  69. int Beefy::u8_toucs(wchar_t *dest, int sz, char *src, int srcsz)
  70. {
  71. wchar_t ch;
  72. char *src_end = src + srcsz;
  73. int nb;
  74. int i=0;
  75. while (i < sz-1) {
  76. nb = trailingBytesForUTF8[(unsigned char)*src];
  77. if (srcsz == -1) {
  78. if (*src == 0)
  79. goto done_toucs;
  80. }
  81. else {
  82. if (src + nb >= src_end)
  83. goto done_toucs;
  84. }
  85. ch = 0;
  86. switch (nb) {
  87. /* these fall through deliberately */
  88. case 3: ch += (unsigned char)*src++; ch <<= 6;
  89. case 2: ch += (unsigned char)*src++; ch <<= 6;
  90. case 1: ch += (unsigned char)*src++; ch <<= 6;
  91. case 0: ch += (unsigned char)*src++;
  92. }
  93. ch -= offsetsFromUTF8[nb];
  94. dest[i++] = ch;
  95. }
  96. done_toucs:
  97. dest[i] = 0;
  98. return i;
  99. }
  100. uint32 Beefy::u8_toucs(const char* src, int srcsz, int* outLen)
  101. {
  102. const char *src_end = src + srcsz;
  103. int nb = trailingBytesForUTF8[(unsigned char)*src];
  104. if (outLen != NULL)
  105. *outLen = nb + 1;
  106. if (srcsz == -1) {
  107. if (*src == 0)
  108. return 0;
  109. }
  110. else {
  111. if (src + nb >= src_end)
  112. return 0;
  113. }
  114. uint32 ch = 0;
  115. switch (nb) {
  116. /* these fall through deliberately */
  117. case 3: ch += (unsigned char)*src++; ch <<= 6;
  118. case 2: ch += (unsigned char)*src++; ch <<= 6;
  119. case 1: ch += (unsigned char)*src++; ch <<= 6;
  120. case 0: ch += (unsigned char)*src++;
  121. }
  122. ch -= offsetsFromUTF8[nb];
  123. return ch;
  124. }
  125. /* srcsz = number of source characters, or -1 if 0-terminated
  126. sz = size of dest buffer in bytes
  127. returns # characters converted
  128. dest will only be '\0'-terminated if there is enough space. this is
  129. for consistency; imagine there are 2 bytes of space left, but the next
  130. character requires 3 bytes. in this case we could NUL-terminate, but in
  131. general we can't when there's insufficient space. therefore this function
  132. only NUL-terminates if all the characters fit, and there's space for
  133. the NUL as well.
  134. the destination string will never be bigger than the source string.
  135. */
  136. int Beefy::u8_toutf8(char *dest, int sz, wchar_t *src, int srcsz)
  137. {
  138. wchar_t ch;
  139. int i = 0;
  140. char *dest_end = dest + sz;
  141. while (srcsz<0 ? src[i]!=0 : i < srcsz) {
  142. ch = src[i];
  143. if (ch < 0x80) {
  144. if (dest >= dest_end)
  145. return i;
  146. *dest++ = (char)ch;
  147. }
  148. else if (ch < 0x800) {
  149. if (dest >= dest_end-1)
  150. return i;
  151. *dest++ = (ch>>6) | 0xC0;
  152. *dest++ = (ch & 0x3F) | 0x80;
  153. }
  154. else if (ch < 0x10000) {
  155. if (dest >= dest_end-2)
  156. return i;
  157. *dest++ = (ch>>12) | 0xE0;
  158. *dest++ = ((ch>>6) & 0x3F) | 0x80;
  159. *dest++ = (ch & 0x3F) | 0x80;
  160. }
  161. else if (ch < 0x110000) {
  162. if (dest >= dest_end-3)
  163. return i;
  164. *dest++ = (ch>>18) | 0xF0;
  165. *dest++ = ((ch>>12) & 0x3F) | 0x80;
  166. *dest++ = ((ch>>6) & 0x3F) | 0x80;
  167. *dest++ = (ch & 0x3F) | 0x80;
  168. }
  169. i++;
  170. }
  171. if (dest < dest_end)
  172. *dest = '\0';
  173. return i;
  174. }
  175. int Beefy::u8_toutf8(char *dest, int sz, uint32 ch)
  176. {
  177. char *dest_end = dest + sz;
  178. int len = 0;
  179. if (ch < 0x80) {
  180. if (dest >= dest_end)
  181. return 1;
  182. len = 1;
  183. *dest++ = (char)ch;
  184. }
  185. else if (ch < 0x800) {
  186. if (dest >= dest_end - 1)
  187. return 2;
  188. len = 2;
  189. *dest++ = (ch >> 6) | 0xC0;
  190. *dest++ = (ch & 0x3F) | 0x80;
  191. }
  192. else if (ch < 0x10000) {
  193. if (dest >= dest_end - 2)
  194. return 3;
  195. len = 3;
  196. *dest++ = (ch >> 12) | 0xE0;
  197. *dest++ = ((ch >> 6) & 0x3F) | 0x80;
  198. *dest++ = (ch & 0x3F) | 0x80;
  199. }
  200. else if (ch < 0x110000) {
  201. if (dest >= dest_end - 3)
  202. return 4;
  203. len = 4;
  204. *dest++ = (ch >> 18) | 0xF0;
  205. *dest++ = ((ch >> 12) & 0x3F) | 0x80;
  206. *dest++ = ((ch >> 6) & 0x3F) | 0x80;
  207. *dest++ = (ch & 0x3F) | 0x80;
  208. }
  209. if (dest < dest_end)
  210. *dest = '\0';
  211. return len;
  212. }
  213. int Beefy::u8_wc_toutf8(char *dest, uint32 ch)
  214. {
  215. if (ch < 0x80) {
  216. dest[0] = (char)ch;
  217. return 1;
  218. }
  219. if (ch < 0x800) {
  220. dest[0] = (ch>>6) | 0xC0;
  221. dest[1] = (ch & 0x3F) | 0x80;
  222. return 2;
  223. }
  224. if (ch < 0x10000) {
  225. dest[0] = (ch>>12) | 0xE0;
  226. dest[1] = ((ch>>6) & 0x3F) | 0x80;
  227. dest[2] = (ch & 0x3F) | 0x80;
  228. return 3;
  229. }
  230. if (ch < 0x110000) {
  231. dest[0] = (ch>>18) | 0xF0;
  232. dest[1] = ((ch>>12) & 0x3F) | 0x80;
  233. dest[2] = ((ch>>6) & 0x3F) | 0x80;
  234. dest[3] = (ch & 0x3F) | 0x80;
  235. return 4;
  236. }
  237. return 0;
  238. }
  239. /* charnum => byte offset */
  240. int Beefy::u8_offset(char *str, int charnum)
  241. {
  242. int offs=0;
  243. while (charnum > 0 && str[offs]) {
  244. (void)(isutf(str[++offs]) || isutf(str[++offs]) ||
  245. isutf(str[++offs]) || ++offs);
  246. charnum--;
  247. }
  248. return offs;
  249. }
  250. /* byte offset => charnum */
  251. int Beefy::u8_charnum(char *s, int offset)
  252. {
  253. int charnum = 0, offs=0;
  254. while (offs < offset && s[offs]) {
  255. (void)(isutf(s[++offs]) || isutf(s[++offs]) ||
  256. isutf(s[++offs]) || ++offs);
  257. charnum++;
  258. }
  259. return charnum;
  260. }
  261. /* number of characters */
  262. int Beefy::u8_strlen(char *s)
  263. {
  264. int count = 0;
  265. int i = 0;
  266. while (u8_nextchar(s, &i) != 0)
  267. count++;
  268. return count;
  269. }
  270. /* reads the next utf-8 sequence out of a string, updating an index */
  271. uint32 Beefy::u8_nextchar(char *s, int *i)
  272. {
  273. uint32 ch = 0;
  274. int sz = 0;
  275. do {
  276. ch <<= 6;
  277. ch += (unsigned char)s[(*i)++];
  278. sz++;
  279. } while ((ch != 0) && s[*i] && !isutf(s[*i]));
  280. ch -= offsetsFromUTF8[sz-1];
  281. return ch;
  282. }
  283. void Beefy::u8_inc(char *s, int *i)
  284. {
  285. (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
  286. isutf(s[++(*i)]) || ++(*i));
  287. }
  288. void Beefy::u8_dec(char *s, int *i)
  289. {
  290. (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) ||
  291. isutf(s[--(*i)]) || --(*i));
  292. }
  293. int Beefy::octal_digit(char c)
  294. {
  295. return (c >= '0' && c <= '7');
  296. }
  297. int Beefy::hex_digit(char c)
  298. {
  299. return ((c >= '0' && c <= '9') ||
  300. (c >= 'A' && c <= 'F') ||
  301. (c >= 'a' && c <= 'f'));
  302. }
  303. /* assumes that src points to the character after a backslash
  304. returns number of input characters processed */
  305. int Beefy::u8_read_escape_sequence(char *str, uint32 *dest)
  306. {
  307. uint32 ch;
  308. char digs[9]="\0\0\0\0\0\0\0\0";
  309. int dno=0, i=1;
  310. ch = (uint32)str[0]; /* take literal character */
  311. if (str[0] == 'n')
  312. ch = L'\n';
  313. else if (str[0] == 't')
  314. ch = L'\t';
  315. else if (str[0] == 'r')
  316. ch = L'\r';
  317. else if (str[0] == 'b')
  318. ch = L'\b';
  319. else if (str[0] == 'f')
  320. ch = L'\f';
  321. else if (str[0] == 'v')
  322. ch = L'\v';
  323. else if (str[0] == 'a')
  324. ch = L'\a';
  325. else if (octal_digit(str[0])) {
  326. i = 0;
  327. do {
  328. digs[dno++] = str[i++];
  329. } while (octal_digit(str[i]) && dno < 3);
  330. ch = (uint32)strtol(digs, NULL, 8);
  331. }
  332. else if (str[0] == 'x') {
  333. while (hex_digit(str[i]) && dno < 2) {
  334. digs[dno++] = str[i++];
  335. }
  336. if (dno > 0)
  337. ch = (uint32)strtol(digs, NULL, 16);
  338. }
  339. else if (str[0] == 'u') {
  340. while (hex_digit(str[i]) && dno < 4) {
  341. digs[dno++] = str[i++];
  342. }
  343. if (dno > 0)
  344. ch = (uint32)strtol(digs, NULL, 16);
  345. }
  346. else if (str[0] == 'U') {
  347. while (hex_digit(str[i]) && dno < 8) {
  348. digs[dno++] = str[i++];
  349. }
  350. if (dno > 0)
  351. ch = (uint32)strtol(digs, NULL, 16);
  352. }
  353. *dest = ch;
  354. return i;
  355. }
  356. /* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
  357. example: u8_unescape(mybuf, 256, "hello\\u220e")
  358. note the double backslash is needed if called on a C string literal */
  359. int Beefy::u8_unescape(char *buf, int sz, char *src)
  360. {
  361. int c=0, amt;
  362. uint32 ch;
  363. char temp[4];
  364. while (*src && c < sz) {
  365. if (*src == '\\') {
  366. src++;
  367. amt = u8_read_escape_sequence(src, &ch);
  368. }
  369. else {
  370. ch = (uint32)*src;
  371. amt = 1;
  372. }
  373. src += amt;
  374. amt = u8_wc_toutf8(temp, ch);
  375. if (amt > sz-c)
  376. break;
  377. memcpy(&buf[c], temp, amt);
  378. c += amt;
  379. }
  380. if (c < sz)
  381. buf[c] = '\0';
  382. return c;
  383. }
  384. #ifdef _WIN32
  385. #define snprintf _snprintf
  386. #pragma warning (disable:4996)
  387. #endif
  388. int Beefy::u8_escape_wchar(char *buf, int sz, uint32 ch)
  389. {
  390. if (ch == L'\n')
  391. return snprintf(buf, sz, "\\n");
  392. else if (ch == L'\t')
  393. return snprintf(buf, sz, "\\t");
  394. else if (ch == L'\r')
  395. return snprintf(buf, sz, "\\r");
  396. else if (ch == L'\b')
  397. return snprintf(buf, sz, "\\b");
  398. else if (ch == L'\f')
  399. return snprintf(buf, sz, "\\f");
  400. else if (ch == L'\v')
  401. return snprintf(buf, sz, "\\v");
  402. else if (ch == L'\a')
  403. return snprintf(buf, sz, "\\a");
  404. else if (ch == L'\\')
  405. return snprintf(buf, sz, "\\\\");
  406. else if (ch < 32 || ch == 0x7f)
  407. return snprintf(buf, sz, "\\x%hhX", (unsigned char)ch);
  408. else if (ch > 0xFFFF)
  409. return snprintf(buf, sz, "\\U%.8X", (uint32)ch);
  410. else if (ch >= 0x80 && ch <= 0xFFFF)
  411. return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch);
  412. return snprintf(buf, sz, "%c", (char)ch);
  413. }
  414. int Beefy::u8_escape(char *buf, int sz, char *src, int escape_quotes)
  415. {
  416. int c=0, i=0, amt;
  417. while (src[i] && c < sz) {
  418. if (escape_quotes && src[i] == '"') {
  419. amt = snprintf(buf, sz - c, "\\\"");
  420. i++;
  421. }
  422. else {
  423. amt = u8_escape_wchar(buf, sz - c, u8_nextchar(src, &i));
  424. }
  425. c += amt;
  426. buf += amt;
  427. }
  428. if (c < sz)
  429. *buf = '\0';
  430. return c;
  431. }
  432. char* Beefy::u8_strchr(char *s, uint32 ch, int *charn)
  433. {
  434. int i = 0, lasti=0;
  435. uint32 c;
  436. *charn = 0;
  437. while (s[i]) {
  438. c = u8_nextchar(s, &i);
  439. if (c == ch) {
  440. return &s[lasti];
  441. }
  442. lasti = i;
  443. (*charn)++;
  444. }
  445. return NULL;
  446. }
  447. char* Beefy::u8_memchr(char *s, uint32 ch, size_t sz, int *charn)
  448. {
  449. int i = 0, lasti=0;
  450. uint32 c;
  451. int csz;
  452. *charn = 0;
  453. while (i < (int)sz) {
  454. c = csz = 0;
  455. do {
  456. c <<= 6;
  457. c += (unsigned char)s[i++];
  458. csz++;
  459. } while (i < (int)sz && !isutf(s[i]));
  460. c -= offsetsFromUTF8[csz-1];
  461. if (c == ch) {
  462. return &s[lasti];
  463. }
  464. lasti = i;
  465. (*charn)++;
  466. }
  467. return NULL;
  468. }
  469. int Beefy::u8_is_locale_utf8(char *locale)
  470. {
  471. /* this code based on libutf8 */
  472. const char* cp = locale;
  473. for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) {
  474. if (*cp == '.') {
  475. const char* encoding = ++cp;
  476. for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++)
  477. ;
  478. if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5))
  479. || (cp-encoding == 4 && !strncmp(encoding, "utf8", 4)))
  480. return 1; /* it's UTF-8 */
  481. break;
  482. }
  483. }
  484. return 0;
  485. }
  486. int Beefy::u8_vprintf(char *fmt, va_list ap)
  487. {
  488. int cnt, sz=0;
  489. char *buf;
  490. wchar_t *wcs;
  491. sz = 512;
  492. buf = (char*)alloca(sz);
  493. try_print:
  494. cnt = vsnprintf(buf, sz, fmt, ap);
  495. if (cnt >= sz) {
  496. buf = (char*)alloca(cnt - sz + 1);
  497. sz = cnt + 1;
  498. goto try_print;
  499. }
  500. wcs = (wchar_t*)alloca((cnt+1) * sizeof(wchar_t));
  501. cnt = u8_toucs(wcs, cnt+1, buf, cnt);
  502. printf("%ls", (wchar_t*)wcs);
  503. return cnt;
  504. }
  505. int Beefy::u8_printf(char *fmt, ...)
  506. {
  507. int cnt;
  508. va_list args;
  509. va_start(args, fmt);
  510. cnt = u8_vprintf(fmt, args);
  511. va_end(args);
  512. return cnt;
  513. }
  514. bool Beefy::UTF8IsCombiningMark(uint32 c)
  515. {
  516. return ((c >= 0x0300) && (c <= 0x036F)) || ((c >= 0x1DC0) && (c <= 0x1DFF));
  517. }
  518. bool Beefy::UTF8GetGraphemeClusterSpan(const char* str, int strLength, int idx, int& startIdx, int& spanLength)
  519. {
  520. const char* ptr = str;
  521. // Move to start of char
  522. while (startIdx >= 0)
  523. {
  524. char c = ptr[startIdx];
  525. if (((uint8)c & 0x80) == 0)
  526. {
  527. // This is the simple and fast case - ASCII followed by the string end or more ASCII
  528. if ((startIdx == strLength - 1) || (((uint8)ptr[startIdx + 1] & 0x80) == 0))
  529. {
  530. spanLength = 1;
  531. return true;
  532. }
  533. break;
  534. }
  535. if (((uint8)c & 0xC0) != 0x80)
  536. {
  537. uint32 c32 = u8_toucs(ptr + startIdx, strLength - startIdx);
  538. if (!UTF8IsCombiningMark(c32))
  539. break;
  540. }
  541. startIdx--;
  542. }
  543. int curIdx = startIdx;
  544. while (true)
  545. {
  546. int cLen = 0;
  547. uint32 c32 = u8_toucs(ptr + startIdx, strLength - startIdx, &cLen);
  548. int nextIdx = curIdx + cLen;
  549. if ((curIdx != startIdx) && (!UTF8IsCombiningMark(c32)))
  550. {
  551. spanLength = curIdx - startIdx;
  552. return true;
  553. }
  554. if (nextIdx == strLength)
  555. {
  556. spanLength = nextIdx - startIdx;
  557. return false;
  558. }
  559. curIdx = nextIdx;
  560. }
  561. }
  562. void Beefy::UTF8Categorize(const char* str, int strLength, int& numCodePoints, int& numCombiningMarks)
  563. {
  564. numCodePoints = 0;
  565. numCombiningMarks = 0;
  566. int offset = 0;
  567. while (offset < strLength)
  568. {
  569. int cLen = 0;
  570. uint32 c32 = u8_toucs(str + offset, strLength - offset, &cLen);
  571. numCodePoints++;
  572. if (UTF8IsCombiningMark(c32))
  573. numCombiningMarks++;
  574. offset += cLen;
  575. }
  576. }