utf8.cpp 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. #include "utf8/utf8.h"
  2. namespace utf8 {
  /** True when byte c is NOT a UTF-8 continuation byte (10xxxxxx), i.e. when
      it may begin a sequence. ASCII bytes and every 11xxxxxx byte qualify;
      no further validation of the lead byte is done. */
  #define isutf(c) (!(((c) & 0xC0) == 0x80))
  5. UCS4 decode(const char *&src, const char *src_end)
  6. {
  7. const char* start = src;
  8. if (src == src_end)
  9. return 0;
  10. unsigned char ch1 = *(src++);
  11. if (ch1 <= 0x7F)
  12. return ch1;
  13. // should not have continuation character here
  14. if ((ch1 & 0xC0) != 0x80 && src < src_end)
  15. {
  16. unsigned char ch2 = *(src++);
  17. // should be continuation
  18. if ((ch2 & 0xC0) != 0x80)
  19. goto invalid;
  20. if ((ch1 & 0xE0) == 0xC0)
  21. {
  22. if ((ch2 & 0xC0) == 0x80)
  23. {
  24. unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
  25. if (rv >= 0x80)
  26. return rv;
  27. }
  28. goto invalid;
  29. }
  30. if (src < src_end)
  31. {
  32. unsigned char ch3 = *(src++);
  33. // should be continuation
  34. if ((ch3 & 0xC0) != 0x80)
  35. goto invalid;
  36. if ((ch1 & 0xF0) == 0xE0)
  37. {
  38. unsigned rv = ((ch1 & 0x0F) << 12)
  39. + ((ch2 & 0x3F) << 6)
  40. + (ch3 & 0x3F);
  41. if (rv <= 0x800)
  42. goto invalid;
  43. return rv;
  44. }
  45. if (src < src_end)
  46. {
  47. unsigned char ch4 = *(src++);
  48. if ((ch4 & 0xC0) != 0x80)
  49. goto invalid;
  50. UCS4 rv = ((ch1 & 0x07) << 18)
  51. + ((ch2 & 0x3F) << 12)
  52. + ((ch3 & 0x3F) << 6)
  53. + (ch4 & 0x3F);
  54. if (rv > 0xFFFF)
  55. return rv;
  56. }
  57. }
  58. }
  59. invalid:
  60. src = start;
  61. return 0xFFFF;
  62. }
  63. int encode(UCS4 ch, char *dst)
  64. {
  65. if (ch < 0x80) {
  66. dst[0] = (char) ch;
  67. return 1;
  68. } else if (ch < 0x800) {
  69. dst[0] = (char) (0xC0 + (ch >> 6));
  70. dst[1] = (char) (0x80 + (ch & 0x3F));
  71. return 2;
  72. } else if (ch < 0x10000) {
  73. dst[0] = (char) (0xE0 + (ch >> 12));
  74. dst[1] = (char) (0x80 + ((ch >> 6) & 0x3F));
  75. dst[2] = (char) (0x80 + (ch & 0x3F));
  76. return 3;
  77. } else if (ch <= 0x10FFFF) {
  78. dst[0] = (char) (0xF0 + (ch >> 18));
  79. dst[1] = (char) (0x80 + ((ch >> 12) & 0x3F));
  80. dst[2] = (char) (0x80 + ((ch >> 6) & 0x3F));
  81. dst[3] = (char) (0x80 + (ch & 0x3F));
  82. return 4;
  83. } else {
  84. // output UTF-8 encoding of 0xFFFF
  85. dst[0] = (char)0xEF;
  86. dst[1] = (char)0xBF;
  87. dst[2] = (char)0xBF;
  88. return 3;
  89. }
  90. }
  91. UCS4 decode_next(const char *str, int *i, int i_max)
  92. {
  93. str += *i;
  94. i_max -= *i;
  95. const char *old_str = str;
  96. // Handle wrapping that could happen if the caller use
  97. // something really large for i_max if src is known to
  98. // be null terminated (like TB_ALL_TO_TERMINATION)
  99. const char *str_end = str + i_max;
  100. if (str_end < str)
  101. str_end = (const char *) (-1);
  102. UCS4 ch = decode(str, str_end);
  103. if (ch == 0xFFFF) // Invalid character!
  104. (*i)++;
  105. else
  106. *i += str - old_str;
  107. return ch;
  108. }
  109. void move_inc(const char *str, int *i, int i_max)
  110. {
  111. (void) ((*i < i_max && isutf(str[++(*i)])) ||
  112. (*i < i_max && isutf(str[++(*i)])) ||
  113. (*i < i_max && isutf(str[++(*i)])) ||
  114. (*i < i_max && ++(*i)));
  115. }
  116. void move_dec(const char *str, int *i)
  117. {
  118. (void) ((*i > 0 && isutf(str[--(*i)])) ||
  119. (*i > 0 && isutf(str[--(*i)])) ||
  120. (*i > 0 && isutf(str[--(*i)])) ||
  121. (*i > 0 && --(*i)));
  122. }
  123. int count_characters(const char *str, int i_max)
  124. {
  125. int count = 0;
  126. int i = 0;
  127. while (i < i_max && decode_next(str, &i, i_max))
  128. count++;
  129. return count;
  130. }
  131. }; // namespace utf8