unicode.cpp 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. #pragma warning(push)
  2. #pragma warning(disable: 4245)
  3. extern "C" {
  4. #include "utf8proc/utf8proc.c"
  5. }
  6. #pragma warning(pop)
  7. bool rune_is_letter(Rune r) {
  8. if (r < 0x80) {
  9. if (r == '_') {
  10. return true;
  11. }
  12. return ((cast(u32)r | 0x20) - 0x61) < 26;
  13. }
  14. switch (utf8proc_category(r)) {
  15. case UTF8PROC_CATEGORY_LU:
  16. case UTF8PROC_CATEGORY_LL:
  17. case UTF8PROC_CATEGORY_LT:
  18. case UTF8PROC_CATEGORY_LM:
  19. case UTF8PROC_CATEGORY_LO:
  20. return true;
  21. }
  22. return false;
  23. }
  24. bool rune_is_digit(Rune r) {
  25. if (r < 0x80) {
  26. return (cast(u32)r - '0') < 10;
  27. }
  28. return utf8proc_category(r) == UTF8PROC_CATEGORY_ND;
  29. }
  30. bool rune_is_letter_or_digit(Rune r) {
  31. if (r < 0x80) {
  32. if (r == '_') {
  33. return true;
  34. }
  35. if (((cast(u32)r | 0x20) - 0x61) < 26) {
  36. return true;
  37. }
  38. return (cast(u32)r - '0') < 10;
  39. }
  40. switch (utf8proc_category(r)) {
  41. case UTF8PROC_CATEGORY_LU:
  42. case UTF8PROC_CATEGORY_LL:
  43. case UTF8PROC_CATEGORY_LT:
  44. case UTF8PROC_CATEGORY_LM:
  45. case UTF8PROC_CATEGORY_LO:
  46. return true;
  47. case UTF8PROC_CATEGORY_ND:
  48. return true;
  49. }
  50. return false;
  51. }
  52. bool rune_is_whitespace(Rune r) {
  53. switch (r) {
  54. case ' ':
  55. case '\t':
  56. case '\n':
  57. case '\r':
  58. return true;
  59. }
  60. return false;
  61. }
  62. gb_global u8 const global__utf8_first[256] = {
  63. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0F
  64. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1F
  65. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2F
  66. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3F
  67. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4F
  68. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5F
  69. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6F
  70. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7F
  71. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8F
  72. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9F
  73. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xA0-0xAF
  74. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xB0-0xBF
  75. 0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xC0-0xCF
  76. 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xD0-0xDF
  77. 0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xE0-0xEF
  78. 0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xF0-0xFF
  79. };
  80. typedef struct Utf8AcceptRange {
  81. u8 lo, hi;
  82. } Utf8AcceptRange;
  83. gb_global Utf8AcceptRange const global__utf8_accept_ranges[] = {
  84. {0x80, 0xbf},
  85. {0xa0, 0xbf},
  86. {0x80, 0x9f},
  87. {0x90, 0xbf},
  88. {0x80, 0x8f},
  89. };
  90. isize utf8_decode(u8 const *str, isize str_len, Rune *codepoint_out) {
  91. isize width = 0;
  92. Rune codepoint = GB_RUNE_INVALID;
  93. if (str_len > 0) {
  94. u8 s0 = str[0];
  95. u8 x = global__utf8_first[s0], sz;
  96. u8 b1, b2, b3;
  97. Utf8AcceptRange accept;
  98. if (x >= 0xf0) {
  99. Rune mask = (cast(Rune)x << 31) >> 31;
  100. codepoint = (cast(Rune)s0 & (~mask)) | (GB_RUNE_INVALID & mask);
  101. width = 1;
  102. goto end;
  103. }
  104. if (s0 < 0x80) {
  105. codepoint = s0;
  106. width = 1;
  107. goto end;
  108. }
  109. sz = x&7;
  110. accept = global__utf8_accept_ranges[x>>4];
  111. if (str_len < gb_size_of(sz))
  112. goto invalid_codepoint;
  113. b1 = str[1];
  114. if (b1 < accept.lo || accept.hi < b1)
  115. goto invalid_codepoint;
  116. if (sz == 2) {
  117. codepoint = (cast(Rune)s0&0x1f)<<6 | (cast(Rune)b1&0x3f);
  118. width = 2;
  119. goto end;
  120. }
  121. b2 = str[2];
  122. if (!gb_is_between(b2, 0x80, 0xbf))
  123. goto invalid_codepoint;
  124. if (sz == 3) {
  125. codepoint = (cast(Rune)s0&0x1f)<<12 | (cast(Rune)b1&0x3f)<<6 | (cast(Rune)b2&0x3f);
  126. width = 3;
  127. goto end;
  128. }
  129. b3 = str[3];
  130. if (!gb_is_between(b3, 0x80, 0xbf))
  131. goto invalid_codepoint;
  132. codepoint = (cast(Rune)s0&0x07)<<18 | (cast(Rune)b1&0x3f)<<12 | (cast(Rune)b2&0x3f)<<6 | (cast(Rune)b3&0x3f);
  133. width = 4;
  134. goto end;
  135. invalid_codepoint:
  136. codepoint = GB_RUNE_INVALID;
  137. width = 1;
  138. }
  139. end:
  140. if (codepoint_out) *codepoint_out = codepoint;
  141. return width;
  142. }