utf8.odin 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. package utf8
  2. RUNE_ERROR :: '\ufffd';
  3. RUNE_SELF :: 0x80;
  4. RUNE_BOM :: 0xfeff;
  5. RUNE_EOF :: ~rune(0);
  6. MAX_RUNE :: '\U0010ffff';
  7. UTF_MAX :: 4;
  8. SURROGATE_MIN :: 0xd800;
  9. SURROGATE_MAX :: 0xdfff;
  10. T1 :: 0b0000_0000;
  11. TX :: 0b1000_0000;
  12. T2 :: 0b1100_0000;
  13. T3 :: 0b1110_0000;
  14. T4 :: 0b1111_0000;
  15. T5 :: 0b1111_1000;
  16. MASKX :: 0b0011_1111;
  17. MASK2 :: 0b0001_1111;
  18. MASK3 :: 0b0000_1111;
  19. MASK4 :: 0b0000_0111;
  20. RUNE1_MAX :: 1<<7 - 1;
  21. RUNE2_MAX :: 1<<11 - 1;
  22. RUNE3_MAX :: 1<<16 - 1;
  23. // The default lowest and highest continuation byte.
  24. LOCB :: 0b1000_0000;
  25. HICB :: 0b1011_1111;
  26. Accept_Range :: struct {lo, hi: u8};
  27. accept_ranges := [5]Accept_Range{
  28. {0x80, 0xbf},
  29. {0xa0, 0xbf},
  30. {0x80, 0x9f},
  31. {0x90, 0xbf},
  32. {0x80, 0x8f},
  33. };
  34. accept_sizes := [256]u8{
  35. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0f
  36. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1f
  37. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2f
  38. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3f
  39. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4f
  40. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5f
  41. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6f
  42. 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7f
  43. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8f
  44. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9f
  45. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0-0xaf
  46. 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0-0xbf
  47. 0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0-0xcf
  48. 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0-0xdf
  49. 0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0-0xef
  50. 0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0-0xff
  51. };
  52. encode_rune :: proc(r: rune) -> ([4]u8, int) {
  53. buf: [4]u8;
  54. i := u32(r);
  55. mask :: u8(0x3f);
  56. if i <= 1<<7-1 {
  57. buf[0] = u8(r);
  58. return buf, 1;
  59. }
  60. if i <= 1<<11-1 {
  61. buf[0] = 0xc0 | u8(r>>6);
  62. buf[1] = 0x80 | u8(r) & mask;
  63. return buf, 2;
  64. }
  65. // Invalid or Surrogate range
  66. if i > 0x0010ffff ||
  67. (0xd800 <= i && i <= 0xdfff) {
  68. r = 0xfffd;
  69. }
  70. if i <= 1<<16-1 {
  71. buf[0] = 0xe0 | u8(r>>12);
  72. buf[1] = 0x80 | u8(r>>6) & mask;
  73. buf[2] = 0x80 | u8(r) & mask;
  74. return buf, 3;
  75. }
  76. buf[0] = 0xf0 | u8(r>>18);
  77. buf[1] = 0x80 | u8(r>>12) & mask;
  78. buf[2] = 0x80 | u8(r>>6) & mask;
  79. buf[3] = 0x80 | u8(r) & mask;
  80. return buf, 4;
  81. }
  82. decode_rune_from_string :: inline proc(s: string) -> (rune, int) do return decode_rune(cast([]u8)s);
  83. decode_rune :: proc(s: []u8) -> (rune, int) {
  84. n := len(s);
  85. if n < 1 {
  86. return RUNE_ERROR, 0;
  87. }
  88. s0 := s[0];
  89. x := accept_sizes[s0];
  90. if x >= 0xF0 {
  91. mask := rune(x) << 31 >> 31; // NOTE(bill): Create 0x0000 or 0xffff.
  92. return rune(s[0])&~mask | RUNE_ERROR&mask, 1;
  93. }
  94. sz := x & 7;
  95. accept := accept_ranges[x>>4];
  96. if n < int(sz) {
  97. return RUNE_ERROR, 1;
  98. }
  99. b1 := s[1];
  100. if b1 < accept.lo || accept.hi < b1 {
  101. return RUNE_ERROR, 1;
  102. }
  103. if sz == 2 {
  104. return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2;
  105. }
  106. b2 := s[2];
  107. if b2 < LOCB || HICB < b2 {
  108. return RUNE_ERROR, 1;
  109. }
  110. if sz == 3 {
  111. return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3;
  112. }
  113. b3 := s[3];
  114. if b3 < LOCB || HICB < b3 {
  115. return RUNE_ERROR, 1;
  116. }
  117. return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4;
  118. }
  119. decode_last_rune_from_string :: inline proc(s: string) -> (rune, int) do return decode_last_rune(cast([]u8)s);
  120. decode_last_rune :: proc(s: []u8) -> (rune, int) {
  121. r: rune;
  122. size: int;
  123. start, end, limit: int;
  124. end = len(s);
  125. if end == 0 {
  126. return RUNE_ERROR, 0;
  127. }
  128. start = end-1;
  129. r = rune(s[start]);
  130. if r < RUNE_SELF {
  131. return r, 1;
  132. }
  133. limit = max(end - UTF_MAX, 0);
  134. for start-=1; start >= limit; start-=1 {
  135. if rune_start(s[start]) do break;
  136. }
  137. start = max(start, 0);
  138. r, size = decode_rune(s[start:end]);
  139. if start+size != end {
  140. return RUNE_ERROR, 1;
  141. }
  142. return r, size;
  143. }
  144. valid_rune :: proc(r: rune) -> bool {
  145. if r < 0 {
  146. return false;
  147. } else if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
  148. return false;
  149. } else if r > MAX_RUNE {
  150. return false;
  151. }
  152. return true;
  153. }
  154. valid_string :: proc(s: string) -> bool {
  155. n := len(s);
  156. for i := 0; i < n; {
  157. si := s[i];
  158. if si < RUNE_SELF { // ascii
  159. i += 1;
  160. continue;
  161. }
  162. x := accept_sizes[si];
  163. if x == 0xf1 {
  164. return false;
  165. }
  166. size := int(x & 7);
  167. if i+size > n {
  168. return false;
  169. }
  170. ar := accept_ranges[x>>4];
  171. if b := s[i+1]; b < ar.lo || ar.hi < b {
  172. return false;
  173. } else if size == 2 {
  174. // Okay
  175. } else if b := s[i+2]; b < 0x80 || 0xbf < b {
  176. return false;
  177. } else if size == 3 {
  178. // Okay
  179. } else if b := s[i+3]; b < 0x80 || 0xbf < b {
  180. return false;
  181. }
  182. i += size;
  183. }
  184. return true;
  185. }
  186. rune_start :: inline proc(b: u8) -> bool do return b&0xc0 != 0x80;
  187. rune_count_from_string :: inline proc(s: string) -> int do return rune_count(cast([]u8)s);
  188. rune_count :: proc(s: []u8) -> int {
  189. count := 0;
  190. n := len(s);
  191. for i := 0; i < n; {
  192. defer count += 1;
  193. si := s[i];
  194. if si < RUNE_SELF { // ascii
  195. i += 1;
  196. continue;
  197. }
  198. x := accept_sizes[si];
  199. if x == 0xf1 {
  200. i += 1;
  201. continue;
  202. }
  203. size := int(x & 7);
  204. if i+size > n {
  205. i += 1;
  206. continue;
  207. }
  208. ar := accept_ranges[x>>4];
  209. if b := s[i+1]; b < ar.lo || ar.hi < b {
  210. size = 1;
  211. } else if size == 2 {
  212. // Okay
  213. } else if b := s[i+2]; b < 0x80 || 0xbf < b {
  214. size = 1;
  215. } else if size == 3 {
  216. // Okay
  217. } else if b := s[i+3]; b < 0x80 || 0xbf < b {
  218. size = 1;
  219. }
  220. i += size;
  221. }
  222. return count;
  223. }
  224. rune_size :: proc(r: rune) -> int {
  225. switch {
  226. case r < 0: return -1;
  227. case r <= 1<<7 - 1: return 1;
  228. case r <= 1<<11 - 1: return 2;
  229. case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1;
  230. case r <= 1<<16 - 1: return 3;
  231. case r <= MAX_RUNE: return 4;
  232. }
  233. return -1;
  234. }