utf8.odin 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  // Special rune values.
  RUNE_ERROR :: '\ufffd';      // U+FFFD; returned by the decoders in this file for malformed input.
  RUNE_SELF  :: 0x80;          // Bytes below this value are treated as single-byte (ASCII) runes.
  RUNE_BOM   :: 0xfeff;        // U+FEFF.
  RUNE_EOF   :: ~cast(rune)0;  // A rune with every bit set.

  // Limits.
  MAX_RUNE :: '\U0010ffff';    // Largest rune accepted by valid_rune and rune_size.
  UTF_MAX  :: 4;               // Width of the backwards search window used by decode_last_rune.

  // Inclusive surrogate range; valid_rune and rune_size reject runes inside it.
  SURROGATE_MIN :: 0xd800;
  SURROGATE_MAX :: 0xdfff;

  // Tag bit patterns (not referenced by the procedures visible in this part of the file).
  T1 :: 0b0000_0000;
  TX :: 0b1000_0000;
  T2 :: 0b1100_0000;
  T3 :: 0b1110_0000;
  T4 :: 0b1111_0000;
  T5 :: 0b1111_1000;

  // Payload masks: MASKX is applied to continuation bytes, MASK2/3/4 to the
  // first byte of 2-, 3- and 4-byte sequences in decode_rune.
  MASKX :: 0b0011_1111;
  MASK2 :: 0b0001_1111;
  MASK3 :: 0b0000_1111;
  MASK4 :: 0b0000_0111;

  // Largest values that fit in a 1-, 2- and 3-byte encoding.
  RUNE1_MAX :: (1<<7)  - 1;
  RUNE2_MAX :: (1<<11) - 1;
  RUNE3_MAX :: (1<<16) - 1;

  // The default lowest and highest continuation byte.
  LOCB :: 0b1000_0000;
  HICB :: 0b1011_1111;
  // Inclusive bounds that the second byte of a multi-byte sequence must satisfy.
  Accept_Range :: struct { lo, hi: u8 }

  // Second-byte bounds, indexed by the high nibble of an accept_sizes entry.
  immutable accept_ranges := [5]Accept_Range{
      {0x80, 0xbf}, // index 0
      {0xa0, 0xbf}, // index 1
      {0x80, 0x9f}, // index 2
      {0x90, 0xbf}, // index 3
      {0x80, 0x8f}, // index 4
  };

  // Per-first-byte information, indexed by the first byte of a sequence.
  //   0xf0  : single-byte rune (0x00-0x7f)
  //   0xf1  : byte that cannot start a sequence
  //   other : low three bits = sequence length in bytes,
  //           high nibble    = index into accept_ranges for the second byte
  immutable accept_sizes := [256]byte{
      // 0x00-0x7f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
      // 0x80-0xbf
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1,
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1,
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1,
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1,
      // 0xc0-0xcf
      0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
      // 0xd0-0xdf
      0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
      // 0xe0-0xef
      0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03,
      // 0xf0-0xff
      0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1,
  };
  // Encodes the rune `r` as UTF-8.
  //
  // Returns a 4-byte buffer together with the number of leading bytes of that
  // buffer that hold the encoding (1 to 4); the remaining bytes are zero.
  //
  // Values that cannot be encoded -- anything above U+10FFFF (which includes
  // negative runes, since they become large values after the u32 cast) and the
  // surrogate range U+D800..U+DFFF -- are encoded as U+FFFD (EF BF BD).
  encode_rune :: proc(r: rune) -> ([4]byte, int) {
      buf: [4]byte;
      i := cast(u32)r;
      mask: byte : 0x3f;

      // 1 byte: 0xxxxxxx
      if i <= 1<<7-1 {
          buf[0] = cast(byte)r;
          return buf, 1;
      }

      // 2 bytes: 110xxxxx 10xxxxxx
      if i <= 1<<11-1 {
          buf[0] = 0xc0 | cast(byte)(r>>6);
          buf[1] = 0x80 | cast(byte)r & mask;
          return buf, 2;
      }

      // Invalid or surrogate range: substitute U+FFFD.
      // `i` has to be updated together with `r`. If only `r` were replaced, a
      // value above 0x10ffff would still fail the 16-bit test below and U+FFFD
      // would be written as the overlong 4-byte sequence F0 8F BF BD, which is
      // not valid UTF-8.
      if i > 0x0010ffff ||
         (0xd800 <= i && i <= 0xdfff) {
          r = 0xfffd;
          i = 0xfffd;
      }

      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      if i <= 1<<16-1 {
          buf[0] = 0xe0 | cast(byte)(r>>12);
          buf[1] = 0x80 | cast(byte)(r>>6) & mask;
          buf[2] = 0x80 | cast(byte)r & mask;
          return buf, 3;
      }

      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      buf[0] = 0xf0 | cast(byte)(r>>18);
      buf[1] = 0x80 | cast(byte)(r>>12) & mask;
      buf[2] = 0x80 | cast(byte)(r>>6) & mask;
      buf[3] = 0x80 | cast(byte)r & mask;
      return buf, 4;
  }
  // String overload: views the string as bytes and defers to the []byte overload.
  decode_rune :: proc(s: string) -> (rune, int) #inline {
      bytes := cast([]byte)s;
      return decode_rune(bytes);
  }
  // Decodes the first UTF-8 sequence in `s`.
  //
  // Returns the rune and the number of bytes it occupies.
  //   - empty input                      -> (RUNE_ERROR, 0)
  //   - malformed or truncated sequence  -> (RUNE_ERROR, 1)
  decode_rune :: proc(s: []byte) -> (rune, int) {
      total := s.count;
      if total < 1 {
          return RUNE_ERROR, 0;
      }

      lead := s[0];
      info := accept_sizes[lead];

      // Entries >= 0xF0 describe single-byte cases: 0xf0 is a plain one-byte
      // rune, 0xf1 (low bit set) is a byte that cannot start a sequence.
      if info >= 0xF0 {
          if info&1 != 0 {
              return RUNE_ERROR, 1;
          }
          return cast(rune)lead, 1;
      }

      width := cast(int)(info & 7);
      if total < width {
          // Not enough bytes left for the announced sequence length.
          return RUNE_ERROR, 1;
      }

      // The second byte has a lead-dependent acceptable range.
      bounds := accept_ranges[info>>4];
      second := s[1];
      if !(bounds.lo <= second && second <= bounds.hi) {
          return RUNE_ERROR, 1;
      }
      if width == 2 {
          return cast(rune)(lead&MASK2)<<6 | cast(rune)(second&MASKX), 2;
      }

      third := s[2];
      if !(LOCB <= third && third <= HICB) {
          return RUNE_ERROR, 1;
      }
      if width == 3 {
          return cast(rune)(lead&MASK3)<<12 | cast(rune)(second&MASKX)<<6 | cast(rune)(third&MASKX), 3;
      }

      fourth := s[3];
      if !(LOCB <= fourth && fourth <= HICB) {
          return RUNE_ERROR, 1;
      }
      return cast(rune)(lead&MASK4)<<18 | cast(rune)(second&MASKX)<<12 | cast(rune)(third&MASKX)<<6 | cast(rune)(fourth&MASKX), 4;
  }
  // String overload: views the string as bytes and defers to the []byte overload.
  decode_last_rune :: proc(s: string) -> (rune, int) #inline {
      bytes := cast([]byte)s;
      return decode_last_rune(bytes);
  }
  // Decodes the last UTF-8 sequence in `s`.
  //
  // Returns the rune and the number of bytes it occupies.
  //   - empty input                               -> (RUNE_ERROR, 0)
  //   - trailing bytes that do not form one rune  -> (RUNE_ERROR, 1)
  decode_last_rune :: proc(s: []byte) -> (rune, int) {
      end := s.count;
      if end == 0 {
          return RUNE_ERROR, 0;
      }

      // Fast path: the final byte is a single-byte rune.
      last := cast(rune)s[end-1];
      if last < RUNE_SELF {
          return last, 1;
      }

      // Walk backwards, at most UTF_MAX bytes from the end, until a byte that
      // can start a rune is found.
      limit := max(end - UTF_MAX, 0);
      start := end - 2;
      for start >= limit && !rune_start(s[start]) {
          start--;
      }
      start = max(start, 0);

      decoded: rune;
      width: int;
      decoded, width = decode_rune(s[start..end]);

      // The decoded sequence must end exactly at the end of the input.
      if start+width != end {
          return RUNE_ERROR, 1;
      }
      return decoded, width;
  }
  // Reports whether `r` lies in 0..MAX_RUNE and is outside the surrogate range
  // SURROGATE_MIN..SURROGATE_MAX.
  valid_rune :: proc(r: rune) -> bool {
      in_range     := 0 <= r && r <= MAX_RUNE;
      is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX;
      return in_range && !is_surrogate;
  }
  // Reports whether `s` consists entirely of well-formed UTF-8 sequences.
  // An empty string is valid.
  valid_string :: proc(s: string) -> bool {
      total := s.count;
      i := 0;
      for i < total {
          lead := s[i];
          if lead < RUNE_SELF {
              // Single-byte rune.
              i++;
              continue;
          }

          info := accept_sizes[lead];
          if info == 0xf1 {
              // This byte cannot start a sequence.
              return false;
          }

          width := cast(int)(info & 7);
          if i+width > total {
              // Sequence is cut off by the end of the string.
              return false;
          }

          // The second byte has a lead-dependent acceptable range.
          bounds := accept_ranges[info>>4];
          second := s[i+1];
          if second < bounds.lo || bounds.hi < second {
              return false;
          }

          // Any further bytes must be plain continuation bytes (0x80-0xbf).
          if width >= 3 {
              third := s[i+2];
              if third < 0x80 || 0xbf < third {
                  return false;
              }
          }
          if width >= 4 {
              fourth := s[i+3];
              if fourth < 0x80 || 0xbf < fourth {
                  return false;
              }
          }

          i += width;
      }
      return true;
  }
  // Reports whether `b` could be the first byte of an encoded rune, i.e. it is
  // not a continuation byte of the form 10xxxxxx (0x80-0xbf).
  rune_start :: proc(b: byte) -> bool #inline {
      return b < 0x80 || b >= 0xc0;
  }
  // String overload: views the string as bytes and defers to the []byte overload.
  rune_count :: proc(s: string) -> int #inline {
      bytes := cast([]byte)s;
      return rune_count(bytes);
  }
  // Counts the runes in `s`.
  // Every byte that is not part of a well-formed sequence counts as one rune
  // of width 1.
  rune_count :: proc(s: []byte) -> int {
      total := s.count;
      count := 0;
      i := 0;
      for i < total {
          // Each iteration consumes exactly one rune (valid or not).
          count++;

          // Advance by one byte unless a complete, well-formed multi-byte
          // sequence starts here.
          step := 1;

          lead := s[i];
          if lead >= RUNE_SELF {
              info := accept_sizes[lead];
              width := cast(int)(info & 7);

              // 0xf1 marks a byte that cannot start a sequence; the sequence
              // must also fit in the remaining input.
              if info != 0xf1 && i+width <= total {
                  bounds := accept_ranges[info>>4];
                  second := s[i+1];
                  ok := bounds.lo <= second && second <= bounds.hi;

                  if ok && width >= 3 {
                      third := s[i+2];
                      ok = 0x80 <= third && third <= 0xbf;
                  }
                  if ok && width >= 4 {
                      fourth := s[i+3];
                      ok = 0x80 <= fourth && fourth <= 0xbf;
                  }

                  if ok {
                      step = width;
                  }
              }
          }

          i += step;
      }
      return count;
  }
  // Returns the number of bytes needed to encode `r` as UTF-8, or -1 when `r`
  // is negative, a surrogate, or greater than MAX_RUNE.
  rune_size :: proc(r: rune) -> int {
      if r < 0 {
          return -1;
      }
      if r <= RUNE1_MAX {
          return 1;
      }
      if r <= RUNE2_MAX {
          return 2;
      }
      // Surrogates sit inside the 3-byte range, so they are rejected first.
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1;
      }
      if r <= RUNE3_MAX {
          return 3;
      }
      if r <= MAX_RUNE {
          return 4;
      }
      return -1;
  }