utf8.odin 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  // UTF-8 constants shared by the encoding and decoding procedures.
  const (
      // Special rune values and limits.
      RUNE_ERROR = '\ufffd';     // Returned by the decoders for malformed input.
      RUNE_SELF  = 0x80;         // Bytes below this value are handled as single-byte (ascii) runes.
      RUNE_BOM   = 0xfeff;
      RUNE_EOF   = ~rune(0);
      MAX_RUNE   = '\U0010ffff'; // Largest rune accepted by valid_rune and rune_size.
      UTF_MAX    = 4;            // Maximum number of bytes in one encoded rune.

      // Surrogate range; runes inside it are rejected by valid_rune and rune_size.
      SURROGATE_MIN = 0xd800;
      SURROGATE_MAX = 0xdfff;

      // Bit patterns for the high bits of UTF-8 bytes.
      T1 = 0b0000_0000;
      TX = 0b1000_0000;
      T2 = 0b1100_0000;
      T3 = 0b1110_0000;
      T4 = 0b1111_0000;
      T5 = 0b1111_1000;

      // Masks selecting the payload bits of a continuation byte (MASKX)
      // and of the leading byte of a 2-, 3- and 4-byte sequence.
      MASKX = 0b0011_1111;
      MASK2 = 0b0001_1111;
      MASK3 = 0b0000_1111;
      MASK4 = 0b0000_0111;

      // Largest rune that fits in a 1-, 2- and 3-byte encoding.
      RUNE1_MAX = 1<<7 - 1;
      RUNE2_MAX = 1<<11 - 1;
      RUNE3_MAX = 1<<16 - 1;

      // The default lowest and highest continuation byte.
      LOCB = 0b1000_0000;
      HICB = 0b1011_1111;
  )
  // AcceptRange is the inclusive range [lo, hi] allowed for the second byte
  // of a multi-byte sequence.
  type AcceptRange struct {
      lo, hi: u8
  }

  let (
      // Second-byte ranges, indexed by the high nibble of an accept_sizes entry.
      accept_ranges = [5]AcceptRange{
          {0x80, 0xbf},
          {0xa0, 0xbf},
          {0x80, 0x9f},
          {0x90, 0xbf},
          {0x80, 0x8f},
      };

      // Per-leading-byte information, indexed by the first byte of a sequence.
      //   - Entries >= 0xf0 mark bytes that are consumed on their own:
      //     0xf0 for ascii bytes, 0xf1 for bytes that cannot start a sequence.
      //   - Otherwise the low 3 bits hold the sequence length in bytes and
      //     the high nibble is the index into accept_ranges.
      accept_sizes = [256]u8{
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6f
          0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7f
          0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8f
          0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9f
          0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0-0xaf
          0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0-0xbf
          0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0-0xcf
          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0-0xdf
          0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0-0xef
          0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0-0xff
      };
  )
  // encode_rune encodes the rune `r` as UTF-8.
  //
  // Returns a 4-byte buffer and the number of leading bytes of that buffer
  // that hold the encoding (1 to 4); bytes past that count are not written
  // by this procedure.
  //
  // Runes that cannot be encoded (negative, above 0x10ffff, or in the
  // surrogate range 0xd800-0xdfff) are replaced by 0xfffd, which is always
  // emitted as the 3-byte sequence EF BF BD.
  proc encode_rune(r: rune) -> ([4]u8, int) {
      const mask: u8 = 0x3f; // Payload bits of a continuation byte.
      var buf: [4]u8;
      // Work on the unsigned value so that negative runes compare as
      // "too large" and are caught by the range check below.
      var i = u32(r);

      if i <= 1<<7-1 {
          buf[0] = u8(i);
          return buf, 1;
      }
      if i <= 1<<11-1 {
          buf[0] = 0xc0 | u8(i>>6);
          buf[1] = 0x80 | u8(i) & mask;
          return buf, 2;
      }

      // Invalid or surrogate range: substitute the replacement character.
      // The substitution must update the value that is tested and encoded
      // below; otherwise an out-of-range input would fall through to the
      // 4-byte branch and produce an overlong (invalid) sequence.
      if i > 0x0010ffff ||
         (0xd800 <= i && i <= 0xdfff) {
          i = 0xfffd;
      }

      if i <= 1<<16-1 {
          buf[0] = 0xe0 | u8(i>>12);
          buf[1] = 0x80 | u8(i>>6) & mask;
          buf[2] = 0x80 | u8(i) & mask;
          return buf, 3;
      }

      buf[0] = 0xf0 | u8(i>>18);
      buf[1] = 0x80 | u8(i>>12) & mask;
      buf[2] = 0x80 | u8(i>>6) & mask;
      buf[3] = 0x80 | u8(i) & mask;
      return buf, 4;
  }
  // String overload of decode_rune: reinterprets `s` as a byte slice and
  // forwards to the []u8 overload, returning its results unchanged.
  proc decode_rune(s: string) -> (rune, int) #inline {
      return decode_rune([]u8(s));
  }
  // decode_rune decodes the first UTF-8 sequence in `s`.
  //
  // Returns the decoded rune and the number of bytes it occupies.
  //   - Empty input yields (RUNE_ERROR, 0).
  //   - A truncated sequence, a byte that cannot start a sequence, or a
  //     continuation byte outside its permitted range yields (RUNE_ERROR, 1).
  proc decode_rune(s: []u8) -> (rune, int) {
      var count = len(s);
      if count < 1 {
          return RUNE_ERROR, 0;
      }

      var lead = s[0];
      var info = accept_sizes[lead];
      if info >= 0xF0 {
          // `info` is 0xf0 (ascii) or 0xf1 (invalid). Shifting its low bit up
          // to the sign bit and back down yields all-zero or all-one bits,
          // which selects either the byte itself or RUNE_ERROR without a branch.
          var mask = rune(info) << 31 >> 31;
          return rune(s[0])&~mask | RUNE_ERROR&mask, 1;
      }

      // Low 3 bits: sequence length. High nibble: which range the second
      // byte has to fall in.
      var width = info & 7;
      var bounds = accept_ranges[info>>4];
      if count < int(width) {
          return RUNE_ERROR, 1;
      }

      var c1 = s[1];
      if c1 < bounds.lo || bounds.hi < c1 {
          return RUNE_ERROR, 1;
      }
      if width == 2 {
          return rune(lead&MASK2)<<6 | rune(c1&MASKX), 2;
      }

      var c2 = s[2];
      if c2 < LOCB || HICB < c2 {
          return RUNE_ERROR, 1;
      }
      if width == 3 {
          return rune(lead&MASK3)<<12 | rune(c1&MASKX)<<6 | rune(c2&MASKX), 3;
      }

      var c3 = s[3];
      if c3 < LOCB || HICB < c3 {
          return RUNE_ERROR, 1;
      }
      return rune(lead&MASK4)<<18 | rune(c1&MASKX)<<12 | rune(c2&MASKX)<<6 | rune(c3&MASKX), 4;
  }
  // String overload of decode_last_rune: reinterprets `s` as a byte slice
  // and forwards to the []u8 overload, returning its results unchanged.
  proc decode_last_rune(s: string) -> (rune, int) #inline {
      return decode_last_rune([]u8(s));
  }
  // decode_last_rune decodes the UTF-8 sequence that ends at the end of `s`.
  //
  // Returns the decoded rune and the number of bytes it occupies.
  //   - Empty input yields (RUNE_ERROR, 0).
  //   - If the bytes found by scanning backwards do not decode to a sequence
  //     that ends exactly at the end of `s`, the result is (RUNE_ERROR, 1).
  proc decode_last_rune(s: []u8) -> (rune, int) {
      var end = len(s);
      if end == 0 {
          return RUNE_ERROR, 0;
      }

      // Fast path: the final byte is a single-byte rune.
      var start = end - 1;
      var r = rune(s[start]);
      if r < RUNE_SELF {
          return r, 1;
      }

      // Walk backwards over at most UTF_MAX bytes until a byte that can
      // start a sequence is found.
      var limit = max(end - UTF_MAX, 0);
      start--;
      for start >= limit && !rune_start(s[start]) {
          start--;
      }
      // The scan may have stepped one position below the slice start.
      start = max(start, 0);

      var size: int;
      r, size = decode_rune(s[start..<end]);
      if start+size != end {
          return RUNE_ERROR, 1;
      }
      return r, size;
  }
  // valid_rune reports whether `r` is non-negative, not greater than
  // MAX_RUNE, and outside the surrogate range.
  proc valid_rune(r: rune) -> bool {
      if r < 0 || r > MAX_RUNE {
          return false;
      }
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return false;
      }
      return true;
  }
  // valid_string reports whether `s` consists entirely of well-formed
  // UTF-8 sequences. An empty string is valid.
  proc valid_string(s: string) -> bool {
      var total = len(s);
      for var pos = 0; pos < total; {
          var lead = s[pos];
          if lead < RUNE_SELF { // ascii
              pos++;
              continue;
          }

          var info = accept_sizes[lead];
          if info == 0xf1 {
              // This byte cannot start a sequence.
              return false;
          }

          var width = int(info & 7);
          if pos+width > total {
              // The sequence is cut off by the end of the string.
              return false;
          }

          // The second byte has a range that depends on the leading byte;
          // any further bytes must lie in 0x80-0xbf.
          var bounds = accept_ranges[info>>4];
          var c1 = s[pos+1];
          if c1 < bounds.lo || bounds.hi < c1 {
              return false;
          }
          if width != 2 {
              var c2 = s[pos+2];
              if c2 < 0x80 || 0xbf < c2 {
                  return false;
              }
              if width != 3 {
                  var c3 = s[pos+3];
                  if c3 < 0x80 || 0xbf < c3 {
                      return false;
                  }
              }
          }

          pos += width;
      }
      return true;
  }
  // rune_start reports whether `b` could be the first byte of an encoded
  // rune, i.e. its top two bits are not `10` (the continuation-byte pattern).
  proc rune_start(b: u8) -> bool #inline {
      return (b & 0xc0) != 0x80;
  }
  // String overload of rune_count: reinterprets `s` as a byte slice and
  // forwards to the []u8 overload.
  proc rune_count(s: string) -> int #inline {
      return rune_count([]u8(s));
  }
  // rune_count returns the number of runes in `s`. Every byte that is not
  // part of a well-formed sequence (invalid starter, truncated sequence, or
  // bad continuation byte) is counted as one rune of width 1.
  proc rune_count(s: []u8) -> int {
      var total = len(s);
      var runes = 0;
      for var pos = 0; pos < total; {
          // Every iteration consumes exactly one rune, valid or not.
          runes++;

          var lead = s[pos];
          if lead < RUNE_SELF { // ascii
              pos++;
              continue;
          }

          var info = accept_sizes[lead];
          if info == 0xf1 {
              // This byte cannot start a sequence.
              pos++;
              continue;
          }

          var width = int(info & 7);
          if pos+width > total {
              // The sequence is cut off by the end of the input.
              pos++;
              continue;
          }

          // Advance by the full width only if every continuation byte is in
          // range; otherwise step over the leading byte alone.
          var advance = width;
          var bounds = accept_ranges[info>>4];
          var c1 = s[pos+1];
          if c1 < bounds.lo || bounds.hi < c1 {
              advance = 1;
          } else if width != 2 {
              var c2 = s[pos+2];
              if c2 < 0x80 || 0xbf < c2 {
                  advance = 1;
              } else if width != 3 {
                  var c3 = s[pos+3];
                  if c3 < 0x80 || 0xbf < c3 {
                      advance = 1;
                  }
              }
          }
          pos += advance;
      }
      return runes;
  }
  // rune_size returns the number of bytes needed to encode `r` as UTF-8
  // (1 to 4), or -1 if `r` is negative, in the surrogate range, or greater
  // than MAX_RUNE.
  proc rune_size(r: rune) -> int {
      if r < 0 {
          return -1;
      }
      if r <= 1<<7 - 1 {
          return 1;
      }
      if r <= 1<<11 - 1 {
          return 2;
      }
      // Surrogates sit inside the 3-byte range, so reject them before it.
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1;
      }
      if r <= 1<<16 - 1 {
          return 3;
      }
      if r <= MAX_RUNE {
          return 4;
      }
      return -1;
  }