utf8.odin 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. package utf8
  // Special rune values and overall limits used by this package.
  RUNE_ERROR :: '\ufffd';     // returned by the decoders when input is malformed
  RUNE_SELF  :: 0x80;         // bytes below this value are treated as single-byte runes
  RUNE_BOM   :: 0xfeff;
  RUNE_EOF   :: ~rune(0);     // all bits set
  MAX_RUNE   :: '\U0010ffff'; // largest rune accepted by valid_rune / rune_size
  UTF_MAX    :: 4;            // longest encoding, in bytes

  // Inclusive range of values rejected as surrogates.
  SURROGATE_MIN :: 0xd800;
  SURROGATE_MAX :: 0xdfff;

  // Tag bit patterns.
  T1 :: 0b0000_0000;
  TX :: 0b1000_0000;
  T2 :: 0b1100_0000;
  T3 :: 0b1110_0000;
  T4 :: 0b1111_0000;
  T5 :: 0b1111_1000;

  // Payload masks: MASKX for continuation bytes, MASK2/3/4 for the lead byte
  // of a 2-, 3- and 4-byte sequence respectively (see decode_rune).
  MASKX :: 0b0011_1111;
  MASK2 :: 0b0001_1111;
  MASK3 :: 0b0000_1111;
  MASK4 :: 0b0000_0111;

  // Largest rune value that fits in a 1-, 2- and 3-byte encoding.
  RUNE1_MAX :: (1<<7)  - 1;
  RUNE2_MAX :: (1<<11) - 1;
  RUNE3_MAX :: (1<<16) - 1;

  // The default lowest and highest continuation byte.
  LOCB :: 0b1000_0000;
  HICB :: 0b1011_1111;
  // Inclusive bounds that the second byte of a multi-byte sequence must satisfy.
  Accept_Range :: struct {
      lo: u8,
      hi: u8,
  };

  // Second-byte ranges, selected by the high nibble of an accept_sizes entry.
  accept_ranges := [5]Accept_Range{
      {lo = 0x80, hi = 0xbf},
      {lo = 0xa0, hi = 0xbf},
      {lo = 0x80, hi = 0x9f},
      {lo = 0x90, hi = 0xbf},
      {lo = 0x80, hi = 0x8f},
  };

  // Per-lead-byte information.
  //   0xf0        - single-byte rune (the byte decodes to itself)
  //   0xf1        - byte that cannot start a sequence
  //   otherwise   - low 3 bits: sequence length in bytes,
  //                 high nibble: index into accept_ranges
  accept_sizes := [256]u8{
      0x00..0x7f = 0xf0,
      0x80..0xc1 = 0xf1,
      0xc2..0xdf = 0x02,
      0xe0       = 0x13,
      0xe1..0xec = 0x03,
      0xed       = 0x23,
      0xee..0xef = 0x03,
      0xf0       = 0x34,
      0xf1..0xf3 = 0x04,
      0xf4       = 0x44,
      0xf5..0xff = 0xf1,
  };
  // Encodes the rune `c` as UTF-8.
  //
  // Returns a 4-byte buffer and the number of leading bytes in it that hold
  // the encoding (1 to 4); unused trailing bytes are left zero.
  //
  // Values in the surrogate range 0xd800..0xdfff and values above 0x10ffff
  // (which includes negative runes, since the value is examined as a u32) are
  // encoded as the replacement character 0xfffd, i.e. the bytes ef bf bd.
  encode_rune :: proc(c: rune) -> ([4]u8, int) {
      r := c;

      buf: [4]u8;
      i := u32(r);
      mask :: u8(0x3f);

      if i <= 1<<7-1 {
          buf[0] = u8(r);
          return buf, 1;
      }
      if i <= 1<<11-1 {
          buf[0] = 0xc0 | u8(r>>6);
          buf[1] = 0x80 | u8(r) & mask;
          return buf, 2;
      }

      // Invalid or Surrogate range
      if i > 0x0010ffff ||
         (0xd800 <= i && i <= 0xdfff) {
          r = 0xfffd;
          // Keep the width selector in step with the substituted rune;
          // otherwise an out-of-range input would take the 4-byte path and
          // produce an overlong (invalid) encoding of 0xfffd.
          i = 0xfffd;
      }

      if i <= 1<<16-1 {
          buf[0] = 0xe0 | u8(r>>12);
          buf[1] = 0x80 | u8(r>>6) & mask;
          buf[2] = 0x80 | u8(r)    & mask;
          return buf, 3;
      }

      buf[0] = 0xf0 | u8(r>>18);
      buf[1] = 0x80 | u8(r>>12) & mask;
      buf[2] = 0x80 | u8(r>>6)  & mask;
      buf[3] = 0x80 | u8(r)     & mask;
      return buf, 4;
  }
  // String variant of decode_rune: views the bytes of `s` as a []u8 and
  // decodes the first rune, returning the rune and its width in bytes.
  decode_rune_in_string :: inline proc(s: string) -> (rune, int) {
      return decode_rune(transmute([]u8)s);
  }
  // Decodes the first rune in `s`.
  //
  // Returns the rune and the number of bytes it occupies.
  //   - empty input            -> (RUNE_ERROR, 0)
  //   - malformed or truncated -> (RUNE_ERROR, 1)
  decode_rune :: proc(s: []u8) -> (rune, int) {
      length := len(s);
      if length < 1 {
          return RUNE_ERROR, 0;
      }

      lead := s[0];
      info := accept_sizes[lead];
      if info >= 0xF0 {
          // The low bit of the table entry tells the two special cases apart:
          // clear (0xf0) means the byte decodes to itself, set (0xf1) means
          // the byte cannot start a sequence.
          if info&1 == 0 {
              return rune(lead), 1;
          }
          return RUNE_ERROR, 1;
      }

      width  := info & 7;
      bounds := accept_ranges[info>>4];
      if length < int(width) {
          return RUNE_ERROR, 1;
      }

      // The second byte has a lead-specific allowed range.
      second := s[1];
      if second < bounds.lo || second > bounds.hi {
          return RUNE_ERROR, 1;
      }
      if width == 2 {
          return rune(lead&MASK2)<<6 | rune(second&MASKX), 2;
      }

      // Remaining bytes must fall in the default continuation range.
      third := s[2];
      if third < LOCB || third > HICB {
          return RUNE_ERROR, 1;
      }
      if width == 3 {
          return rune(lead&MASK3)<<12 | rune(second&MASKX)<<6 | rune(third&MASKX), 3;
      }

      fourth := s[3];
      if fourth < LOCB || fourth > HICB {
          return RUNE_ERROR, 1;
      }
      return rune(lead&MASK4)<<18 | rune(second&MASKX)<<12 | rune(third&MASKX)<<6 | rune(fourth&MASKX), 4;
  }
  // Allocates (with `allocator`) a slice holding every rune of `s`, in order.
  // The slice length is the rune count reported by rune_count_in_string.
  string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) {
      runes = make([]rune, rune_count_in_string(s), allocator);

      next := 0;
      for ch in s {
          runes[next] = ch;
          next += 1;
      }
      return runes;
  }
  // Encodes `runes` into a newly allocated (with `allocator`) string.
  // Works in two passes: the first sums the encoded widths so the buffer can
  // be allocated once, the second writes the encoded bytes.
  runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string {
      total := 0;
      for idx := 0; idx < len(runes); idx += 1 {
          _, width := encode_rune(runes[idx]);
          total += width;
      }

      out := make([]byte, total, allocator);

      cursor := 0;
      for idx := 0; idx < len(runes); idx += 1 {
          encoded, width := encode_rune(runes[idx]);
          copy(out[cursor:], encoded[:width]);
          cursor += width;
      }
      return string(out);
  }
  // String variant of decode_last_rune: views the bytes of `s` as a []u8 and
  // decodes the final rune, returning the rune and its width in bytes.
  decode_last_rune_in_string :: inline proc(s: string) -> (rune, int) {
      return decode_last_rune(transmute([]u8)s);
  }
  // Decodes the last rune in `s`.
  //
  // Returns the rune and the number of bytes it occupies.
  //   - empty input                         -> (RUNE_ERROR, 0)
  //   - trailing bytes not a complete rune  -> (RUNE_ERROR, 1)
  decode_last_rune :: proc(s: []u8) -> (rune, int) {
      total := len(s);
      if total == 0 {
          return RUNE_ERROR, 0;
      }

      // Fast path: a final byte below RUNE_SELF is a rune by itself.
      last := rune(s[total-1]);
      if last < RUNE_SELF {
          return last, 1;
      }

      // Walk backwards, at most UTF_MAX bytes from the end, looking for a
      // byte that can start a rune.
      lowest := max(total - UTF_MAX, 0);
      begin  := total - 2;
      for begin >= lowest {
          if rune_start(s[begin]) {
              break;
          }
          begin -= 1;
      }
      begin = max(begin, 0);

      // The decoded rune must end exactly at the end of the input.
      decoded, width := decode_rune(s[begin:total]);
      if begin+width != total {
          return RUNE_ERROR, 1;
      }
      return decoded, width;
  }
  // Returns the rune at rune index `pos` (0-based, counted in runes, not bytes).
  // Returns RUNE_ERROR when `pos` is negative or past the last rune.
  rune_at_pos :: proc(s: string, pos: int) -> rune {
      if pos >= 0 {
          remaining := pos;
          for ch in s {
              if remaining == 0 {
                  return ch;
              }
              remaining -= 1;
          }
      }
      return RUNE_ERROR;
  }
  // Returns the substring of `s` holding the rune at rune index `pos`
  // (0-based, counted in runes, not bytes).
  // Returns "" when `pos` is negative or past the last rune.
  rune_string_at_pos :: proc(s: string, pos: int) -> string {
      if pos < 0 {
          return "";
      }

      i := 0;
      for _, offset in s {
          if i == pos {
              // Take the width from decoding the bytes at this offset rather
              // than from rune_size of the decoded rune: for malformed input
              // decode_rune reports RUNE_ERROR with a width of 1, whereas
              // rune_size(RUNE_ERROR) is 3, which would slice the wrong bytes
              // or run past the end of the string.
              _, w := decode_rune_in_string(s[offset:]);
              return s[offset:][:w];
          }
          i += 1;
      }
      return "";
  }
  // Decodes and returns the rune that starts at byte offset `byte_index` of `s`.
  // The decoded width is discarded.
  rune_at :: proc(s: string, byte_index: int) -> rune {
      tail := s[byte_index:];
      decoded, _ := decode_rune_in_string(tail);
      return decoded;
  }
  // Returns the byte offset within `s` of the rune at rune index `pos`,
  // counting runes from the optional byte offset `start`.
  // Returns -1 if `pos` is negative or the string ends before that rune.
  rune_offset :: proc(s: string, pos: int, start: int = 0) -> int {
      if pos >= 0 {
          remaining := pos;
          for _, rel in s[start:] {
              if remaining == 0 {
                  // `rel` is relative to the sliced string; translate it back.
                  return start + rel;
              }
              remaining -= 1;
          }
      }
      return -1;
  }
  // Reports whether `r` lies in 0..MAX_RUNE and is not in the surrogate range
  // SURROGATE_MIN..SURROGATE_MAX.
  valid_rune :: proc(r: rune) -> bool {
      in_range     := 0 <= r && r <= MAX_RUNE;
      is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX;
      return in_range && !is_surrogate;
  }
  // Reports whether `s` consists entirely of well-formed UTF-8 sequences.
  // An empty string is valid.
  valid_string :: proc(s: string) -> bool {
      n := len(s);
      for i := 0; i < n; {
          si := s[i];
          if si < RUNE_SELF { // ascii
              i += 1;
              continue;
          }

          x := accept_sizes[si];
          if x == 0xf1 {
              // Byte that cannot start a sequence.
              return false;
          }

          size := int(x & 7);
          if i+size > n {
              // Sequence is cut off by the end of the string.
              return false;
          }

          // The second byte has a lead-specific range; any further bytes must
          // lie in the default continuation range 0x80..0xbf.
          ar := accept_ranges[x>>4];
          if b := s[i+1]; b < ar.lo || ar.hi < b {
              return false;
          } else if size == 2 {
              // Okay
          } else if c := s[i+2]; c < 0x80 || 0xbf < c {
              return false;
          } else if size == 3 {
              // Okay
          } else if d := s[i+3]; d < 0x80 || 0xbf < d {
              // Both bounds are checked on the fourth byte itself.
              return false;
          }
          i += size;
      }
      return true;
  }
  // Reports whether `b` could be the first byte of an encoded rune, i.e. its
  // top two bits are not 10.
  rune_start :: inline proc(b: u8) -> bool {
      return b>>6 != 0b10;
  }
  // String variant of rune_count: views the bytes of `s` as a []u8 and
  // returns the number of runes in it.
  rune_count_in_string :: inline proc(s: string) -> int {
      return rune_count(transmute([]u8)s);
  }
  // Returns the number of runes in `s`.
  // Each byte that is not part of a well-formed sequence counts as one rune
  // of width 1.
  rune_count :: proc(s: []u8) -> int {
      total  := 0;
      length := len(s);

      for pos := 0; pos < length; {
          // Every iteration consumes exactly one rune (valid or not).
          total += 1;

          lead := s[pos];
          if lead < RUNE_SELF { // ascii
              pos += 1;
              continue;
          }

          info := accept_sizes[lead];
          if info == 0xf1 {
              // Byte that cannot start a sequence.
              pos += 1;
              continue;
          }

          width := int(info & 7);
          if pos+width > length {
              // Sequence is cut off by the end of the input.
              pos += 1;
              continue;
          }

          // Second byte: lead-specific range. Third and fourth bytes: the
          // default continuation range 0x80..0xbf.
          bounds := accept_ranges[info>>4];
          second := s[pos+1];
          well_formed := bounds.lo <= second && second <= bounds.hi;
          if well_formed && width >= 3 {
              third := s[pos+2];
              well_formed = 0x80 <= third && third <= 0xbf;
          }
          if well_formed && width == 4 {
              fourth := s[pos+3];
              well_formed = 0x80 <= fourth && fourth <= 0xbf;
          }

          if !well_formed {
              width = 1;
          }
          pos += width;
      }
      return total;
  }
  // Returns the number of bytes needed to encode `r`, or -1 when `r` is
  // negative, in the surrogate range, or greater than MAX_RUNE.
  rune_size :: proc(r: rune) -> int {
      if r < 0 {
          return -1;
      }
      if r <= 1<<7 - 1 {
          return 1;
      }
      if r <= 1<<11 - 1 {
          return 2;
      }
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1;
      }
      if r <= 1<<16 - 1 {
          return 3;
      }
      if r <= MAX_RUNE {
          return 4;
      }
      return -1;
  }