utf8.odin 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  // Rune-level constants used by the UTF-8 procedures in this file.
  RUNE_ERROR    :: '\ufffd';     // U+FFFD; returned by decode_rune when the input is not a valid sequence
  RUNE_SELF     :: 0x80;         // bytes below this value are treated as single-byte (ASCII) runes
  RUNE_BOM      :: 0xfeff;       // U+FEFF
  RUNE_EOF      :: ~(0 as rune); // every bit set
  MAX_RUNE      :: '\U0010ffff'; // largest rune accepted by valid_rune / rune_size
  UTF_MAX       :: 4;            // encode_rune never produces more than this many bytes

  // Inclusive bounds of the surrogate range; runes inside it are rejected
  // by valid_rune and rune_size.
  SURROGATE_MIN :: 0xd800;
  SURROGATE_MAX :: 0xdfff;
  // Accept_Range is an inclusive [lo, hi] byte interval. The decoder uses it
  // to bound the second byte of a multi-byte sequence.
  Accept_Range :: struct {
      lo: u8; // smallest accepted byte value
      hi: u8; // largest accepted byte value
  }
  // accept_ranges lists the permitted intervals for the second byte of a
  // multi-byte sequence. The high nibble of an accept_sizes entry is the
  // index into this table.
  accept_ranges := [5]Accept_Range{
      {0x80, 0xbf}, // index 0
      {0xa0, 0xbf}, // index 1: selected by lead byte 0xe0
      {0x80, 0x9f}, // index 2: selected by lead byte 0xed
      {0x90, 0xbf}, // index 3: selected by lead byte 0xf0
      {0x80, 0x8f}, // index 4: selected by lead byte 0xf4
  };
  // accept_sizes classifies every possible first byte of a sequence.
  //
  //   0xf0        the byte is a complete single-byte (ASCII) rune
  //   0xf1        the byte cannot start a sequence
  //   otherwise   low 3 bits  = total length of the sequence in bytes
  //               high nibble = index into accept_ranges for the second byte
  accept_sizes := [256]byte{
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7f
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8f
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9f
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0-0xaf
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0-0xbf
      0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0-0xcf
      0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0-0xdf
      0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0-0xef
      0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0-0xff
  };
  // encode_rune returns the UTF-8 encoding of r together with the number of
  // bytes of the returned buffer that are in use (1 to 4). Unused trailing
  // bytes are left at zero.
  //
  // A rune that is out of range (above 0x10ffff, or negative once viewed as
  // u32) or that lies in the surrogate range 0xd800-0xdfff is encoded as
  // U+FFFD, which always takes three bytes.
  encode_rune :: proc(r: rune) -> ([4]byte, int) {
      buf: [4]byte;
      // Unsigned view of r so that negative runes compare as "too large".
      i := r as u32;
      mask: byte : 0x3f; // payload bits of a continuation byte

      if i <= 1<<7-1 {
          buf[0] = r as byte;
          return buf, 1;
      }
      if i <= 1<<11-1 {
          buf[0] = 0xc0 | (r>>6) as byte;
          buf[1] = 0x80 | (r) as byte & mask;
          return buf, 2;
      }

      // Invalid or Surrogate range: substitute U+FFFD.
      if i > 0x0010ffff ||
         (0xd800 <= i && i <= 0xdfff) {
          r = 0xfffd;
          // Keep the size selector in step with the substituted rune.
          // Without this an out-of-range input would skip the three-byte
          // branch and emit U+FFFD as an overlong four-byte sequence.
          i = 0xfffd;
      }

      if i <= 1<<16-1 {
          buf[0] = 0xe0 | (r>>12) as byte;
          buf[1] = 0x80 | (r>>6) as byte & mask;
          buf[2] = 0x80 | (r) as byte & mask;
          return buf, 3;
      }

      buf[0] = 0xf0 | (r>>18) as byte;
      buf[1] = 0x80 | (r>>12) as byte & mask;
      buf[2] = 0x80 | (r>>6) as byte & mask;
      buf[3] = 0x80 | (r) as byte & mask;
      return buf, 4;
  }
  // decode_rune decodes the first UTF-8 sequence in s and returns the rune
  // together with the number of bytes consumed.
  //
  //   - empty input           -> (RUNE_ERROR, 0)
  //   - invalid or truncated  -> (RUNE_ERROR, 1)
  //   - otherwise             -> (rune, 1..4)
  decode_rune :: proc(s: string) -> (rune, int) {
      n := s.count;
      if n < 1 {
          return RUNE_ERROR, 0;
      }

      b0 := s[0];
      x := accept_sizes[b0];
      if x >= 0xf0 {
          // x is 0xf0 (ASCII) or 0xf1 (invalid lead byte). Moving its low
          // bit to the top and shifting back yields all zeros or all ones
          // (assumes rune is a signed 32-bit integer, so the right shift
          // sign-extends). That selects b0 or RUNE_ERROR without a branch.
          mask := (x as rune << 31) >> 31;
          return (b0 as rune) &~ mask | RUNE_ERROR&mask, 1;
      }

      size := x & 7;               // total sequence length in bytes
      ar := accept_ranges[x>>4];   // allowed interval for the second byte
      if n < size as int {
          return RUNE_ERROR, 1;
      }

      b1 := s[1];
      if b1 < ar.lo || ar.hi < b1 {
          return RUNE_ERROR, 1;
      }

      MASK_X :: 0b00111111; // payload of a continuation byte
      MASK_2 :: 0b00011111; // payload of a 2-byte lead
      MASK_3 :: 0b00001111; // payload of a 3-byte lead
      MASK_4 :: 0b00000111; // payload of a 4-byte lead

      if size == 2 {
          return (b0&MASK_2) as rune <<6 | (b1&MASK_X) as rune, 2;
      }

      b2 := s[2];
      if b2 < 0x80 || 0xbf < b2 {
          return RUNE_ERROR, 1;
      }
      if size == 3 {
          return (b0&MASK_3) as rune <<12 | (b1&MASK_X) as rune <<6 | (b2&MASK_X) as rune, 3;
      }

      b3 := s[3];
      if b3 < 0x80 || 0xbf < b3 {
          return RUNE_ERROR, 1;
      }
      // The third byte (b2) supplies bits 6-11 and the fourth (b3) bits 0-5.
      return (b0&MASK_4) as rune <<18 | (b1&MASK_X) as rune <<12 | (b2&MASK_X) as rune <<6 | (b3&MASK_X) as rune, 4;
  }
  // valid_rune reports whether r is non-negative, no greater than MAX_RUNE,
  // and outside the surrogate range SURROGATE_MIN..SURROGATE_MAX.
  valid_rune :: proc(r: rune) -> bool {
      below_surrogates := 0 <= r && r < SURROGATE_MIN;
      above_surrogates := SURROGATE_MAX < r && r <= MAX_RUNE;
      return below_surrogates || above_surrogates;
  }
  // valid_string reports whether s is made up entirely of sequences that
  // pass the accept_sizes / accept_ranges checks. An empty string is valid.
  valid_string :: proc(s: string) -> bool {
      total := s.count;
      pos := 0;
      while pos < total {
          lead := s[pos];
          if lead < RUNE_SELF {
              // Single-byte rune, nothing further to check.
              pos += 1;
              continue;
          }

          info := accept_sizes[lead];
          if info == 0xf1 {
              return false; // byte cannot start a sequence
          }

          width := (info & 7) as int;
          if pos+width > total {
              return false; // sequence is cut off by the end of the string
          }

          // The second byte has a lead-dependent interval.
          second := accept_ranges[info>>4];
          c1 := s[pos+1];
          if c1 < second.lo || second.hi < c1 {
              return false;
          }

          // Any further bytes must be plain continuation bytes.
          if width != 2 {
              c2 := s[pos+2];
              if c2 < 0x80 || 0xbf < c2 {
                  return false;
              }
              if width != 3 {
                  c3 := s[pos+3];
                  if c3 < 0x80 || 0xbf < c3 {
                      return false;
                  }
              }
          }

          pos += width;
      }
      return true;
  }
  // rune_count returns the number of runes in s. A byte that does not begin
  // a well-formed sequence (bad lead byte, truncated sequence, or bad
  // continuation byte) is counted as one rune of width one.
  rune_count :: proc(s: string) -> int {
      total := s.count;
      runes := 0;
      pos := 0;
      while pos < total {
          // Each pass through the loop accounts for exactly one rune.
          runes += 1;

          step := 1; // width used unless a full multi-byte sequence checks out
          lead := s[pos];
          if lead >= RUNE_SELF {
              info := accept_sizes[lead];
              width := (info & 7) as int;
              // accept_ranges is only indexed for real lead bytes; 0xf1
              // marks a byte that cannot start a sequence.
              if info != 0xf1 && pos+width <= total {
                  second := accept_ranges[info>>4];
                  c1 := s[pos+1];
                  well_formed := second.lo <= c1 && c1 <= second.hi;
                  if well_formed && width != 2 {
                      c2 := s[pos+2];
                      well_formed = 0x80 <= c2 && c2 <= 0xbf;
                      if well_formed && width != 3 {
                          c3 := s[pos+3];
                          well_formed = 0x80 <= c3 && c3 <= 0xbf;
                      }
                  }
                  if well_formed {
                      step = width;
                  }
              }
          }

          pos += step;
      }
      return runes;
  }
  // rune_size returns how many bytes are needed to encode r as UTF-8, or -1
  // when r is negative, a surrogate, or greater than MAX_RUNE.
  rune_size :: proc(r: rune) -> int {
      if r < 0 {
          return -1;
      }
      if r <= 1<<7 - 1 {
          return 1;
      }
      if r <= 1<<11 - 1 {
          return 2;
      }
      // Surrogates sit inside the three-byte range, so reject them first.
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1;
      }
      if r <= 1<<16 - 1 {
          return 3;
      }
      if r <= MAX_RUNE {
          return 4;
      }
      return -1;
  }