utf8.odin 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  // Code points and size limits shared by the UTF-8 routines in this file.
  RUNE_ERROR    :: '\ufffd'     // rune handed back by decode_rune for malformed input
  RUNE_SELF     :: 0x80         // bytes below this value are handled as single-byte ASCII
  RUNE_BOM      :: 0xfeff       // U+FEFF
  RUNE_EOF      :: ~(0 as rune) // bitwise complement of a zero rune
  MAX_RUNE      :: '\U0010ffff' // largest rune accepted by valid_rune and rune_size
  UTF_MAX       :: 4            // presumably the maximum encoded length in bytes (encode_rune writes at most 4)

  // Inclusive bounds of the surrogate range; valid_rune and rune_size reject runes inside it.
  SURROGATE_MIN :: 0xd800
  SURROGATE_MAX :: 0xdfff
  // Accept_Range is an inclusive byte interval [lo, hi].
  // The decoders in this file compare the second byte of a multi-byte
  // sequence against one of these intervals.
  Accept_Range :: struct {
      lo: u8 // smallest accepted byte value
      hi: u8 // largest accepted byte value
  }
  // Allowed intervals for the SECOND byte of a multi-byte sequence.
  // The high nibble of an accept_sizes entry is the index into this table.
  accept_ranges := [5]Accept_Range{
      {0x80, 0xbf}, // index 0: used by every lead byte not listed below
      {0xa0, 0xbf}, // index 1: selected by lead byte 0xe0
      {0x80, 0x9f}, // index 2: selected by lead byte 0xed
      {0x90, 0xbf}, // index 3: selected by lead byte 0xf0
      {0x80, 0x8f}, // index 4: selected by lead byte 0xf4
  }
  // Per-byte classification of a sequence's FIRST byte, indexed by the byte value.
  //
  // Entry layout, as read by the decoders in this file:
  //   0xf0          single-byte (ASCII) rune
  //   0xf1          byte that cannot start a sequence
  //   anything else low three bits  (x & 7)  = total sequence length in bytes
  //                 high nibble     (x >> 4) = index into accept_ranges for the second byte
  accept_sizes := [256]byte{
      // ASCII
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00..0x0f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10..0x1f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20..0x2f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30..0x3f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40..0x4f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50..0x5f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60..0x6f
      0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70..0x7f

      // Not usable as a first byte
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80..0x8f
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90..0x9f
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0..0xaf
      0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0..0xbf

      // 0xc0 and 0xc1 are rejected; 0xc2..0xdf start 2-byte sequences
      0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0..0xcf
      0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0..0xdf

      // 3-byte sequences; 0xe0 and 0xed use restricted second-byte ranges
      0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0..0xef

      // 4-byte sequences (0xf0..0xf4); 0xf5..0xff are rejected
      0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0..0xff
  }
  // encode_rune converts the rune `r_` to its UTF-8 byte sequence.
  //
  // Returns a 4-byte buffer together with the number of leading bytes of that
  // buffer that hold the encoding (1 to 4); the remaining bytes are left zero.
  //
  // Values that are not encodable -- surrogates (0xd800..0xdfff) and anything
  // whose unsigned value exceeds 0x10ffff -- are encoded as U+FFFD, which is
  // the three bytes EF BF BD.
  encode_rune :: proc(r_: rune) -> ([4]byte, int) {
      r := r_
      buf: [4]byte
      mask :: 0x3f as byte

      // All range tests are done on the unsigned value, so a rune with the
      // top bit set compares as larger than 0x10ffff and is treated as invalid.
      i := r as u32

      // Invalid or surrogate range: substitute U+FFFD.
      // `i` must be replaced together with `r`; otherwise an out-of-range
      // input would still pick the 4-byte branch below and emit an overlong
      // (invalid) encoding of U+FFFD.
      if i > 0x0010ffff || (0xd800 <= i && i <= 0xdfff) {
          r = 0xfffd
          i = 0xfffd
      }

      if i <= (1<<7) - 1 {
          buf[0] = r as byte
          return buf, 1
      }
      if i <= (1<<11) - 1 {
          buf[0] = 0xc0 | ((r>>6) as byte)
          buf[1] = 0x80 | ((r as byte) & mask)
          return buf, 2
      }
      if i <= (1<<16) - 1 {
          buf[0] = 0xe0 | ((r>>12) as byte)
          buf[1] = 0x80 | (((r>>6) as byte) & mask)
          buf[2] = 0x80 | ((r as byte) & mask)
          return buf, 3
      }

      buf[0] = 0xf0 | ((r>>18) as byte)
      buf[1] = 0x80 | (((r>>12) as byte) & mask)
      buf[2] = 0x80 | (((r>>6) as byte) & mask)
      buf[3] = 0x80 | ((r as byte) & mask)
      return buf, 4
  }
  // decode_rune decodes the first UTF-8 sequence in `s`.
  //
  // Returns the decoded rune and the number of bytes it occupies:
  //   - (RUNE_ERROR, 0) when `s` is empty;
  //   - (RUNE_ERROR, 1) when the first byte cannot start a sequence, the
  //     sequence is cut short by the end of `s`, or a following byte is
  //     outside its accepted range;
  //   - (rune, 1..4) otherwise.
  decode_rune :: proc(s: string) -> (rune, int) {
      n := s.count
      if n < 1 {
          return RUNE_ERROR, 0
      }

      b0 := s[0]
      x := accept_sizes[b0]
      if x >= 0xf0 {
          // Only two table values reach this branch:
          // 0xf1 marks a byte that cannot start a sequence, 0xf0 marks ASCII.
          if x == 0xf1 {
              return RUNE_ERROR, 1
          }
          return b0 as rune, 1
      }

      // Low three bits: sequence length. High nibble: which range the second byte must fall in.
      size := (x & 7) as int
      ar := accept_ranges[x>>4]
      if n < size {
          return RUNE_ERROR, 1
      }

      MASK_X :: 0b00111111 // payload bits of a continuation byte
      MASK_2 :: 0b00011111 // payload bits of a 2-byte lead
      MASK_3 :: 0b00001111 // payload bits of a 3-byte lead
      MASK_4 :: 0b00000111 // payload bits of a 4-byte lead

      b1 := s[1]
      if b1 < ar.lo || ar.hi < b1 {
          return RUNE_ERROR, 1
      }
      if size == 2 {
          return (((b0&MASK_2) as rune) << 6) | ((b1&MASK_X) as rune), 2
      }

      b2 := s[2]
      if b2 < 0x80 || 0xbf < b2 {
          return RUNE_ERROR, 1
      }
      if size == 3 {
          return (((b0&MASK_3) as rune) << 12) | (((b1&MASK_X) as rune) << 6) | ((b2&MASK_X) as rune), 3
      }

      b3 := s[3]
      if b3 < 0x80 || 0xbf < b3 {
          return RUNE_ERROR, 1
      }
      // Each of the four bytes contributes its own payload bits:
      // b0 -> bits 18..20, b1 -> bits 12..17, b2 -> bits 6..11, b3 -> bits 0..5.
      return (((b0&MASK_4) as rune) << 18) | (((b1&MASK_X) as rune) << 12) | (((b2&MASK_X) as rune) << 6) | ((b3&MASK_X) as rune), 4
  }
  // valid_rune reports whether `r` is acceptable as a rune value:
  // it must not be negative, must not exceed MAX_RUNE, and must not lie
  // in the surrogate range SURROGATE_MIN..SURROGATE_MAX (inclusive).
  valid_rune :: proc(r: rune) -> bool {
      out_of_range := r < 0 || r > MAX_RUNE
      is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX
      return !(out_of_range || is_surrogate)
  }
  // valid_string reports whether `s` consists entirely of well-formed
  // UTF-8 sequences. An empty string is valid.
  valid_string :: proc(s: string) -> bool {
      total := s.count
      for pos := 0; pos < total; {
          lead := s[pos]
          if lead < RUNE_SELF {
              // Single-byte (ASCII) rune.
              pos++
              continue
          }

          info := accept_sizes[lead]
          if info == 0xf1 {
              // This byte cannot start a sequence.
              return false
          }

          width := (info & 7) as int
          if pos+width > total {
              // The sequence would run past the end of the string.
              return false
          }

          // The second byte has a lead-specific range; later bytes must be 0x80..0xbf.
          second := accept_ranges[info>>4]
          c1 := s[pos+1]
          if c1 < second.lo || second.hi < c1 {
              return false
          }
          if width != 2 {
              c2 := s[pos+2]
              if c2 < 0x80 || 0xbf < c2 {
                  return false
              }
              if width != 3 {
                  c3 := s[pos+3]
                  if c3 < 0x80 || 0xbf < c3 {
                      return false
                  }
              }
          }

          pos += width
      }
      return true
  }
  // rune_count returns the number of runes in `s`.
  // Any byte that does not begin a well-formed sequence is counted as one
  // rune of width one, so malformed input still yields a count.
  rune_count :: proc(s: string) -> int {
      total := s.count
      runes := 0
      for pos := 0; pos < total; {
          // Unless a complete, well-formed multi-byte sequence is found,
          // exactly one byte is consumed.
          step := 1

          lead := s[pos]
          if lead >= RUNE_SELF {
              info := accept_sizes[lead]
              width := (info & 7) as int

              // 0xf1 marks a byte that cannot start a sequence; the length
              // test rejects sequences cut off by the end of the string.
              if info != 0xf1 && pos+width <= total {
                  well_formed := true

                  second := accept_ranges[info>>4]
                  c1 := s[pos+1]
                  if c1 < second.lo || second.hi < c1 {
                      well_formed = false
                  } else if width != 2 {
                      c2 := s[pos+2]
                      if c2 < 0x80 || 0xbf < c2 {
                          well_formed = false
                      } else if width != 3 {
                          c3 := s[pos+3]
                          if c3 < 0x80 || 0xbf < c3 {
                              well_formed = false
                          }
                      }
                  }

                  if well_formed {
                      step = width
                  }
              }
          }

          pos += step
          runes++
      }
      return runes
  }
  // rune_size returns how many bytes are needed to encode `r` as UTF-8,
  // or -1 when `r` is negative, a surrogate, or greater than MAX_RUNE.
  rune_size :: proc(r: rune) -> int {
      if r < 0 {
          return -1
      }
      if r <= 1<<7 - 1 {
          return 1
      }
      if r <= 1<<11 - 1 {
          return 2
      }
      // Surrogates sit inside the 3-byte range, so they are rejected before it is tested.
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1
      }
      if r <= 1<<16 - 1 {
          return 3
      }
      if r <= MAX_RUNE {
          return 4
      }
      return -1
  }