utf8.odin 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. package utf8
  // Rune returned by the decoders in this file when input is empty or malformed (U+FFFD).
  RUNE_ERROR :: '\ufffd'
  // Bytes below this value are treated as single-byte (ASCII) runes by the
  // decoding, counting and validation procedures in this file.
  RUNE_SELF :: 0x80
  // U+FEFF (byte order mark, going by the name; not referenced in the visible part of this file).
  RUNE_BOM :: 0xfeff
  // A rune with every bit set; as a signed value it is negative, so valid_rune rejects it.
  RUNE_EOF :: ~rune(0)
  // Largest rune accepted by valid_rune and rune_size.
  MAX_RUNE :: '\U0010ffff'
  // Number of bytes decode_last_rune is willing to look back from the end of its input.
  UTF_MAX :: 4
  // Inclusive bounds of the surrogate range; runes inside it are rejected by
  // valid_rune and rune_size.
  SURROGATE_MIN :: 0xd800
  SURROGATE_MAX :: 0xdfff
  // Byte tag bit patterns. TX (0b10xx_xxxx) is the continuation-byte pattern that
  // rune_start tests for. T1..T5 are not referenced in the visible part of this
  // file; presumably they are the leading-byte tags for sequences of increasing
  // length - confirm before relying on them.
  T1 :: 0b0000_0000
  TX :: 0b1000_0000
  T2 :: 0b1100_0000
  T3 :: 0b1110_0000
  T4 :: 0b1111_0000
  T5 :: 0b1111_1000
  // Payload masks: MASKX selects the 6 data bits of a continuation byte, and
  // MASK2/MASK3/MASK4 select the data bits of the first byte of a 2-, 3- and
  // 4-byte sequence (see decode_rune).
  MASKX :: 0b0011_1111
  MASK2 :: 0b0001_1111
  MASK3 :: 0b0000_1111
  MASK4 :: 0b0000_0111
  // Largest values representable in 1, 2 and 3 encoded bytes (7, 11 and 16 bits).
  RUNE1_MAX :: 1<<7 - 1
  RUNE2_MAX :: 1<<11 - 1
  RUNE3_MAX :: 1<<16 - 1
  // The default lowest and highest continuation byte.
  LOCB :: 0b1000_0000
  HICB :: 0b1011_1111
  // Accept_Range is an inclusive [lo, hi] range that the second byte of a
  // multi-byte sequence must fall into (see decode_rune and valid_string).
  Accept_Range :: struct {lo, hi: u8}
  // accept_ranges is indexed by the high nibble of an accept_sizes / _first entry.
  // Entry 0 is the full continuation range 0x80..=0xbf; entries 1-4 are the
  // narrower ranges selected for the first bytes 0xe0, 0xed, 0xf0 and 0xf4.
  accept_ranges := [5]Accept_Range{
      {0x80, 0xbf},
      {0xa0, 0xbf},
      {0x80, 0x9f},
      {0x90, 0xbf},
      {0x80, 0x8f},
  }
  // accept_sizes describes every possible first byte of a sequence.
  // For multi-byte entries the low 3 bits hold the sequence length and the high
  // nibble is the index into accept_ranges used to check the second byte.
  // 0xf0 marks a single-byte ASCII value and 0xf1 marks a byte that can never
  // start a valid sequence; decode_rune consumes one byte for both.
  accept_sizes := [256]u8{
      0x00..=0x7f = 0xf0, // ascii, size 1
      0x80..=0xc1 = 0xf1, // invalid, size 1
      0xc2..=0xdf = 0x02, // accept 0, size 2
      0xe0 = 0x13, // accept 1, size 3
      0xe1..=0xec = 0x03, // accept 0, size 3
      0xed = 0x23, // accept 2, size 3
      0xee..=0xef = 0x03, // accept 0, size 3
      0xf0 = 0x34, // accept 3, size 4
      0xf1..=0xf3 = 0x04, // accept 0, size 4
      0xf4 = 0x44, // accept 4, size 4
      0xf5..=0xff = 0xf1, // invalid, size 1
  }
  // encode_rune encodes the rune c as UTF-8.
  //
  // Returns a 4-byte buffer together with the number of leading bytes of that
  // buffer that are in use (1 to 4); unused trailing bytes are zero.
  //
  // Runes that cannot be encoded - negative values, values above 0x10ffff and
  // surrogates (0xd800..=0xdfff) - are replaced by U+FFFD, which is always
  // emitted in its 3-byte form (0xef 0xbf 0xbd).
  encode_rune :: proc(c: rune) -> ([4]u8, int) {
      r := c

      buf: [4]u8
      i := u32(r)
      mask :: u8(0x3f)

      if i <= 1<<7-1 {
          buf[0] = u8(r)
          return buf, 1
      }
      if i <= 1<<11-1 {
          buf[0] = 0xc0 | u8(r>>6)
          buf[1] = 0x80 | u8(r) & mask
          return buf, 2
      }

      // Invalid or surrogate range: substitute the replacement character.
      // `i` must be updated along with `r`, otherwise an out-of-range input
      // would still take the 4-byte path below and produce an overlong
      // encoding of U+FFFD.
      if i > 0x0010ffff ||
         (0xd800 <= i && i <= 0xdfff) {
          r = 0xfffd
          i = 0xfffd
      }

      if i <= 1<<16-1 {
          buf[0] = 0xe0 | u8(r>>12)
          buf[1] = 0x80 | u8(r>>6) & mask
          buf[2] = 0x80 | u8(r) & mask
          return buf, 3
      }

      buf[0] = 0xf0 | u8(r>>18)
      buf[1] = 0x80 | u8(r>>12) & mask
      buf[2] = 0x80 | u8(r>>6) & mask
      buf[3] = 0x80 | u8(r) & mask
      return buf, 4
  }
  // decode_rune_in_string is decode_rune for string input: it reinterprets the
  // string's bytes as a []u8 (no copy) and decodes the first rune from them.
  decode_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
      raw := transmute([]u8)s
      return decode_rune(raw)
  }
  // decode_rune decodes the first UTF-8 sequence in s.
  //
  // Returns the rune and the number of bytes it occupies.
  // Empty input yields (RUNE_ERROR, 0). Any malformed or truncated sequence
  // yields (RUNE_ERROR, 1), i.e. exactly one byte is consumed.
  decode_rune :: proc(s: []u8) -> (rune, int) {
      if len(s) < 1 {
          return RUNE_ERROR, 0
      }

      lead := s[0]
      info := accept_sizes[lead]

      // Single-byte cases: the low bit of the table entry separates
      // ASCII (0xf0) from a byte that cannot start a sequence (0xf1).
      if info >= 0xF0 {
          if (info & 1) != 0 {
              return RUNE_ERROR, 1
          }
          return rune(lead), 1
      }

      width := int(info & 7)
      if len(s) < width {
          return RUNE_ERROR, 1
      }

      // The second byte has a range that depends on the first byte.
      second := accept_ranges[info>>4]
      c1 := s[1]
      if c1 < second.lo || second.hi < c1 {
          return RUNE_ERROR, 1
      }
      if width == 2 {
          return rune(lead&MASK2)<<6 | rune(c1&MASKX), 2
      }

      // Remaining bytes must be ordinary continuation bytes.
      c2 := s[2]
      if c2 < LOCB || HICB < c2 {
          return RUNE_ERROR, 1
      }
      if width == 3 {
          return rune(lead&MASK3)<<12 | rune(c1&MASKX)<<6 | rune(c2&MASKX), 3
      }

      c3 := s[3]
      if c3 < LOCB || HICB < c3 {
          return RUNE_ERROR, 1
      }
      return rune(lead&MASK4)<<18 | rune(c1&MASKX)<<12 | rune(c2&MASKX)<<6 | rune(c3&MASKX), 4
  }
  // string_to_runes decodes s into a newly allocated slice of runes.
  // The slice is sized with rune_count_in_string and allocated with `allocator`.
  string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) {
      runes = make([]rune, rune_count_in_string(s), allocator)

      next := 0
      for ch in s {
          runes[next] = ch
          next += 1
      }
      return runes
  }
  // runes_to_string encodes every rune in `runes` with encode_rune and returns
  // the concatenated bytes as a string allocated with `allocator`.
  runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string {
      // First pass: measure the encoded length so a single allocation suffices.
      total := 0
      for r in runes {
          _, width := encode_rune(r)
          total += width
      }

      out := make([]byte, total, allocator)

      // Second pass: write each encoding at the running cursor.
      cursor := 0
      for r in runes {
          encoded, width := encode_rune(r)
          copy(out[cursor:], encoded[:width])
          cursor += width
      }

      return string(out)
  }
  // decode_last_rune_in_string is decode_last_rune for string input: the
  // string's bytes are reinterpreted as a []u8 (no copy).
  decode_last_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
      raw := transmute([]u8)s
      return decode_last_rune(raw)
  }
  // decode_last_rune decodes the final UTF-8 sequence in s.
  //
  // Returns the rune and its width in bytes. Empty input yields (RUNE_ERROR, 0);
  // if the bytes at the end of s do not form exactly one sequence ending at
  // len(s), the result is (RUNE_ERROR, 1).
  decode_last_rune :: proc(s: []u8) -> (rune, int) {
      end := len(s)
      if end == 0 {
          return RUNE_ERROR, 0
      }

      // Fast path: the last byte is a single-byte rune.
      last := s[end-1]
      if last < RUNE_SELF {
          return rune(last), 1
      }

      // Walk backwards, at most UTF_MAX bytes from the end, until a byte that
      // can start a sequence is found.
      lowest := max(end - UTF_MAX, 0)
      start := end - 2
      for start >= lowest && !rune_start(s[start]) {
          start -= 1
      }
      start = max(start, 0)

      // The decoded sequence has to end exactly at the end of the input.
      r, size := decode_rune(s[start:end])
      if start+size != end {
          return RUNE_ERROR, 1
      }
      return r, size
  }
  // rune_at_pos returns the rune at rune index `pos` in s (0-based, counted in
  // runes rather than bytes). Returns RUNE_ERROR when pos is negative or past
  // the last rune.
  rune_at_pos :: proc(s: string, pos: int) -> rune {
      if pos >= 0 {
          seen := 0
          for ch in s {
              if seen == pos {
                  return ch
              }
              seen += 1
          }
      }
      return RUNE_ERROR
  }
  // rune_string_at_pos returns the substring of s that holds the rune at rune
  // index `pos` (0-based, counted in runes rather than bytes).
  // Returns "" when pos is negative or past the last rune.
  rune_string_at_pos :: proc(s: string, pos: int) -> string {
      if pos < 0 {
          return ""
      }

      i := 0
      for _, offset in s {
          if i == pos {
              // Take the width from the bytes actually present. An invalid byte
              // iterates as RUNE_ERROR but occupies one byte, whereas
              // rune_size(RUNE_ERROR) is 3, which would return neighbouring
              // bytes or slice past the end of s.
              _, w := decode_rune_in_string(s[offset:])
              return s[offset:][:w]
          }
          i += 1
      }
      return ""
  }
  // rune_at decodes and returns the rune that starts at byte offset
  // `byte_index` in s. The decoded width is discarded.
  rune_at :: proc(s: string, byte_index: int) -> rune {
      tail := s[byte_index:]
      ch, _ := decode_rune_in_string(tail)
      return ch
  }
  // rune_offset returns the byte offset in s of the rune at rune index `pos`,
  // counting runes from the optional byte offset `start`.
  // Returns -1 if pos is negative or the string ends before that rune.
  rune_offset :: proc(s: string, pos: int, start: int = 0) -> int {
      if pos < 0 {
          return -1
      }

      tail := s[start:]
      seen := 0
      for _, rel in tail {
          if seen == pos {
              // `rel` is relative to `tail`; shift it back into s.
              return start + rel
          }
          seen += 1
      }
      return -1
  }
  // valid_rune reports whether r lies in 0..=MAX_RUNE and is not inside the
  // surrogate range SURROGATE_MIN..=SURROGATE_MAX.
  valid_rune :: proc(r: rune) -> bool {
      in_range     := 0 <= r && r <= MAX_RUNE
      is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX
      return in_range && !is_surrogate
  }
  // valid_string reports whether s consists entirely of well-formed UTF-8
  // sequences. An empty string is valid.
  valid_string :: proc(s: string) -> bool {
      n := len(s)
      for i := 0; i < n; {
          si := s[i]
          if si < RUNE_SELF { // ascii
              i += 1
              continue
          }

          x := accept_sizes[si]
          if x == 0xf1 {
              // Byte that can never start a sequence.
              return false
          }

          size := int(x & 7)
          if i+size > n {
              // Sequence is cut off by the end of the string.
              return false
          }

          // The second byte is checked against a range selected by the first
          // byte; every later byte must be an ordinary continuation byte.
          ar := accept_ranges[x>>4]
          if b := s[i+1]; b < ar.lo || ar.hi < b {
              return false
          } else if size == 2 {
              // Okay
          } else if c := s[i+2]; c < LOCB || HICB < c {
              return false
          } else if size == 3 {
              // Okay
          } else if d := s[i+3]; d < LOCB || HICB < d {
              // Both bounds must test the fourth byte `d`; testing `b` here
              // would accept a fourth byte below 0x80.
              return false
          }
          i += size
      }
      return true
  }
  // rune_start reports whether b is anything other than a continuation byte,
  // i.e. whether its top two bits differ from 0b10.
  rune_start :: #force_inline proc(b: u8) -> bool {
      top_two := b >> 6
      return top_two != 0b10
  }
  // rune_count_in_string is rune_count for string input: the string's bytes
  // are reinterpreted as a []u8 (no copy).
  rune_count_in_string :: #force_inline proc(s: string) -> int {
      raw := transmute([]u8)s
      return rune_count(raw)
  }
  // rune_count returns the number of runes in s.
  // Every byte that is not part of a well-formed sequence counts as one rune
  // of width 1.
  rune_count :: proc(s: []u8) -> int {
      total := 0
      length := len(s)

      i := 0
      for i < length {
          // Each pass over the loop accounts for exactly one rune.
          total += 1

          // Number of bytes to advance; stays 1 for ASCII and for any
          // malformed or truncated sequence.
          step := 1

          lead := s[i]
          if lead >= RUNE_SELF {
              info := accept_sizes[lead]
              width := int(info & 7)

              if info != 0xf1 && i+width <= length {
                  second := accept_ranges[info>>4]

                  well_formed := second.lo <= s[i+1] && s[i+1] <= second.hi
                  if well_formed && width >= 3 {
                      well_formed = 0x80 <= s[i+2] && s[i+2] <= 0xbf
                  }
                  if well_formed && width == 4 {
                      well_formed = 0x80 <= s[i+3] && s[i+3] <= 0xbf
                  }

                  if well_formed {
                      step = width
                  }
              }
          }

          i += step
      }
      return total
  }
  // rune_size returns the number of bytes needed to encode r as UTF-8 (1 to 4),
  // or -1 if r is negative, a surrogate, or greater than MAX_RUNE.
  rune_size :: proc(r: rune) -> int {
      if r < 0 || r > MAX_RUNE {
          return -1
      }
      if r <= 1<<7 - 1 {
          return 1
      }
      if r <= 1<<11 - 1 {
          return 2
      }
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1
      }
      if r <= 1<<16 - 1 {
          return 3
      }
      return 4
  }
  // full_rune reports whether b starts with a complete UTF-8 encoding of a rune.
  // A malformed encoding also counts as complete, because it decodes to an
  // error rune (RUNE_ERROR) of width 1.
  full_rune :: proc(b: []byte) -> bool {
      count := len(b)
      if count == 0 {
          return false
      }

      info := _first[b[0]]
      if count >= int(info & 7) {
          return true
      }

      // Fewer bytes than the first byte announces: the prefix is still "full"
      // if one of the bytes already present makes the sequence invalid.
      second := accept_ranges[info>>4]
      if count > 1 {
          if b[1] < second.lo || second.hi < b[1] {
              return true
          }
      }
      if count > 2 {
          if b[2] < LOCB || HICB < b[2] {
              return true
          }
      }
      return false
  }
  // full_rune_in_string reports whether s starts with a complete UTF-8 encoding
  // of a rune. A malformed encoding also counts as complete, because it decodes
  // to an error rune (RUNE_ERROR) of width 1.
  full_rune_in_string :: proc(s: string) -> bool {
      raw := transmute([]byte)s
      return full_rune(raw)
  }
  // _first describes every possible first byte of a sequence, as used by full_rune.
  // Low 3 bits: sequence length. High nibble: index into accept_ranges for the
  // second byte ("accept N" below). 0xf0 and 0xf1 are the single-byte markers
  // for ASCII and for bytes that cannot start a sequence.
  // The values are the same as those in accept_sizes.
  _first := [256]u8{
      0x00..=0x7f = 0xf0, // ascii, size 1
      0x80..=0xc1 = 0xf1, // invalid, size 1
      0xc2..=0xdf = 0x02, // accept 0, size 2
      0xe0 = 0x13, // accept 1, size 3
      0xe1..=0xec = 0x03, // accept 0, size 3
      0xed = 0x23, // accept 2, size 3
      0xee..=0xef = 0x03, // accept 0, size 3
      0xf0 = 0x34, // accept 3, size 4
      0xf1..=0xf3 = 0x04, // accept 0, size 4
      0xf4 = 0x44, // accept 4, size 4
      0xf5..=0xff = 0xf1, // invalid, size 1
  }