utf8.odin 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. package utf8
// Rune returned by the decoders in this file when the input is empty or malformed (U+FFFD).
RUNE_ERROR :: '\ufffd'
// Bytes below this value are treated as complete single-byte (ASCII) runes.
RUNE_SELF :: 0x80
// NOTE(review): presumably the byte order mark U+FEFF; not referenced in the visible code.
RUNE_BOM :: 0xfeff
// All bits set (i.e. -1 as a rune); not referenced in the visible code.
RUNE_EOF :: ~rune(0)
// Largest rune accepted by valid_rune and rune_size.
MAX_RUNE :: '\U0010ffff'
// Maximum number of bytes in one encoded rune; bounds the look-back in decode_last_rune_in_bytes.
UTF_MAX :: 4
// Inclusive bounds of the surrogate range, which valid_rune and rune_size reject.
SURROGATE_MIN :: 0xd800
SURROGATE_MAX :: 0xdfff
// Bit patterns T1..T5/TX are not referenced in the visible code.
// NOTE(review): presumably the tag bits of lead/continuation bytes - confirm before relying on them.
T1 :: 0b0000_0000
TX :: 0b1000_0000
T2 :: 0b1100_0000
T3 :: 0b1110_0000
T4 :: 0b1111_0000
T5 :: 0b1111_1000
// Payload masks used by decode_rune_in_bytes:
// MASKX for continuation bytes, MASK2/MASK3/MASK4 for the lead byte of a 2/3/4-byte sequence.
MASKX :: 0b0011_1111
MASK2 :: 0b0001_1111
MASK3 :: 0b0000_1111
MASK4 :: 0b0000_0111
// Same thresholds encode_rune compares against to choose a 1-, 2- or 3-byte encoding.
RUNE1_MAX :: 1<<7 - 1
RUNE2_MAX :: 1<<11 - 1
RUNE3_MAX :: 1<<16 - 1
// The default lowest and highest continuation byte.
LOCB :: 0b1000_0000
HICB :: 0b1011_1111
// Accept_Range is the inclusive range [lo, hi] allowed for the second byte of a multi-byte sequence.
Accept_Range :: struct {lo, hi: u8}
// accept_ranges is indexed by the high nibble (x>>4) of an accept_sizes / _first table entry.
accept_ranges := [5]Accept_Range{
	{0x80, 0xbf}, // 0: default continuation range (LOCB..HICB)
	{0xa0, 0xbf}, // 1: used for lead byte 0xe0
	{0x80, 0x9f}, // 2: used for lead byte 0xed
	{0x90, 0xbf}, // 3: used for lead byte 0xf0
	{0x80, 0x8f}, // 4: used for lead byte 0xf4
}
// accept_sizes maps a first byte to packed decoding information:
//   - low 3 bits (x & 7): total size in bytes of the sequence,
//   - high nibble (x >> 4): index into accept_ranges for the second byte.
// Entries >= 0xf0 mark bytes that decode as a single byte: 0xf0 is ASCII, 0xf1 is invalid.
accept_sizes := [256]u8{
	0x00..=0x7f = 0xf0, // ascii, size 1
	0x80..=0xc1 = 0xf1, // invalid, size 1
	0xc2..=0xdf = 0x02, // accept 0, size 2
	0xe0 = 0x13, // accept 1, size 3
	0xe1..=0xec = 0x03, // accept 0, size 3
	0xed = 0x23, // accept 2, size 3
	0xee..=0xef = 0x03, // accept 0, size 3
	0xf0 = 0x34, // accept 3, size 4
	0xf1..=0xf3 = 0x04, // accept 0, size 4
	0xf4 = 0x44, // accept 4, size 4
	0xf5..=0xff = 0xf1, // invalid, size 1
}
  47. encode_rune :: proc(c: rune) -> ([4]u8, int) {
  48. r := c
  49. buf: [4]u8
  50. i := u32(r)
  51. mask :: u8(0x3f)
  52. if i <= 1<<7-1 {
  53. buf[0] = u8(r)
  54. return buf, 1
  55. }
  56. if i <= 1<<11-1 {
  57. buf[0] = 0xc0 | u8(r>>6)
  58. buf[1] = 0x80 | u8(r) & mask
  59. return buf, 2
  60. }
  61. // Invalid or Surrogate range
  62. if i > 0x0010ffff ||
  63. (0xd800 <= i && i <= 0xdfff) {
  64. r = 0xfffd
  65. }
  66. if i <= 1<<16-1 {
  67. buf[0] = 0xe0 | u8(r>>12)
  68. buf[1] = 0x80 | u8(r>>6) & mask
  69. buf[2] = 0x80 | u8(r) & mask
  70. return buf, 3
  71. }
  72. buf[0] = 0xf0 | u8(r>>18)
  73. buf[1] = 0x80 | u8(r>>12) & mask
  74. buf[2] = 0x80 | u8(r>>6) & mask
  75. buf[3] = 0x80 | u8(r) & mask
  76. return buf, 4
  77. }
// decode_rune is a procedure group: a `string` argument selects decode_rune_in_string,
// a `[]u8` argument selects decode_rune_in_bytes.
decode_rune :: proc{
	decode_rune_in_string,
	decode_rune_in_bytes,
}
  82. decode_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
  83. return decode_rune_in_bytes(transmute([]u8)s)
  84. }
  85. decode_rune_in_bytes :: proc(s: []u8) -> (rune, int) {
  86. n := len(s)
  87. if n < 1 {
  88. return RUNE_ERROR, 0
  89. }
  90. s0 := s[0]
  91. x := accept_sizes[s0]
  92. if x >= 0xF0 {
  93. mask := rune(x) << 31 >> 31 // NOTE(bill): Create 0x0000 or 0xffff.
  94. return rune(s[0])&~mask | RUNE_ERROR&mask, 1
  95. }
  96. sz := x & 7
  97. accept := accept_ranges[x>>4]
  98. if n < int(sz) {
  99. return RUNE_ERROR, 1
  100. }
  101. b1 := s[1]
  102. if b1 < accept.lo || accept.hi < b1 {
  103. return RUNE_ERROR, 1
  104. }
  105. if sz == 2 {
  106. return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2
  107. }
  108. b2 := s[2]
  109. if b2 < LOCB || HICB < b2 {
  110. return RUNE_ERROR, 1
  111. }
  112. if sz == 3 {
  113. return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3
  114. }
  115. b3 := s[3]
  116. if b3 < LOCB || HICB < b3 {
  117. return RUNE_ERROR, 1
  118. }
  119. return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4
  120. }
  121. string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) {
  122. n := rune_count_in_string(s)
  123. runes = make([]rune, n, allocator)
  124. i := 0
  125. for r in s {
  126. runes[i] = r
  127. i += 1
  128. }
  129. return
  130. }
  131. runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string {
  132. byte_count := 0
  133. for r in runes {
  134. _, w := encode_rune(r)
  135. byte_count += w
  136. }
  137. bytes := make([]byte, byte_count, allocator)
  138. offset := 0
  139. for r in runes {
  140. b, w := encode_rune(r)
  141. copy(bytes[offset:], b[:w])
  142. offset += w
  143. }
  144. return string(bytes)
  145. }
// decode_last_rune is a procedure group: a `string` argument selects decode_last_rune_in_string,
// a `[]u8` argument selects decode_last_rune_in_bytes.
decode_last_rune :: proc{
	decode_last_rune_in_string,
	decode_last_rune_in_bytes,
}
  150. decode_last_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
  151. return decode_last_rune_in_bytes(transmute([]u8)s)
  152. }
  153. decode_last_rune_in_bytes :: proc(s: []u8) -> (rune, int) {
  154. r: rune
  155. size: int
  156. start, end, limit: int
  157. end = len(s)
  158. if end == 0 {
  159. return RUNE_ERROR, 0
  160. }
  161. start = end-1
  162. r = rune(s[start])
  163. if r < RUNE_SELF {
  164. return r, 1
  165. }
  166. limit = max(end - UTF_MAX, 0)
  167. for start-=1; start >= limit; start-=1 {
  168. if rune_start(s[start]) {
  169. break
  170. }
  171. }
  172. start = max(start, 0)
  173. r, size = decode_rune(s[start:end])
  174. if start+size != end {
  175. return RUNE_ERROR, 1
  176. }
  177. return r, size
  178. }
  179. rune_at_pos :: proc(s: string, pos: int) -> rune {
  180. if pos < 0 {
  181. return RUNE_ERROR
  182. }
  183. i := 0
  184. for r in s {
  185. if i == pos {
  186. return r
  187. }
  188. i += 1
  189. }
  190. return RUNE_ERROR
  191. }
  192. rune_string_at_pos :: proc(s: string, pos: int) -> string {
  193. if pos < 0 {
  194. return ""
  195. }
  196. i := 0
  197. for c, offset in s {
  198. if i == pos {
  199. w := rune_size(c)
  200. return s[offset:][:w]
  201. }
  202. i += 1
  203. }
  204. return ""
  205. }
  206. rune_at :: proc(s: string, byte_index: int) -> rune {
  207. r, _ := decode_rune_in_string(s[byte_index:])
  208. return r
  209. }
  210. // Returns the byte position of rune at position pos in s with an optional start byte position.
  211. // Returns -1 if it runs out of the string.
  212. rune_offset :: proc(s: string, pos: int, start: int = 0) -> int {
  213. if pos < 0 {
  214. return -1
  215. }
  216. i := 0
  217. for _, offset in s[start:] {
  218. if i == pos {
  219. return offset+start
  220. }
  221. i += 1
  222. }
  223. return -1
  224. }
  225. valid_rune :: proc(r: rune) -> bool {
  226. if r < 0 {
  227. return false
  228. } else if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
  229. return false
  230. } else if r > MAX_RUNE {
  231. return false
  232. }
  233. return true
  234. }
  235. valid_string :: proc(s: string) -> bool {
  236. n := len(s)
  237. for i := 0; i < n; {
  238. si := s[i]
  239. if si < RUNE_SELF { // ascii
  240. i += 1
  241. continue
  242. }
  243. x := accept_sizes[si]
  244. if x == 0xf1 {
  245. return false
  246. }
  247. size := int(x & 7)
  248. if i+size > n {
  249. return false
  250. }
  251. ar := accept_ranges[x>>4]
  252. if b := s[i+1]; b < ar.lo || ar.hi < b {
  253. return false
  254. } else if size == 2 {
  255. // Okay
  256. } else if c := s[i+2]; c < 0x80 || 0xbf < c {
  257. return false
  258. } else if size == 3 {
  259. // Okay
  260. } else if d := s[i+3]; b < 0x80 || 0xbf < d {
  261. return false
  262. }
  263. i += size
  264. }
  265. return true
  266. }
  267. rune_start :: #force_inline proc(b: u8) -> bool {
  268. return b&0xc0 != 0x80
  269. }
// rune_count is a procedure group: a `string` argument selects rune_count_in_string,
// a `[]u8` argument selects rune_count_in_bytes.
rune_count :: proc{
	rune_count_in_string,
	rune_count_in_bytes,
}
  274. rune_count_in_string :: #force_inline proc(s: string) -> int {
  275. return rune_count_in_bytes(transmute([]u8)s)
  276. }
  277. rune_count_in_bytes :: proc(s: []u8) -> int {
  278. count := 0
  279. n := len(s)
  280. for i := 0; i < n; {
  281. defer count += 1
  282. si := s[i]
  283. if si < RUNE_SELF { // ascii
  284. i += 1
  285. continue
  286. }
  287. x := accept_sizes[si]
  288. if x == 0xf1 {
  289. i += 1
  290. continue
  291. }
  292. size := int(x & 7)
  293. if i+size > n {
  294. i += 1
  295. continue
  296. }
  297. ar := accept_ranges[x>>4]
  298. if b := s[i+1]; b < ar.lo || ar.hi < b {
  299. size = 1
  300. } else if size == 2 {
  301. // Okay
  302. } else if c := s[i+2]; c < 0x80 || 0xbf < c {
  303. size = 1
  304. } else if size == 3 {
  305. // Okay
  306. } else if d := s[i+3]; d < 0x80 || 0xbf < d {
  307. size = 1
  308. }
  309. i += size
  310. }
  311. return count
  312. }
  313. rune_size :: proc(r: rune) -> int {
  314. switch {
  315. case r < 0: return -1
  316. case r <= 1<<7 - 1: return 1
  317. case r <= 1<<11 - 1: return 2
  318. case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1
  319. case r <= 1<<16 - 1: return 3
  320. case r <= MAX_RUNE: return 4
  321. }
  322. return -1
  323. }
// full_rune reports whether the input begins with a full UTF-8 encoding of a rune.
// An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR).
// Procedure group: a `[]byte` argument selects full_rune_in_bytes, a `string` argument selects full_rune_in_string.
full_rune :: proc{
	full_rune_in_bytes,
	full_rune_in_string,
}
  330. // full_rune_in_bytes reports if the bytes in b begin with a full utf-8 encoding of a rune or not
  331. // An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR)
  332. full_rune_in_bytes :: proc(b: []byte) -> bool {
  333. n := len(b)
  334. if n == 0 {
  335. return false
  336. }
  337. x := _first[b[0]]
  338. if n >= int(x & 7) {
  339. return true
  340. }
  341. accept := accept_ranges[x>>4]
  342. if n > 1 && (b[1] < accept.lo || accept.hi < b[1]) {
  343. return true
  344. } else if n > 2 && (b[2] < LOCB || HICB < b[2]) {
  345. return true
  346. }
  347. return false
  348. }
  349. // full_rune_in_string reports if the bytes in s begin with a full utf-8 encoding of a rune or not
  350. // An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR)
  351. full_rune_in_string :: proc(s: string) -> bool {
  352. return full_rune_in_bytes(transmute([]byte)s)
  353. }
// _first maps a first byte to packed information read by full_rune_in_bytes:
//   - low 3 bits (x & 7): total size in bytes of the sequence,
//   - high nibble (x >> 4): index into accept_ranges for the second byte.
// Its contents are identical to accept_sizes.
_first := [256]u8{
	0x00..=0x7f = 0xf0, // ascii, size 1
	0x80..=0xc1 = 0xf1, // invalid, size 1
	0xc2..=0xdf = 0x02, // accept 0, size 2
	0xe0 = 0x13, // accept 1, size 3
	0xe1..=0xec = 0x03, // accept 0, size 3
	0xed = 0x23, // accept 2, size 3
	0xee..=0xef = 0x03, // accept 0, size 3
	0xf0 = 0x34, // accept 3, size 4
	0xf1..=0xf3 = 0x04, // accept 0, size 4
	0xf4 = 0x44, // accept 4, size 4
	0xf5..=0xff = 0xf1, // invalid, size 1
}