utf8.odin 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. package utf8
  // ---- Special rune values -------------------------------------------------

  RUNE_ERROR :: '\ufffd'     // U+FFFD, handed back by the decoders for malformed input
  RUNE_SELF  :: 0x80         // bytes below this value stand for themselves (single-byte runes)
  RUNE_BOM   :: 0xfeff       // U+FEFF
  RUNE_EOF   :: ~rune(0)     // every bit set
  MAX_RUNE   :: '\U0010ffff' // highest rune accepted by valid_rune
  UTF_MAX    :: 4            // longest encoding of a single rune, in bytes

  // ---- Surrogate range -----------------------------------------------------
  //
  // High/leading surrogates occupy SURROGATE_MIN..SURROGATE_HIGH_MAX,
  // low/trailing surrogates occupy SURROGATE_LOW_MIN..SURROGATE_MAX.

  SURROGATE_MIN      :: 0xd800
  SURROGATE_HIGH_MAX :: 0xdbff
  SURROGATE_LOW_MIN  :: 0xdc00
  SURROGATE_MAX      :: 0xdfff

  // ---- Byte tag patterns ---------------------------------------------------

  T1 :: 0b0000_0000
  TX :: 0b1000_0000
  T2 :: 0b1100_0000
  T3 :: 0b1110_0000
  T4 :: 0b1111_0000
  T5 :: 0b1111_1000

  // ---- Payload masks -------------------------------------------------------
  //
  // MASKX selects the payload bits of a continuation byte; MASK2, MASK3 and
  // MASK4 select the payload bits of the leading byte of a 2-, 3- and 4-byte
  // sequence respectively.

  MASKX :: 0b0011_1111
  MASK2 :: 0b0001_1111
  MASK3 :: 0b0000_1111
  MASK4 :: 0b0000_0111

  // ---- Largest rune that fits in 1, 2 and 3 bytes --------------------------

  RUNE1_MAX :: (1 << 7) - 1
  RUNE2_MAX :: (1 << 11) - 1
  RUNE3_MAX :: (1 << 16) - 1

  // ---- Default lowest and highest continuation byte ------------------------

  LOCB :: 0b1000_0000
  HICB :: 0b1011_1111
  // Accept_Range is an inclusive byte window [lo, hi]. The decoders use it to
  // bound the second byte of a multi-byte sequence.
  Accept_Range :: struct {
      lo: u8,
      hi: u8,
  }

  // accept_ranges is indexed by the high nibble of an accept_sizes entry.
  accept_ranges := [5]Accept_Range{
      {lo = 0x80, hi = 0xbf},
      {lo = 0xa0, hi = 0xbf},
      {lo = 0x80, hi = 0x9f},
      {lo = 0x90, hi = 0xbf},
      {lo = 0x80, hi = 0x8f},
  }
  // accept_sizes classifies every possible first byte of a sequence.
  //
  // For entries below 0xf0:
  //   - the low three bits (`x & 7`) hold the sequence length in bytes,
  //   - the high nibble (`x >> 4`) is the index into accept_ranges that bounds
  //     the second byte.
  // Entries at or above 0xf0 denote a one-byte result: 0xf0 marks an ASCII byte
  // and 0xf1 marks a byte that can never start a valid sequence.
  accept_sizes := [256]u8{
      0x00..=0x7f = 0xf0, // ascii, size 1
      0x80..=0xc1 = 0xf1, // invalid, size 1
      0xc2..=0xdf = 0x02, // accept 0, size 2
      0xe0        = 0x13, // accept 1, size 3
      0xe1..=0xec = 0x03, // accept 0, size 3
      0xed        = 0x23, // accept 2, size 3
      0xee..=0xef = 0x03, // accept 0, size 3
      0xf0        = 0x34, // accept 3, size 4
      0xf1..=0xf3 = 0x04, // accept 0, size 4
      0xf4        = 0x44, // accept 4, size 4
      0xf5..=0xff = 0xf1, // invalid, size 1
  }
  // encode_rune encodes the rune `c` as UTF-8.
  //
  // Returns a 4-byte buffer together with the number of leading bytes of that
  // buffer that make up the encoding (1 to 4); the remaining bytes are zero.
  //
  // Runes that cannot be encoded (surrogates in 0xd800..0xdfff, values above
  // 0x10ffff, and negative values, which appear as large numbers once
  // reinterpreted as u32) are encoded as U+FFFD, i.e. the three bytes
  // EF BF BD.
  encode_rune :: proc "contextless" (c: rune) -> ([4]u8, int) {
      mask :: u8(0x3f) // payload bits of a continuation byte

      r := c
      buf: [4]u8
      i := u32(r)

      if i <= 1<<7-1 {
          buf[0] = u8(r)
          return buf, 1
      }
      if i <= 1<<11-1 {
          buf[0] = 0xc0 | u8(r>>6)
          buf[1] = 0x80 | u8(r) & mask
          return buf, 2
      }

      // Invalid or surrogate range: substitute the replacement character.
      // `i` must be updated together with `r`; otherwise an out-of-range input
      // would still select the 4-byte branch below and emit an overlong
      // encoding of U+FFFD.
      if i > 0x0010ffff || (0xd800 <= i && i <= 0xdfff) {
          r = 0xfffd
          i = 0xfffd
      }

      if i <= 1<<16-1 {
          buf[0] = 0xe0 | u8(r>>12)
          buf[1] = 0x80 | u8(r>>6) & mask
          buf[2] = 0x80 | u8(r) & mask
          return buf, 3
      }

      buf[0] = 0xf0 | u8(r>>18)
      buf[1] = 0x80 | u8(r>>12) & mask
      buf[2] = 0x80 | u8(r>>6) & mask
      buf[3] = 0x80 | u8(r) & mask
      return buf, 4
  }
  // decode_rune decodes the first rune of a string or of a byte slice,
  // dispatching on the argument type.
  decode_rune :: proc{
      decode_rune_in_string,
      decode_rune_in_bytes,
  }

  // decode_rune_in_string decodes the first rune of `s` by reinterpreting the
  // string as bytes and deferring to decode_rune_in_bytes. Returns the rune and
  // the width reported by that procedure.
  decode_rune_in_string :: #force_inline proc "contextless" (s: string) -> (rune, int) {
      raw := transmute([]u8)s
      r, width := decode_rune_in_bytes(raw)
      return r, width
  }
  // decode_rune_in_bytes decodes the first UTF-8 sequence in `s`.
  //
  // Returns the rune and the number of bytes it occupies:
  //   - (RUNE_ERROR, 0) when `s` is empty,
  //   - (RUNE_ERROR, 1) when the leading byte is invalid, the sequence is
  //     truncated, or a following byte is outside its permitted range,
  //   - (rune, width) otherwise, with width in 1..4.
  decode_rune_in_bytes :: proc "contextless" (s: []u8) -> (rune, int) {
      avail := len(s)
      if avail == 0 {
          return RUNE_ERROR, 0
      }

      lead := s[0]
      info := accept_sizes[lead]

      // Entries >= 0xF0 are one-byte results; the lowest bit tells an invalid
      // leading byte (0xf1) apart from plain ASCII (0xf0).
      if info >= 0xF0 {
          if info & 1 != 0 {
              return RUNE_ERROR, 1
          }
          return rune(lead), 1
      }

      width := int(info & 7)
      if avail < width {
          return RUNE_ERROR, 1
      }

      // The second byte has a lead-dependent window.
      window := accept_ranges[info >> 4]
      second := s[1]
      if !(window.lo <= second && second <= window.hi) {
          return RUNE_ERROR, 1
      }
      if width == 2 {
          return rune(lead & MASK2) << 6 | rune(second & MASKX), 2
      }

      third := s[2]
      if !(LOCB <= third && third <= HICB) {
          return RUNE_ERROR, 1
      }
      if width == 3 {
          return rune(lead & MASK3) << 12 | rune(second & MASKX) << 6 | rune(third & MASKX), 3
      }

      fourth := s[3]
      if !(LOCB <= fourth && fourth <= HICB) {
          return RUNE_ERROR, 1
      }
      return rune(lead & MASK4) << 18 | rune(second & MASKX) << 12 | rune(third & MASKX) << 6 | rune(fourth & MASKX), 4
  }
  // string_to_runes returns the runes of `s` as a freshly made slice.
  //
  // The slice is sized with rune_count_in_string and created with `allocator`;
  // it is then filled by iterating over the string rune by rune.
  string_to_runes :: proc "odin" (s: string, allocator := context.allocator) -> (runes: []rune) {
      runes = make([]rune, rune_count_in_string(s), allocator)

      next := 0
      for ch in s {
          runes[next] = ch
          next += 1
      }
      return runes
  }
  // runes_to_string encodes `runes` into a string backed by memory made with
  // `allocator`.
  //
  // Works in two passes: the first totals the encoded width of every rune so
  // the buffer can be created at its final size, the second writes the
  // encodings back to back.
  runes_to_string :: proc "odin" (runes: []rune, allocator := context.allocator) -> string {
      total := 0
      for r in runes {
          _, width := encode_rune(r)
          total += width
      }

      out := make([]byte, total, allocator)

      cursor := 0
      for r in runes {
          encoded, width := encode_rune(r)
          copy(out[cursor:], encoded[:width])
          cursor += width
      }
      return string(out)
  }
  // decode_last_rune decodes the final rune of a string or of a byte slice,
  // dispatching on the argument type.
  decode_last_rune :: proc{
      decode_last_rune_in_string,
      decode_last_rune_in_bytes,
  }

  // decode_last_rune_in_string decodes the final rune of `s` by reinterpreting
  // the string as bytes and deferring to decode_last_rune_in_bytes.
  decode_last_rune_in_string :: #force_inline proc "contextless" (s: string) -> (rune, int) {
      raw := transmute([]u8)s
      r, width := decode_last_rune_in_bytes(raw)
      return r, width
  }
  // decode_last_rune_in_bytes decodes the UTF-8 sequence that ends `s`.
  //
  // Returns the rune and its width in bytes:
  //   - (RUNE_ERROR, 0) when `s` is empty,
  //   - (byte, 1) when the final byte is below RUNE_SELF,
  //   - (RUNE_ERROR, 1) when no sequence ends exactly at the end of `s`,
  //   - (rune, width) otherwise.
  decode_last_rune_in_bytes :: proc "contextless" (s: []u8) -> (rune, int) {
      end := len(s)
      if end == 0 {
          return RUNE_ERROR, 0
      }

      final := s[end-1]
      if final < RUNE_SELF {
          return rune(final), 1
      }

      // Walk backwards, at most UTF_MAX bytes from the end, looking for a byte
      // that can begin a sequence.
      limit := max(end - UTF_MAX, 0)
      start := end - 2
      for start >= limit && !rune_start(s[start]) {
          start -= 1
      }
      start = max(start, 0)

      // The decoded sequence only counts if it reaches exactly to the end.
      r, size := decode_rune(s[start:end])
      if start + size != end {
          return RUNE_ERROR, 1
      }
      return r, size
  }
  // rune_at_pos returns the rune at rune index `pos` of `s` (counting runes, not
  // bytes). Returns RUNE_ERROR when `pos` is negative or past the last rune.
  rune_at_pos :: proc "contextless" (s: string, pos: int) -> rune {
      if pos >= 0 {
          remaining := pos
          for ch in s {
              if remaining == 0 {
                  return ch
              }
              remaining -= 1
          }
      }
      return RUNE_ERROR
  }
  // rune_string_at_pos returns the substring of `s` holding the rune at rune
  // index `pos` (counting runes, not bytes).
  //
  // Returns "" when `pos` is negative or past the last rune.
  //
  // The width of the substring is taken from decoding the bytes at the rune's
  // offset rather than from the size of the rune value. For a malformed byte
  // the iteration yields RUNE_ERROR, whose encoded size is 3, while only one
  // byte of input was consumed; sizing from the rune value would therefore
  // return the wrong bytes or slice past the end of the string.
  rune_string_at_pos :: proc "contextless" (s: string, pos: int) -> string {
      if pos < 0 {
          return ""
      }

      i := 0
      for _, offset in s {
          if i == pos {
              rest := s[offset:]
              _, w := decode_rune_in_string(rest)
              return rest[:w]
          }
          i += 1
      }
      return ""
  }
  // rune_at decodes the rune that begins at byte offset `byte_index` of `s`.
  // The result is whatever decode_rune_in_string reports for the remainder of
  // the string from that offset.
  rune_at :: proc "contextless" (s: string, byte_index: int) -> rune {
      tail := s[byte_index:]
      decoded, _ := decode_rune_in_string(tail)
      return decoded
  }
  // rune_offset returns the byte offset, measured from the beginning of `s`, of
  // the rune at rune index `pos`, where runes are counted from byte position
  // `start` (0 by default).
  // Returns -1 when `pos` is negative or lies beyond the last rune.
  rune_offset :: proc "contextless" (s: string, pos: int, start: int = 0) -> int {
      if pos >= 0 {
          seen := 0
          for _, rel in s[start:] {
              if seen == pos {
                  return start + rel
              }
              seen += 1
          }
      }
      return -1
  }
  // valid_rune reports whether `r` lies in 0..MAX_RUNE and is not in the
  // surrogate range SURROGATE_MIN..SURROGATE_MAX.
  valid_rune :: proc "contextless" (r: rune) -> bool {
      in_range     := 0 <= r && r <= MAX_RUNE
      is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX
      return in_range && !is_surrogate
  }
  // valid_string reports whether `s` consists entirely of well-formed UTF-8
  // sequences. The empty string is valid.
  valid_string :: proc "contextless" (s: string) -> bool {
      n := len(s)
      for i := 0; i < n; {
          si := s[i]
          if si < RUNE_SELF { // ascii
              i += 1
              continue
          }

          x := accept_sizes[si]
          if x == 0xf1 {
              // This byte can never begin a sequence.
              return false
          }

          size := int(x & 7)
          if i+size > n {
              // Sequence is cut off by the end of the string.
              return false
          }

          // The second byte has a lead-dependent window; the third and fourth
          // must be ordinary continuation bytes (0x80..0xbf).
          ar := accept_ranges[x>>4]
          if b := s[i+1]; b < ar.lo || ar.hi < b {
              return false
          } else if size == 2 {
              // Okay
          } else if c := s[i+2]; c < 0x80 || 0xbf < c {
              return false
          } else if size == 3 {
              // Okay
          } else if d := s[i+3]; d < 0x80 || 0xbf < d {
              // Both bounds are checked against the fourth byte `d`.
              return false
          }
          i += size
      }
      return true
  }
  // rune_start reports whether `b` could be the first byte of an encoded rune,
  // i.e. its two most significant bits are anything other than `10`.
  rune_start :: #force_inline proc "contextless" (b: u8) -> bool {
      top_two := b >> 6
      return top_two != 0b10
  }
  // rune_count returns the number of runes in a string or in a byte slice,
  // dispatching on the argument type.
  rune_count :: proc{
      rune_count_in_string,
      rune_count_in_bytes,
  }

  // rune_count_in_string returns the number of runes in `s` by reinterpreting
  // the string as bytes and deferring to rune_count_in_bytes.
  rune_count_in_string :: #force_inline proc(s: string) -> int {
      raw := transmute([]u8)s
      return rune_count_in_bytes(raw)
  }
  // rune_count_in_bytes returns the number of runes in `s`.
  //
  // Every well-formed sequence counts as one rune. Any byte that does not begin
  // a well-formed sequence (an invalid leading byte, a truncated sequence, or a
  // sequence with an out-of-range following byte) also counts as one rune and
  // advances the scan by a single byte.
  rune_count_in_bytes :: proc "contextless" (s: []u8) -> int {
      total := len(s)
      runes := 0

      pos := 0
      for pos < total {
          runes += 1

          // Unless a complete, well-formed multi-byte sequence is found, the
          // scan moves forward by exactly one byte.
          step := 1

          lead := s[pos]
          if lead >= RUNE_SELF {
              info := accept_sizes[lead]
              width := int(info & 7)

              if info != 0xf1 && pos + width <= total {
                  window := accept_ranges[info >> 4]

                  // Second byte: lead-dependent window.
                  second := s[pos + 1]
                  ok := window.lo <= second && second <= window.hi

                  // Remaining bytes: ordinary continuation bytes.
                  for k := 2; ok && k < width; k += 1 {
                      cont := s[pos + k]
                      ok = 0x80 <= cont && cont <= 0xbf
                  }

                  if ok {
                      step = width
                  }
              }
          }

          pos += step
      }
      return runes
  }
  // rune_size returns the number of bytes needed to encode `r` as UTF-8
  // (1 to 4), or -1 when `r` is negative, a surrogate, or above MAX_RUNE.
  rune_size :: proc "contextless" (r: rune) -> int {
      if r < 0 {
          return -1
      }
      if r <= 1<<7 - 1 {
          return 1
      }
      if r <= 1<<11 - 1 {
          return 2
      }
      if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
          return -1
      }
      if r <= 1<<16 - 1 {
          return 3
      }
      if r <= MAX_RUNE {
          return 4
      }
      return -1
  }
  // full_rune reports whether its argument (bytes or string) starts with a
  // complete UTF-8 encoding of a rune.
  // An invalid encoding counts as complete, because it decodes to an error rune
  // of width 1 (RUNE_ERROR).
  full_rune :: proc{
      full_rune_in_bytes,
      full_rune_in_string,
  }

  // full_rune_in_bytes reports whether `b` starts with a complete UTF-8
  // encoding of a rune. Returns false for an empty slice.
  // An invalid encoding counts as complete, because it decodes to an error rune
  // of width 1 (RUNE_ERROR).
  full_rune_in_bytes :: proc "contextless" (b: []byte) -> bool {
      avail := len(b)
      if avail == 0 {
          return false
      }

      info := accept_sizes[b[0]]
      needed := int(info & 7)
      if avail >= needed {
          return true
      }

      // Fewer bytes than the leading byte announces: the prefix is only
      // "full" if one of the bytes already present makes it invalid.
      window := accept_ranges[info >> 4]
      second_bad := avail > 1 && !(window.lo <= b[1] && b[1] <= window.hi)
      third_bad  := avail > 2 && !(LOCB <= b[2] && b[2] <= HICB)
      return second_bad || third_bad
  }
  // full_rune_in_string reports whether `s` starts with a complete UTF-8
  // encoding of a rune, by reinterpreting the string as bytes and deferring to
  // full_rune_in_bytes.
  // An invalid encoding counts as complete, because it decodes to an error rune
  // of width 1 (RUNE_ERROR).
  full_rune_in_string :: proc "contextless" (s: string) -> bool {
      raw := transmute([]byte)s
      return full_rune_in_bytes(raw)
  }
  357. }