123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415 |
- package utf8
// Special rune values.
RUNE_ERROR :: '\ufffd'     // returned by the decoders in this file for malformed input
RUNE_SELF  :: 0x80         // bytes below this value are handled as single-byte runes
RUNE_BOM   :: 0xfeff
RUNE_EOF   :: ~rune(0)     // every bit set
MAX_RUNE   :: '\U0010ffff' // largest rune accepted by valid_rune / rune_size

// Maximum number of bytes looked at for a single encoded rune.
UTF_MAX :: 4

// Inclusive bounds of the surrogate range, which is rejected as invalid.
SURROGATE_MIN :: 0xd800
SURROGATE_MAX :: 0xdfff

// Tag bit patterns.
T1 :: 0b0000_0000
TX :: 0b1000_0000
T2 :: 0b1100_0000
T3 :: 0b1110_0000
T4 :: 0b1111_0000
T5 :: 0b1111_1000

// Payload masks: MASKX for continuation bytes, MASK2..MASK4 for the
// lead byte of 2-, 3- and 4-byte sequences.
MASKX :: 0b0011_1111
MASK2 :: 0b0001_1111
MASK3 :: 0b0000_1111
MASK4 :: 0b0000_0111

// Largest values representable in 1, 2 and 3 encoded bytes.
RUNE1_MAX :: 1<<7 - 1
RUNE2_MAX :: 1<<11 - 1
RUNE3_MAX :: 1<<16 - 1

// The default lowest and highest continuation byte.
LOCB :: 0b1000_0000
HICB :: 0b1011_1111
// Accept_Range is the inclusive range [lo, hi] permitted for the second
// byte of a multi-byte sequence.
Accept_Range :: struct {
	lo: u8,
	hi: u8,
}

// accept_ranges is indexed by the high nibble of an accept_sizes / _first entry.
accept_ranges := [5]Accept_Range{
	{lo = 0x80, hi = 0xbf}, // 0: any continuation byte
	{lo = 0xa0, hi = 0xbf}, // 1: used for lead byte 0xe0
	{lo = 0x80, hi = 0x9f}, // 2: used for lead byte 0xed
	{lo = 0x90, hi = 0xbf}, // 3: used for lead byte 0xf0
	{lo = 0x80, hi = 0x8f}, // 4: used for lead byte 0xf4
}
// accept_sizes classifies every possible first byte.
// For multi-byte leads the low three bits are the sequence length and the
// high nibble is the index into accept_ranges for the second byte.
// 0xf0 marks a single-byte (ASCII) rune and 0xf1 marks an invalid lead byte.
accept_sizes := [256]u8{
	0x00..=0x7f = 0xf0, // single byte
	0x80..=0xc1 = 0xf1, // invalid lead
	0xc2..=0xdf = 0x02, // range 0, 2 bytes
	0xe0        = 0x13, // range 1, 3 bytes
	0xe1..=0xec = 0x03, // range 0, 3 bytes
	0xed        = 0x23, // range 2, 3 bytes
	0xee..=0xef = 0x03, // range 0, 3 bytes
	0xf0        = 0x34, // range 3, 4 bytes
	0xf1..=0xf3 = 0x04, // range 0, 4 bytes
	0xf4        = 0x44, // range 4, 4 bytes
	0xf5..=0xff = 0xf1, // invalid lead
}
// encode_rune encodes `c` as UTF-8.
//
// Returns a 4-byte buffer together with the number of leading bytes that
// were written (1 to 4); the remaining bytes of the buffer are zero.
//
// Values that cannot be encoded (negative runes, runes above U+10FFFF and
// the surrogate range U+D800..U+DFFF) are encoded as U+FFFD, i.e. the three
// bytes EF BF BD.
encode_rune :: proc(c: rune) -> ([4]u8, int) {
	r := c

	buf: [4]u8
	i := u32(r) // unsigned view so negative runes land in the "too large" case
	mask :: u8(0x3f)

	if i <= 1<<7-1 {
		buf[0] = u8(r)
		return buf, 1
	}
	if i <= 1<<11-1 {
		buf[0] = 0xc0 | u8(r>>6)
		buf[1] = 0x80 | u8(r) & mask
		return buf, 2
	}

	// Invalid or Surrogate range
	if i > 0x0010ffff || (0xd800 <= i && i <= 0xdfff) {
		r = 0xfffd
		// Keep the size selection below in step with the replacement rune;
		// otherwise an out-of-range input would take the 4-byte branch and
		// emit an overlong encoding of U+FFFD.
		i = 0xfffd
	}

	if i <= 1<<16-1 {
		buf[0] = 0xe0 | u8(r>>12)
		buf[1] = 0x80 | u8(r>>6) & mask
		buf[2] = 0x80 | u8(r) & mask
		return buf, 3
	}

	buf[0] = 0xf0 | u8(r>>18)
	buf[1] = 0x80 | u8(r>>12) & mask
	buf[2] = 0x80 | u8(r>>6) & mask
	buf[3] = 0x80 | u8(r) & mask
	return buf, 4
}
// decode_rune is the overload set for decoding the first rune of a string or a byte slice.
decode_rune :: proc{decode_rune_in_string, decode_rune_in_bytes}
// decode_rune_in_string decodes the first rune of `s` by reinterpreting the
// string as a byte slice and forwarding to decode_rune_in_bytes.
// Returns the rune and the number of bytes it occupies.
decode_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
	bytes := transmute([]u8)s
	r, width := decode_rune_in_bytes(bytes)
	return r, width
}
// decode_rune_in_bytes decodes the first rune stored in `s`.
//
// Returns the rune and the number of bytes consumed:
//   - (RUNE_ERROR, 0) when `s` is empty,
//   - (RUNE_ERROR, 1) when the leading bytes are not a well-formed sequence
//     or the slice is too short for the announced length,
//   - otherwise the decoded rune and its width (1 to 4).
decode_rune_in_bytes :: proc(s: []u8) -> (rune, int) {
	length := len(s)
	if length < 1 {
		return RUNE_ERROR, 0
	}

	lead := s[0]
	info := accept_sizes[lead]
	if info >= 0xF0 {
		// Single-byte outcome: the low bit of the table entry tells an
		// invalid lead byte (bit set) from a plain one-byte rune (bit clear).
		if info & 1 != 0 {
			return RUNE_ERROR, 1
		}
		return rune(lead), 1
	}

	width := int(info & 7)
	if length < width {
		return RUNE_ERROR, 1
	}

	// The second byte has a lead-dependent range.
	second := accept_ranges[info >> 4]
	c1 := s[1]
	if !(second.lo <= c1 && c1 <= second.hi) {
		return RUNE_ERROR, 1
	}
	if width == 2 {
		return rune(lead & MASK2)<<6 | rune(c1 & MASKX), 2
	}

	c2 := s[2]
	if !(LOCB <= c2 && c2 <= HICB) {
		return RUNE_ERROR, 1
	}
	if width == 3 {
		return rune(lead & MASK3)<<12 | rune(c1 & MASKX)<<6 | rune(c2 & MASKX), 3
	}

	c3 := s[3]
	if !(LOCB <= c3 && c3 <= HICB) {
		return RUNE_ERROR, 1
	}
	return rune(lead & MASK4)<<18 | rune(c1 & MASKX)<<12 | rune(c2 & MASKX)<<6 | rune(c3 & MASKX), 4
}
// string_to_runes decodes `s` into a newly allocated slice holding one
// element per rune. The slice is allocated with `allocator`.
string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) {
	runes = make([]rune, rune_count_in_string(s), allocator)

	next := 0
	for decoded in s {
		runes[next] = decoded
		next += 1
	}
	return runes
}
// runes_to_string encodes `runes` as UTF-8 into a newly allocated string.
// The backing bytes are allocated with `allocator`.
runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string {
	// First pass: measure the encoded length.
	total := 0
	for r in runes {
		_, width := encode_rune(r)
		total += width
	}

	// Second pass: encode each rune into its place.
	out := make([]byte, total, allocator)
	cursor := 0
	for r in runes {
		encoded, width := encode_rune(r)
		copy(out[cursor:], encoded[:width])
		cursor += width
	}
	return string(out)
}
// decode_last_rune is the overload set for decoding the final rune of a string or a byte slice.
decode_last_rune :: proc{decode_last_rune_in_string, decode_last_rune_in_bytes}
// decode_last_rune_in_string decodes the final rune of `s` by reinterpreting
// the string as a byte slice and forwarding to decode_last_rune_in_bytes.
// Returns the rune and the number of bytes it occupies.
decode_last_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
	bytes := transmute([]u8)s
	r, width := decode_last_rune_in_bytes(bytes)
	return r, width
}
// decode_last_rune_in_bytes decodes the rune that ends at the end of `s`.
//
// Returns the rune and its width in bytes:
//   - (RUNE_ERROR, 0) when `s` is empty,
//   - (RUNE_ERROR, 1) when the trailing bytes do not form exactly one
//     well-formed sequence ending at the last byte,
//   - otherwise the decoded rune and its width.
decode_last_rune_in_bytes :: proc(s: []u8) -> (rune, int) {
	end := len(s)
	if end == 0 {
		return RUNE_ERROR, 0
	}

	// Fast path: the final byte is a single-byte rune.
	last := rune(s[end-1])
	if last < RUNE_SELF {
		return last, 1
	}

	// Step backwards over non-start bytes, looking no further than
	// UTF_MAX bytes from the end.
	limit := max(end - UTF_MAX, 0)
	start := end - 2
	for start >= limit && !rune_start(s[start]) {
		start -= 1
	}
	start = max(start, 0)

	// The candidate only counts if its decoded width reaches exactly to `end`.
	decoded, width := decode_rune(s[start:end])
	if start + width != end {
		return RUNE_ERROR, 1
	}
	return decoded, width
}
// rune_at_pos returns the rune at rune index `pos` (0-based, counted in
// runes rather than bytes) of `s`.
// Returns RUNE_ERROR when `pos` is negative or past the last rune.
rune_at_pos :: proc(s: string, pos: int) -> rune {
	if pos >= 0 {
		index := 0
		for current in s {
			if index == pos {
				return current
			}
			index += 1
		}
	}
	return RUNE_ERROR
}
// rune_string_at_pos returns the substring of `s` holding the bytes of the
// rune at rune index `pos` (0-based, counted in runes rather than bytes).
// Returns "" when `pos` is negative or past the last rune.
//
// The width is taken from the iteration offsets rather than from the size of
// the decoded rune: a malformed byte iterates as the error rune but only
// occupies the bytes the iterator actually consumed, so re-encoding size would
// return the wrong bytes or slice past the end of `s`.
rune_string_at_pos :: proc(s: string, pos: int) -> string {
	if pos < 0 {
		return ""
	}

	i := 0
	begin := -1 // byte offset of the requested rune once it has been reached
	for _, offset in s {
		if i == pos {
			begin = offset
		} else if i == pos+1 {
			// The next rune's offset marks the end of the requested one.
			return s[begin:offset]
		}
		i += 1
	}

	// The requested rune was the last one in the string.
	if begin >= 0 {
		return s[begin:]
	}
	return ""
}
// rune_at decodes the rune that starts at byte offset `byte_index` of `s`.
// The decoded width is discarded.
rune_at :: proc(s: string, byte_index: int) -> rune {
	tail := s[byte_index:]
	decoded, _ := decode_rune_in_string(tail)
	return decoded
}
// rune_offset returns the byte offset within `s` of the rune at rune index
// `pos`, counting runes from byte offset `start` (default 0).
// Returns -1 when `pos` is negative or the string ends before that rune.
rune_offset :: proc(s: string, pos: int, start: int = 0) -> int {
	if pos >= 0 {
		seen := 0
		for _, relative in s[start:] {
			if seen == pos {
				// Offsets from the loop are relative to the sliced string.
				return start + relative
			}
			seen += 1
		}
	}
	return -1
}
// valid_rune reports whether `r` lies in 0..=MAX_RUNE and is outside the
// surrogate range SURROGATE_MIN..=SURROGATE_MAX.
valid_rune :: proc(r: rune) -> bool {
	in_range     := 0 <= r && r <= MAX_RUNE
	is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX
	return in_range && !is_surrogate
}
// valid_string reports whether `s` consists entirely of well-formed UTF-8
// sequences. An empty string is valid.
valid_string :: proc(s: string) -> bool {
	n := len(s)
	for i := 0; i < n; {
		si := s[i]
		if si < RUNE_SELF { // ascii
			i += 1
			continue
		}

		x := accept_sizes[si]
		if x == 0xf1 {
			// Byte that can never start a sequence.
			return false
		}

		size := int(x & 7)
		if i+size > n {
			// Sequence is cut off by the end of the string.
			return false
		}

		// The second byte has a lead-dependent range; the third and fourth
		// must be ordinary continuation bytes (0x80..=0xbf).
		ar := accept_ranges[x>>4]
		if b := s[i+1]; b < ar.lo || ar.hi < b {
			return false
		} else if size == 2 {
			// Okay
		} else if c := s[i+2]; c < 0x80 || 0xbf < c {
			return false
		} else if size == 3 {
			// Okay
		} else if d := s[i+3]; d < 0x80 || 0xbf < d {
			// Both bounds are checked on the fourth byte itself.
			return false
		}
		i += size
	}
	return true
}
// rune_start reports whether `b` could begin an encoded rune, i.e. its top
// two bits are anything other than 0b10.
rune_start :: #force_inline proc(b: u8) -> bool {
	top_two_bits := b >> 6
	return top_two_bits != 0b10
}
// rune_count is the overload set for counting the runes in a string or a byte slice.
rune_count :: proc{rune_count_in_string, rune_count_in_bytes}
// rune_count_in_string counts the runes in `s` by reinterpreting the string
// as a byte slice and forwarding to rune_count_in_bytes.
rune_count_in_string :: #force_inline proc(s: string) -> int {
	bytes := transmute([]u8)s
	return rune_count_in_bytes(bytes)
}
// rune_count_in_bytes returns the number of runes in `s`.
// Every byte that is not part of a well-formed sequence counts as one rune
// of width one.
rune_count_in_bytes :: proc(s: []u8) -> int {
	total := 0
	length := len(s)

	for i := 0; i < length; {
		// Every pass through the loop accounts for exactly one rune.
		total += 1

		lead := s[i]
		if lead < RUNE_SELF { // ascii
			i += 1
			continue
		}

		info := accept_sizes[lead]
		if info == 0xf1 {
			// Invalid lead byte: counted on its own.
			i += 1
			continue
		}

		width := int(info & 7)
		if i + width > length {
			// Truncated sequence: count the lead byte on its own.
			i += 1
			continue
		}

		// Second byte has a lead-dependent range; any further bytes must be
		// ordinary continuation bytes.
		second := accept_ranges[info >> 4]
		well_formed := second.lo <= s[i+1] && s[i+1] <= second.hi
		if well_formed && width >= 3 {
			well_formed = 0x80 <= s[i+2] && s[i+2] <= 0xbf
		}
		if well_formed && width >= 4 {
			well_formed = 0x80 <= s[i+3] && s[i+3] <= 0xbf
		}

		if well_formed {
			i += width
		} else {
			i += 1
		}
	}
	return total
}
// rune_size returns the number of bytes needed to encode `r` (1 to 4), or
// -1 when `r` is negative, in the surrogate range, or above MAX_RUNE.
rune_size :: proc(r: rune) -> int {
	if r < 0 {
		return -1
	}
	if r <= 1<<7 - 1 {
		return 1
	}
	if r <= 1<<11 - 1 {
		return 2
	}
	if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
		return -1
	}
	if r <= 1<<16 - 1 {
		return 3
	}
	if r <= MAX_RUNE {
		return 4
	}
	return -1
}
// full_rune reports whether its argument begins with a complete UTF-8 encoding of a rune.
// A malformed encoding counts as complete, because it decodes as an error rune (RUNE_ERROR) of width 1.
full_rune :: proc{full_rune_in_bytes, full_rune_in_string}
// full_rune_in_bytes reports whether `b` begins with a complete UTF-8 encoding of a rune.
// A malformed encoding counts as complete, because it decodes as an error rune (RUNE_ERROR) of width 1.
// Returns false for an empty slice and for a well-formed but truncated prefix.
full_rune_in_bytes :: proc(b: []byte) -> bool {
	available := len(b)
	if available == 0 {
		return false
	}

	info := _first[b[0]]
	if available >= int(info & 7) {
		// Enough bytes are present for whatever the first byte announces.
		return true
	}

	// Fewer bytes than announced: the prefix is only "full" if one of the
	// bytes already present makes the sequence malformed.
	second := accept_ranges[info >> 4]
	second_is_bad := available > 1 && !(second.lo <= b[1] && b[1] <= second.hi)
	third_is_bad  := available > 2 && !(LOCB <= b[2] && b[2] <= HICB)
	return second_is_bad || third_is_bad
}
// full_rune_in_string reports whether `s` begins with a complete UTF-8 encoding of a rune.
// A malformed encoding counts as complete, because it decodes as an error rune (RUNE_ERROR) of width 1.
full_rune_in_string :: proc(s: string) -> bool {
	bytes := transmute([]byte)s
	return full_rune_in_bytes(bytes)
}
// _first classifies every possible first byte.
// For multi-byte leads the low three bits are the sequence length and the
// high nibble is the index into accept_ranges for the second byte.
_first := [256]u8{
	0x00..=0x7f = 0xf0, // ascii, size 1
	0x80..=0xc1 = 0xf1, // invalid, size 1
	0xc2..=0xdf = 0x02, // accept 0, size 2
	0xe0        = 0x13, // accept 1, size 3
	0xe1..=0xec = 0x03, // accept 0, size 3
	0xed        = 0x23, // accept 2, size 3
	0xee..=0xef = 0x03, // accept 0, size 3
	0xf0        = 0x34, // accept 3, size 4
	0xf1..=0xf3 = 0x04, // accept 0, size 4
	0xf4        = 0x44, // accept 4, size 4
	0xf5..=0xff = 0xf1, // invalid, size 1
}
|