package utf8

// Rune returned by the decoders in this package when the input is not valid UTF-8.
RUNE_ERROR :: '\ufffd'
// Bytes below RUNE_SELF encode themselves as a single byte (ASCII).
RUNE_SELF  :: 0x80
RUNE_BOM   :: 0xfeff
RUNE_EOF   :: ~rune(0)
// Largest rune accepted by valid_rune and rune_size.
MAX_RUNE   :: '\U0010ffff'
// Maximum number of bytes encode_rune produces for a single rune.
UTF_MAX    :: 4

// Inclusive bounds of the surrogate range, which is rejected by valid_rune and rune_size.
SURROGATE_MIN :: 0xd800
SURROGATE_MAX :: 0xdfff

T1 :: 0b0000_0000
TX :: 0b1000_0000
T2 :: 0b1100_0000
T3 :: 0b1110_0000
T4 :: 0b1111_0000
T5 :: 0b1111_1000

// Payload masks: MASKX for continuation bytes, MASK2/3/4 for the lead byte of a 2/3/4-byte sequence.
MASKX :: 0b0011_1111
MASK2 :: 0b0001_1111
MASK3 :: 0b0000_1111
MASK4 :: 0b0000_0111

RUNE1_MAX :: 1<<7 - 1
RUNE2_MAX :: 1<<11 - 1
RUNE3_MAX :: 1<<16 - 1

// The default lowest and highest continuation byte.
LOCB :: 0b1000_0000
HICB :: 0b1011_1111

// Inclusive range that the second byte of a multi-byte sequence must fall in.
Accept_Range :: struct {lo, hi: u8}

// Indexed by the high nibble of an accept_sizes / _first entry.
accept_ranges := [5]Accept_Range{
	{0x80, 0xbf},
	{0xa0, 0xbf},
	{0x80, 0x9f},
	{0x90, 0xbf},
	{0x80, 0x8f},
}

// Per-lead-byte information. The low 3 bits hold the sequence size and the high
// nibble is an index into accept_ranges. 0xf0 marks ASCII and 0xf1 marks an
// invalid lead byte.
accept_sizes := [256]u8{
	0x00..=0x7f = 0xf0,
	0x80..=0xc1 = 0xf1,
	0xc2..=0xdf = 0x02,
	0xe0        = 0x13,
	0xe1..=0xec = 0x03,
	0xed        = 0x23,
	0xee..=0xef = 0x03,
	0xf0        = 0x34,
	0xf1..=0xf3 = 0x04,
	0xf4        = 0x44,
	0xf5..=0xff = 0xf1,
}

// encode_rune encodes the rune c as UTF-8.
// Returns a 4-byte buffer and the number of leading bytes of it that are used.
// Runes that are out of range (above 0x10ffff, or negative) or that lie in the
// surrogate range are encoded as RUNE_ERROR (3 bytes: 0xef 0xbf 0xbd).
encode_rune :: proc(c: rune) -> ([4]u8, int) {
	r := c

	buf: [4]u8
	i := u32(r)
	mask :: u8(0x3f)
	if i <= 1<<7-1 {
		buf[0] = u8(r)
		return buf, 1
	}
	if i <= 1<<11-1 {
		buf[0] = 0xc0 | u8(r>>6)
		buf[1] = 0x80 | u8(r) & mask
		return buf, 2
	}

	// Invalid or Surrogate range
	if i > 0x0010ffff || (0xd800 <= i && i <= 0xdfff) {
		r = RUNE_ERROR
		// Keep `i` in sync with `r`; otherwise an out-of-range input would fall
		// through to the 4-byte branch and produce an overlong encoding.
		i = u32(r)
	}

	if i <= 1<<16-1 {
		buf[0] = 0xe0 | u8(r>>12)
		buf[1] = 0x80 | u8(r>>6) & mask
		buf[2] = 0x80 | u8(r)    & mask
		return buf, 3
	}

	buf[0] = 0xf0 | u8(r>>18)
	buf[1] = 0x80 | u8(r>>12) & mask
	buf[2] = 0x80 | u8(r>>6)  & mask
	buf[3] = 0x80 | u8(r)     & mask
	return buf, 4
}

// decode_rune_in_string is decode_rune applied to the bytes of s.
decode_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
	return decode_rune(transmute([]u8)s)
}

// decode_rune decodes the first rune in s and returns it with its width in bytes.
// Returns (RUNE_ERROR, 0) when s is empty and (RUNE_ERROR, 1) when the leading
// bytes are not a valid, complete encoding.
decode_rune :: proc(s: []u8) -> (rune, int) {
	n := len(s)
	if n < 1 {
		return RUNE_ERROR, 0
	}
	s0 := s[0]
	x := accept_sizes[s0]
	if x >= 0xF0 {
		// x is 0xf0 (ASCII) or 0xf1 (invalid). Moving bit 0 into the top bit and
		// shifting it back yields a mask with all bits clear or all bits set.
		mask := rune(x) << 31 >> 31
		return rune(s[0])&~mask | RUNE_ERROR&mask, 1
	}
	sz := x & 7
	accept := accept_ranges[x>>4]
	if n < int(sz) {
		return RUNE_ERROR, 1
	}
	b1 := s[1]
	if b1 < accept.lo || accept.hi < b1 {
		return RUNE_ERROR, 1
	}
	if sz == 2 {
		return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2
	}
	b2 := s[2]
	if b2 < LOCB || HICB < b2 {
		return RUNE_ERROR, 1
	}
	if sz == 3 {
		return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3
	}
	b3 := s[3]
	if b3 < LOCB || HICB < b3 {
		return RUNE_ERROR, 1
	}
	return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4
}

// string_to_runes returns a newly allocated slice holding every rune of s, in order.
// The slice is allocated with `allocator`.
string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) {
	n := rune_count_in_string(s)

	runes = make([]rune, n, allocator)
	i := 0
	for r in s {
		runes[i] = r
		i += 1
	}
	return
}

// runes_to_string returns a newly allocated string holding the UTF-8 encoding
// (as produced by encode_rune) of every rune in `runes`, in order.
// The backing bytes are allocated with `allocator`.
runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string {
	// First pass: measure, so that a single allocation of the exact size suffices.
	byte_count := 0
	for r in runes {
		_, w := encode_rune(r)
		byte_count += w
	}

	bytes := make([]byte, byte_count, allocator)
	offset := 0
	for r in runes {
		b, w := encode_rune(r)
		copy(bytes[offset:], b[:w])
		offset += w
	}

	return string(bytes)
}

// decode_last_rune_in_string is decode_last_rune applied to the bytes of s.
decode_last_rune_in_string :: #force_inline proc(s: string) -> (rune, int) {
	return decode_last_rune(transmute([]u8)s)
}

// decode_last_rune decodes the last rune in s and returns it with its width in bytes.
// Returns (RUNE_ERROR, 0) when s is empty and (RUNE_ERROR, 1) when the trailing
// bytes do not form a valid encoding that ends exactly at the end of s.
decode_last_rune :: proc(s: []u8) -> (rune, int) {
	r: rune
	size: int
	start, end, limit: int

	end = len(s)
	if end == 0 {
		return RUNE_ERROR, 0
	}
	start = end-1
	r = rune(s[start])
	if r < RUNE_SELF {
		return r, 1
	}

	// Walk backwards, at most UTF_MAX bytes from the end, looking for a byte
	// that is not a continuation byte.
	limit = max(end - UTF_MAX, 0)

	for start-=1; start >= limit; start-=1 {
		if rune_start(s[start]) {
			break
		}
	}

	start = max(start, 0)
	r, size = decode_rune(s[start:end])
	if start+size != end {
		return RUNE_ERROR, 1
	}
	return r, size
}

// rune_at_pos returns the rune at rune index `pos` in s, or RUNE_ERROR when
// `pos` is negative or past the last rune.
rune_at_pos :: proc(s: string, pos: int) -> rune {
	if pos < 0 {
		return RUNE_ERROR
	}

	i := 0
	for r in s {
		if i == pos {
			return r
		}
		i += 1
	}
	return RUNE_ERROR
}

// rune_string_at_pos returns the substring of s holding the bytes of the rune
// at rune index `pos`, or "" when `pos` is negative or past the last rune.
rune_string_at_pos :: proc(s: string, pos: int) -> string {
	if pos < 0 {
		return ""
	}

	i := 0
	for _, offset in s {
		if i == pos {
			// Measure the width from the bytes actually present in s. An invalid
			// byte decodes as RUNE_ERROR with width 1, whereas rune_size(RUNE_ERROR)
			// is 3, which would slice too far.
			_, w := decode_rune_in_string(s[offset:])
			return s[offset:][:w]
		}
		i += 1
	}
	return ""
}

// rune_at decodes the rune starting at byte offset `byte_index` in s.
// `byte_index` is used directly as a slice bound and is not range-checked here.
rune_at :: proc(s: string, byte_index: int) -> rune {
	r, _ := decode_rune_in_string(s[byte_index:])
	return r
}

// Returns the byte position of rune at position pos in s with an optional start byte position.
// Returns -1 if it runs out of the string.
rune_offset :: proc(s: string, pos: int, start: int = 0) -> int {
	if pos < 0 {
		return -1
	}

	i := 0
	for _, offset in s[start:] {
		if i == pos {
			return offset+start
		}
		i += 1
	}
	return -1
}

// valid_rune reports whether r is non-negative, not in the surrogate range,
// and no greater than MAX_RUNE.
valid_rune :: proc(r: rune) -> bool {
	if r < 0 {
		return false
	} else if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
		return false
	} else if r > MAX_RUNE {
		return false
	}
	return true
}

// valid_string reports whether s consists entirely of valid UTF-8 sequences.
valid_string :: proc(s: string) -> bool {
	n := len(s)
	for i := 0; i < n; {
		si := s[i]
		if si < RUNE_SELF { // ascii
			i += 1
			continue
		}
		x := accept_sizes[si]
		if x == 0xf1 {
			return false
		}
		size := int(x & 7)
		if i+size > n {
			return false
		}
		ar := accept_ranges[x>>4]
		if b := s[i+1]; b < ar.lo || ar.hi < b {
			return false
		} else if size == 2 {
			// Okay
		} else if c := s[i+2]; c < 0x80 || 0xbf < c {
			return false
		} else if size == 3 {
			// Okay
		} else if d := s[i+3]; d < 0x80 || 0xbf < d {
			// Both bounds must be checked on the fourth byte itself.
			return false
		}
		i += size
	}
	return true
}

// rune_start reports whether b is not a continuation byte (0b10xx_xxxx).
rune_start :: #force_inline proc(b: u8) -> bool {
	return b&0xc0 != 0x80
}

// rune_count_in_string is rune_count applied to the bytes of s.
rune_count_in_string :: #force_inline proc(s: string) -> int {
	return rune_count(transmute([]u8)s)
}

// rune_count returns the number of runes in s. Every byte that does not begin
// a valid, complete sequence is counted as one rune of width 1.
rune_count :: proc(s: []u8) -> int {
	count := 0
	n := len(s)

	for i := 0; i < n; {
		// Exactly one rune is counted per iteration, whichever path is taken.
		defer count += 1
		si := s[i]
		if si < RUNE_SELF { // ascii
			i += 1
			continue
		}
		x := accept_sizes[si]
		if x == 0xf1 {
			i += 1
			continue
		}
		size := int(x & 7)
		if i+size > n {
			i += 1
			continue
		}
		ar := accept_ranges[x>>4]
		if b := s[i+1]; b < ar.lo || ar.hi < b {
			size = 1
		} else if size == 2 {
			// Okay
		} else if c := s[i+2]; c < 0x80 || 0xbf < c {
			size = 1
		} else if size == 3 {
			// Okay
		} else if d := s[i+3]; d < 0x80 || 0xbf < d {
			size = 1
		}
		i += size
	}
	return count
}

// rune_size returns the number of bytes needed to encode r, or -1 when r is
// negative, in the surrogate range, or greater than MAX_RUNE.
rune_size :: proc(r: rune) -> int {
	switch {
	case r < 0:          return -1
	case r <= 1<<7  - 1: return 1
	case r <= 1<<11 - 1: return 2
	case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1
	case r <= 1<<16 - 1: return 3
	case r <= MAX_RUNE:  return 4
	}
	return -1
}

// full_rune reports if the bytes in b begin with a full utf-8 encoding of a rune or not
// An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR)
full_rune :: proc(b: []byte) -> bool {
	n := len(b)
	if n == 0 {
		return false
	}
	x := _first[b[0]]
	if n >= int(x & 7) {
		// ASCII, an invalid lead byte, or enough bytes for the announced size.
		return true
	}

	// Fewer bytes than announced: still "full" if what is present is already invalid.
	accept := accept_ranges[x>>4]
	if n > 1 && (b[1] < accept.lo || accept.hi < b[1]) {
		return true
	} else if n > 2 && (b[2] < LOCB || HICB < b[2]) {
		return true
	}
	return false
}

// full_rune_in_string reports if the bytes in s begin with a full utf-8 encoding of a rune or not
// An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR)
full_rune_in_string :: proc(s: string) -> bool {
	return full_rune(transmute([]byte)s)
}

// Lead-byte table used by full_rune; holds the same values as accept_sizes.
// "accept N" is the index into accept_ranges (high nibble), "size" is the low 3 bits.
_first := [256]u8{
	0x00..=0x7f = 0xf0, // ascii, size 1
	0x80..=0xc1 = 0xf1, // invalid, size 1
	0xc2..=0xdf = 0x02, // accept 0, size 2
	0xe0        = 0x13, // accept 1, size 3
	0xe1..=0xec = 0x03, // accept 0, size 3
	0xed        = 0x23, // accept 2, size 3
	0xee..=0xef = 0x03, // accept 0, size 3
	0xf0        = 0x34, // accept 3, size 4
	0xf1..=0xf3 = 0x04, // accept 0, size 4
	0xf4        = 0x44, // accept 4, size 4
	0xf5..=0xff = 0xf1, // invalid, size 1
}