123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- RUNE_ERROR :: '\ufffd'
- RUNE_SELF :: 0x80
- RUNE_BOM :: 0xfeff
- RUNE_EOF :: ~(0 as rune)
- MAX_RUNE :: '\U0010ffff'
- UTF_MAX :: 4
- SURROGATE_MIN :: 0xd800
- SURROGATE_MAX :: 0xdfff
- Accept_Range :: struct {
- lo, hi: u8
- }
- accept_ranges := [5]Accept_Range{
- {0x80, 0xbf},
- {0xa0, 0xbf},
- {0x80, 0x9f},
- {0x90, 0xbf},
- {0x80, 0x8f},
- }
- accept_sizes := [256]byte{
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6f
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7f
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8f
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9f
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0-0xaf
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0-0xbf
- 0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0-0xcf
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0-0xdf
- 0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0-0xef
- 0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0-0xff
- }
- encode_rune :: proc(r_: rune) -> ([4]byte, int) {
- r := r_
- buf: [4]byte
- i := r as u32
- mask :: 0x3f as byte
- if i <= 1<<7-1 {
- buf[0] = r as byte
- return buf, 1
- }
- if i <= 1<<11-1 {
- buf[0] = 0xc0 | (r>>6) as byte
- buf[1] = 0x80 | (r) as byte & mask
- return buf, 2
- }
- // Invalid or Surrogate range
- if i > 0x0010ffff ||
- (0xd800 <= i && i <= 0xdfff) {
- r = 0xfffd
- }
- if i <= 1<<16-1 {
- buf[0] = 0xe0 | (r>>12) as byte
- buf[1] = 0x80 | (r>>6) as byte & mask
- buf[2] = 0x80 | (r) as byte & mask
- return buf, 3
- }
- buf[0] = 0xf0 | (r>>18) as byte
- buf[1] = 0x80 | (r>>12) as byte & mask
- buf[2] = 0x80 | (r>>6) as byte & mask
- buf[3] = 0x80 | (r) as byte & mask
- return buf, 4
- }
- decode_rune :: proc(s: string) -> (rune, int) {
- n := s.count
- if n < 1 {
- return RUNE_ERROR, 0
- }
- b0 := s[0]
- x := accept_sizes[b0]
- if x >= 0xf0 {
- mask := (x as rune << 31) >> 31; // all zeros or all ones
- return (b0 as rune) &~ mask | RUNE_ERROR&mask, 1
- }
- size := x & 7
- ar := accept_ranges[x>>4]
- if n < size as int {
- return RUNE_ERROR, 1
- }
- b1 := s[1]
- if b1 < ar.lo || ar.hi < b1 {
- return RUNE_ERROR, 1
- }
- MASK_X :: 0b00111111
- MASK_2 :: 0b00011111
- MASK_3 :: 0b00001111
- MASK_4 :: 0b00000111
- if size == 2 {
- return (b0&MASK_2) as rune <<6 | (b1&MASK_X) as rune, 2
- }
- b2 := s[2]
- if b2 < 0x80 || 0xbf < b2 {
- return RUNE_ERROR, 1
- }
- if size == 3 {
- return (b0&MASK_3) as rune <<12 | (b1&MASK_X) as rune <<6 | (b2&MASK_X) as rune, 3
- }
- b3 := s[3]
- if b3 < 0x80 || 0xbf < b3 {
- return RUNE_ERROR, 1
- }
- return (b0&MASK_4) as rune <<18 | (b1&MASK_X) as rune <<12 | (b3&MASK_X) as rune <<6 | (b3&MASK_X) as rune, 4
- }
- valid_rune :: proc(r: rune) -> bool {
- if r < 0 {
- return false
- } else if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
- return false
- } else if r > MAX_RUNE {
- return false
- }
- return true
- }
- valid_string :: proc(s: string) -> bool {
- n := s.count
- for i := 0; i < n; {
- si := s[i]
- if si < RUNE_SELF { // ascii
- i++
- continue
- }
- x := accept_sizes[si]
- if x == 0xf1 {
- return false
- }
- size := (x & 7) as int
- if i+size > n {
- return false
- }
- ar := accept_ranges[x>>4]
- if b := s[i+1]; b < ar.lo || ar.hi < b {
- return false
- } else if size == 2 {
- // Okay
- } else if b := s[i+2]; b < 0x80 || 0xbf < b {
- return false
- } else if size == 3 {
- // Okay
- } else if b := s[i+3]; b < 0x80 || 0xbf < b {
- return false
- }
- i += size
- }
- return true
- }
- rune_count :: proc(s: string) -> int {
- count := 0
- n := s.count
- for i := 0; i < n; count++ {
- si := s[i]
- if si < RUNE_SELF { // ascii
- i++
- continue
- }
- x := accept_sizes[si]
- if x == 0xf1 {
- i++
- continue
- }
- size := (x & 7) as int
- if i+size > n {
- i++
- continue
- }
- ar := accept_ranges[x>>4]
- if b := s[i+1]; b < ar.lo || ar.hi < b {
- size = 1
- } else if size == 2 {
- // Okay
- } else if b := s[i+2]; b < 0x80 || 0xbf < b {
- size = 1
- } else if size == 3 {
- // Okay
- } else if b := s[i+3]; b < 0x80 || 0xbf < b {
- size = 1
- }
- i += size
- }
- return count
- }
- rune_size :: proc(r: rune) -> int {
- match {
- case r < 0: return -1
- case r <= 1<<7 - 1: return 1
- case r <= 1<<11 - 1: return 2
- case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1
- case r <= 1<<16 - 1: return 3
- case r <= MAX_RUNE: return 4
- }
- return -1
- }
|