| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- // Package unistring contains an implementation of a hybrid ASCII/UTF-16 string.
- // For ASCII strings the underlying representation is equivalent to a normal Go string.
- // For unicode strings the underlying representation is UTF-16 as []uint16 with 0th element set to 0xFEFF.
- // unistring.String allows representing malformed UTF-16 values (e.g. stand-alone parts of surrogate pairs)
- // which cannot be represented in UTF-8.
- // At the same time it is possible to use unistring.String values as property keys just as efficiently as simple strings
- // (the leading 0xFEFF ensures there is no clash with ASCII strings), and it is possible to convert them
- // to valueString without extra allocations.
- package unistring
- import (
- "reflect"
- "unicode/utf16"
- "unicode/utf8"
- "unsafe"
- )
// BOM is the UTF-16 byte order mark. The package stores it as element 0 of
// every UTF-16 buffer it builds, and AsUtf16 looks for it at that position to
// tell a UTF-16 backed String from an ASCII one.
const BOM = 0xFEFF

// String is a hybrid ASCII/UTF-16 string. Its bytes are either plain ASCII
// text or the in-memory image of a []uint16 whose first element is BOM.
type String string
- // Scan checks if the string contains any unicode characters. If it does, converts to an array suitable for creating
- // a String using FromUtf16, otherwise returns nil.
- func Scan(s string) []uint16 {
- utf16Size := 0
- for ; utf16Size < len(s); utf16Size++ {
- if s[utf16Size] >= utf8.RuneSelf {
- goto unicode
- }
- }
- return nil
- unicode:
- for _, chr := range s[utf16Size:] {
- utf16Size++
- if chr > 0xFFFF {
- utf16Size++
- }
- }
- buf := make([]uint16, utf16Size+1)
- buf[0] = BOM
- c := 1
- for _, chr := range s {
- if chr <= 0xFFFF {
- buf[c] = uint16(chr)
- } else {
- first, second := utf16.EncodeRune(chr)
- buf[c] = uint16(first)
- c++
- buf[c] = uint16(second)
- }
- c++
- }
- return buf
- }
- func NewFromString(s string) String {
- if buf := Scan(s); buf != nil {
- return FromUtf16(buf)
- }
- return String(s)
- }
- func NewFromRunes(s []rune) String {
- ascii := true
- size := 0
- for _, c := range s {
- if c >= utf8.RuneSelf {
- ascii = false
- if c > 0xFFFF {
- size++
- }
- }
- size++
- }
- if ascii {
- return String(s)
- }
- b := make([]uint16, size+1)
- b[0] = BOM
- i := 1
- for _, c := range s {
- if c <= 0xFFFF {
- b[i] = uint16(c)
- } else {
- first, second := utf16.EncodeRune(c)
- b[i] = uint16(first)
- i++
- b[i] = uint16(second)
- }
- i++
- }
- return FromUtf16(b)
- }
- func FromUtf16(b []uint16) String {
- var str string
- hdr := (*reflect.StringHeader)(unsafe.Pointer(&str))
- hdr.Data = uintptr(unsafe.Pointer(&b[0]))
- hdr.Len = len(b) * 2
- return String(str)
- }
- func (s String) String() string {
- if b := s.AsUtf16(); b != nil {
- return string(utf16.Decode(b[1:]))
- }
- return string(s)
- }
- func (s String) AsUtf16() []uint16 {
- if len(s) < 4 || len(s)&1 != 0 {
- return nil
- }
- var a []uint16
- raw := string(s)
- sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&a))
- sliceHeader.Data = (*reflect.StringHeader)(unsafe.Pointer(&raw)).Data
- l := len(raw) / 2
- sliceHeader.Len = l
- sliceHeader.Cap = l
- if a[0] == BOM {
- return a
- }
- return nil
- }
|