string.go 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. // Package unistring contains an implementation of a hybrid ASCII/UTF-16 string.
  2. // For ASCII strings the underlying representation is equivalent to a normal Go string.
  3. // For unicode strings the underlying representation is UTF-16 as []uint16 with 0th element set to 0xFEFF.
// unistring.String allows representing malformed UTF-16 values (e.g. stand-alone parts of surrogate pairs)
// which cannot be represented in UTF-8.
// At the same time it is possible to use unistring.String values as property keys just as efficiently as
// simple strings (the leading 0xFEFF ensures there is no clash with an ASCII string), and it is possible
// to convert them to valueString without extra allocations.
  9. package unistring
  10. import (
  11. "reflect"
  12. "unicode/utf16"
  13. "unicode/utf8"
  14. "unsafe"
  15. )
  16. const (
  17. BOM = 0xFEFF
  18. )
  19. type String string
  20. // Scan checks if the string contains any unicode characters. If it does, converts to an array suitable for creating
  21. // a String using FromUtf16, otherwise returns nil.
  22. func Scan(s string) []uint16 {
  23. utf16Size := 0
  24. for ; utf16Size < len(s); utf16Size++ {
  25. if s[utf16Size] >= utf8.RuneSelf {
  26. goto unicode
  27. }
  28. }
  29. return nil
  30. unicode:
  31. for _, chr := range s[utf16Size:] {
  32. utf16Size++
  33. if chr > 0xFFFF {
  34. utf16Size++
  35. }
  36. }
  37. buf := make([]uint16, utf16Size+1)
  38. buf[0] = BOM
  39. c := 1
  40. for _, chr := range s {
  41. if chr <= 0xFFFF {
  42. buf[c] = uint16(chr)
  43. } else {
  44. first, second := utf16.EncodeRune(chr)
  45. buf[c] = uint16(first)
  46. c++
  47. buf[c] = uint16(second)
  48. }
  49. c++
  50. }
  51. return buf
  52. }
  53. func NewFromString(s string) String {
  54. if buf := Scan(s); buf != nil {
  55. return FromUtf16(buf)
  56. }
  57. return String(s)
  58. }
  59. func NewFromRunes(s []rune) String {
  60. ascii := true
  61. size := 0
  62. for _, c := range s {
  63. if c >= utf8.RuneSelf {
  64. ascii = false
  65. if c > 0xFFFF {
  66. size++
  67. }
  68. }
  69. size++
  70. }
  71. if ascii {
  72. return String(s)
  73. }
  74. b := make([]uint16, size+1)
  75. b[0] = BOM
  76. i := 1
  77. for _, c := range s {
  78. if c <= 0xFFFF {
  79. b[i] = uint16(c)
  80. } else {
  81. first, second := utf16.EncodeRune(c)
  82. b[i] = uint16(first)
  83. i++
  84. b[i] = uint16(second)
  85. }
  86. i++
  87. }
  88. return FromUtf16(b)
  89. }
  90. func FromUtf16(b []uint16) String {
  91. var str string
  92. hdr := (*reflect.StringHeader)(unsafe.Pointer(&str))
  93. hdr.Data = uintptr(unsafe.Pointer(&b[0]))
  94. hdr.Len = len(b) * 2
  95. return String(str)
  96. }
  97. func (s String) String() string {
  98. if b := s.AsUtf16(); b != nil {
  99. return string(utf16.Decode(b[1:]))
  100. }
  101. return string(s)
  102. }
  103. func (s String) AsUtf16() []uint16 {
  104. if len(s) < 4 || len(s)&1 != 0 {
  105. return nil
  106. }
  107. var a []uint16
  108. raw := string(s)
  109. sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&a))
  110. sliceHeader.Data = (*reflect.StringHeader)(unsafe.Pointer(&raw)).Data
  111. l := len(raw) / 2
  112. sliceHeader.Len = l
  113. sliceHeader.Cap = l
  114. if a[0] == BOM {
  115. return a
  116. }
  117. return nil
  118. }