string.go 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  // Package unistring contains an implementation of a hybrid ASCII/UTF-16 string.
  // For ASCII strings the underlying representation is equivalent to a normal Go string.
  // For unicode strings the underlying representation is UTF-16 as []uint16 with the 0th element set to 0xFEFF.
  // unistring.String allows representing malformed UTF-16 values (e.g. stand-alone parts of surrogate pairs)
  // which cannot be represented in UTF-8.
  // At the same time it is possible to use unistring.String as property keys just as efficiently as simple strings
  // (the leading 0xFEFF ensures there is no clash with ASCII strings), and it is possible to convert it
  // to valueString without extra allocations.
  9. package unistring
  10. import (
  11. "reflect"
  12. "unicode/utf16"
  13. "unicode/utf8"
  14. "unsafe"
  15. )
  // BOM is the UTF-16 code unit stored in element 0 of every non-ASCII String.
  // AsUtf16 uses it to tell the UTF-16 representation apart from plain ASCII text.
  const BOM = 0xFEFF

  // String is a string value that holds either plain ASCII text or, for
  // non-ASCII text, the raw memory of a []uint16 whose first element is BOM.
  type String string
  20. func NewFromString(s string) String {
  21. ascii := true
  22. size := 0
  23. for _, c := range s {
  24. if c >= utf8.RuneSelf {
  25. ascii = false
  26. if c > 0xFFFF {
  27. size++
  28. }
  29. }
  30. size++
  31. }
  32. if ascii {
  33. return String(s)
  34. }
  35. b := make([]uint16, size+1)
  36. b[0] = BOM
  37. i := 1
  38. for _, c := range s {
  39. if c <= 0xFFFF {
  40. b[i] = uint16(c)
  41. } else {
  42. first, second := utf16.EncodeRune(c)
  43. b[i] = uint16(first)
  44. i++
  45. b[i] = uint16(second)
  46. }
  47. i++
  48. }
  49. return FromUtf16(b)
  50. }
  51. func NewFromRunes(s []rune) String {
  52. ascii := true
  53. size := 0
  54. for _, c := range s {
  55. if c >= utf8.RuneSelf {
  56. ascii = false
  57. if c > 0xFFFF {
  58. size++
  59. }
  60. }
  61. size++
  62. }
  63. if ascii {
  64. return String(s)
  65. }
  66. b := make([]uint16, size+1)
  67. b[0] = BOM
  68. i := 1
  69. for _, c := range s {
  70. if c <= 0xFFFF {
  71. b[i] = uint16(c)
  72. } else {
  73. first, second := utf16.EncodeRune(c)
  74. b[i] = uint16(first)
  75. i++
  76. b[i] = uint16(second)
  77. }
  78. i++
  79. }
  80. return FromUtf16(b)
  81. }
  82. func FromUtf16(b []uint16) String {
  83. var str string
  84. hdr := (*reflect.StringHeader)(unsafe.Pointer(&str))
  85. hdr.Data = uintptr(unsafe.Pointer(&b[0]))
  86. hdr.Len = len(b) * 2
  87. return String(str)
  88. }
  89. func (s String) String() string {
  90. if b := s.AsUtf16(); b != nil {
  91. return string(utf16.Decode(b[1:]))
  92. }
  93. return string(s)
  94. }
  95. func (s String) AsUtf16() []uint16 {
  96. if len(s) < 4 || len(s)&1 != 0 {
  97. return nil
  98. }
  99. l := len(s) / 2
  100. raw := string(s)
  101. hdr := (*reflect.StringHeader)(unsafe.Pointer(&raw))
  102. a := *(*[]uint16)(unsafe.Pointer(&reflect.SliceHeader{
  103. Data: hdr.Data,
  104. Len: l,
  105. Cap: l,
  106. }))
  107. if a[0] == BOM {
  108. return a
  109. }
  110. return nil
  111. }