string.odin 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. // A convenient and efficient way to index strings by `Unicode` code point (`rune`) rather than byte.
  2. package utf8string
  3. import "core:unicode/utf8"
  4. import "base:runtime"
  5. import "base:builtin"
  6. String :: struct {
  7. contents: string,
  8. rune_count: int,
  9. // cached information
  10. non_ascii: int, // index to non-ascii code points
  11. width: int, // 0 if ascii
  12. byte_pos: int,
  13. rune_pos: int,
  14. }
  15. @(private)
  16. _len :: builtin.len // helper procedure
  17. init :: proc(s: ^String, contents: string) -> ^String {
  18. s.contents = contents
  19. s.byte_pos = 0
  20. s.rune_pos = 0
  21. for i in 0..<_len(contents) {
  22. if contents[i] >= utf8.RUNE_SELF {
  23. s.rune_count = utf8.rune_count_in_string(contents)
  24. _, s.width = utf8.decode_rune_in_string(contents)
  25. s.non_ascii = i
  26. return s
  27. }
  28. }
  29. s.rune_count = _len(contents)
  30. s.width = 0
  31. s.non_ascii = _len(contents)
  32. return s
  33. }
  34. to_string :: proc(s: ^String) -> string {
  35. return s.contents
  36. }
  37. len :: proc(s: ^String) -> int {
  38. return s.rune_count
  39. }
  40. is_ascii :: proc(s: ^String) -> bool {
  41. return s.width == 0
  42. }
  43. at :: proc(s: ^String, i: int, loc := #caller_location) -> (r: rune) {
  44. runtime.bounds_check_error_loc(loc, i, s.rune_count)
  45. if i < s.non_ascii {
  46. return rune(s.contents[i])
  47. }
  48. switch i {
  49. case 0:
  50. r, s.width = utf8.decode_rune_in_string(s.contents)
  51. s.rune_pos = 0
  52. s.byte_pos = 0
  53. return
  54. case s.rune_count-1:
  55. r, s.width = utf8.decode_last_rune(s.contents)
  56. s.rune_pos = i
  57. s.byte_pos = _len(s.contents) - s.width
  58. return
  59. case s.rune_pos-1:
  60. r, s.width = utf8.decode_rune_in_string(s.contents[0:s.byte_pos])
  61. s.rune_pos = i
  62. s.byte_pos -= s.width
  63. return
  64. case s.rune_pos+1:
  65. s.rune_pos = i
  66. s.byte_pos += s.width
  67. fallthrough
  68. case s.rune_pos:
  69. r, s.width = utf8.decode_rune_in_string(s.contents[s.byte_pos:])
  70. return
  71. }
  72. // Linear scan
  73. scan_forward := true
  74. if i < s.rune_pos {
  75. if i < (s.rune_pos-s.non_ascii)/2 {
  76. s.byte_pos, s.rune_pos = s.non_ascii, s.non_ascii
  77. } else {
  78. scan_forward = false
  79. }
  80. } else if i-s.rune_pos < (s.rune_count-s.rune_pos)/2 {
  81. // scan_forward = true
  82. } else {
  83. s.byte_pos, s.rune_pos = _len(s.contents), s.rune_count
  84. scan_forward = false
  85. }
  86. if scan_forward {
  87. for {
  88. r, s.width = utf8.decode_rune_in_string(s.contents[s.byte_pos:])
  89. if s.rune_pos == i {
  90. return
  91. }
  92. s.rune_pos += 1
  93. s.byte_pos += s.width
  94. }
  95. } else {
  96. for {
  97. r, s.width = utf8.decode_last_rune_in_string(s.contents[:s.byte_pos])
  98. s.rune_pos -= 1
  99. s.byte_pos -= s.width
  100. if s.rune_pos == i {
  101. return
  102. }
  103. }
  104. }
  105. }
  106. slice :: proc(s: ^String, i, j: int, loc := #caller_location) -> string {
  107. runtime.slice_expr_error_lo_hi_loc(loc, i, j, s.rune_count)
  108. if j < s.non_ascii {
  109. return s.contents[i:j]
  110. }
  111. if i == j {
  112. return ""
  113. }
  114. lo, hi: int
  115. if i < s.non_ascii {
  116. lo = i
  117. } else if i == s.rune_count {
  118. lo = _len(s.contents)
  119. } else {
  120. at(s, i, loc)
  121. lo = s.byte_pos
  122. }
  123. if j == s.rune_count {
  124. hi = _len(s.contents)
  125. } else {
  126. at(s, j, loc)
  127. hi = s.byte_pos
  128. }
  129. return s.contents[lo:hi]
  130. }