// scanner.odin — buffered tokenizing scanner for the bufio package.
  1. package bufio
  2. import "core:bytes"
  3. import "core:io"
  4. import "core:mem"
  5. import "core:unicode/utf8"
  6. import "core:intrinsics"
// Extra errors returned by scanning procedures (in addition to io.Error)
Scanner_Extra_Error :: enum i32 {
	None,             // no error
	Negative_Advance, // the split procedure returned a negative advance count
	Advanced_Too_Far, // the split procedure advanced past the available input
	Bad_Read_Count,   // the underlying reader reported an impossible byte count
	Too_Long,         // the token exceeds the scanner's max_token_size
	Too_Short,        // the buffer cannot grow further yet the reader makes no progress
}
// Scanner_Error is any error a scanning procedure can produce: an io.Error
// from the underlying reader, or a Scanner_Extra_Error from the scanning
// logic itself. #shared_nil collapses both variants' nil values into a
// single nil, so `err == nil` works regardless of variant.
Scanner_Error :: union #shared_nil {
	io.Error,
	Scanner_Extra_Error,
}
// Split_Proc is the signature of the split procedure used to tokenize the input.
//
// data is the remaining unprocessed input and at_eof reports whether no more
// data will follow. advance is the number of bytes of data that were consumed
// (0 <= advance <= len(data)); token is the token to hand to the user, or nil
// to request that more data be read first; a non-nil err stops scanning; and
// final_token marks token as the last one, ending the scan after it is returned.
Split_Proc :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool)
// Scanner provides a convenient interface for reading data such as a file of
// newline-delimited lines of text, tokenized by a Split_Proc.
Scanner :: struct {
	r:     io.Reader, // the reader provided at initialization
	split: Split_Proc, // the split procedure used to tokenize the input

	buf:            [dynamic]byte, // buffer holding data read from r, handed to split
	max_token_size: int,           // largest token size allowed; the buffer never grows past it

	start: int,    // first unprocessed byte within buf
	end:   int,    // one past the last valid byte within buf
	token: []byte, // the most recent token produced by split

	_err: Scanner_Error, // sticky error; a non-EOF error is never overwritten

	max_consecutive_empty_reads:  int, // zero-progress limit before .No_Progress; <= 0 means use the default
	successive_empty_token_count: int, // consecutive empty tokens seen so far

	scan_called: bool, // set once scanner_scan has been called
	done:        bool, // scanning is finished (final token was returned)
}
// Maximum token size used when Scanner.max_token_size is unset (64 KiB).
DEFAULT_MAX_SCAN_TOKEN_SIZE :: 1<<16

// Initial allocation size of the scanner's internal buffer.
@(private)
_INIT_BUF_SIZE :: 4096
  39. scanner_init :: proc(s: ^Scanner, r: io.Reader, buf_allocator := context.allocator) -> ^Scanner {
  40. s.r = r
  41. s.split = scan_lines
  42. s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE
  43. s.buf.allocator = buf_allocator
  44. return s
  45. }
  46. scanner_init_with_buffer :: proc(s: ^Scanner, r: io.Reader, buf: []byte) -> ^Scanner {
  47. s.r = r
  48. s.split = scan_lines
  49. s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE
  50. s.buf = mem.buffer_from_slice(buf)
  51. resize(&s.buf, cap(s.buf))
  52. return s
  53. }
// scanner_destroy frees the scanner's internal buffer. The scanner must be
// re-initialized before being used again.
// NOTE(review): for scanners set up via scanner_init_with_buffer the backing
// memory is caller-owned — confirm delete is safe/a no-op in that case.
scanner_destroy :: proc(s: ^Scanner) {
	delete(s.buf)
}
  57. // Returns the first non-EOF error that was encounted by the scanner
  58. scanner_error :: proc(s: ^Scanner) -> Scanner_Error {
  59. switch s._err {
  60. case .EOF, nil:
  61. return nil
  62. }
  63. return s._err
  64. }
// Returns the most recent token created by scanner_scan.
// The underlying array may point to data that may be overwritten
// by another call to scanner_scan.
// Treat the returned value as if it is immutable.
scanner_bytes :: proc(s: ^Scanner) -> []byte {
	return s.token
}
// Returns the most recent token created by scanner_scan, as a string.
// The underlying array may point to data that may be overwritten
// by another call to scanner_scan.
// Treat the returned value as if it is immutable.
scanner_text :: proc(s: ^Scanner) -> string {
	return string(s.token)
}
// scanner_scan advances the scanner to the next token, which is then
// available through scanner_bytes/scanner_text. It returns false when the
// scan stops, either by reaching the end of the input or on error.
scanner_scan :: proc(s: ^Scanner) -> bool {
	// Record err as the sticky error, never overwriting an earlier
	// non-EOF error with a later one.
	set_err :: proc(s: ^Scanner, err: Scanner_Error) {
		switch s._err {
		case nil, .EOF:
			s._err = err
		}
	}

	if s.done {
		return false
	}
	s.scan_called = true

	for {
		// Check if a token is possible with what is available
		// Allow the split procedure to recover if it fails
		if s.start < s.end || s._err != nil {
			advance, token, err, final_token := s.split(s.buf[s.start:s.end], s._err != nil)
			if final_token {
				// Last token of the stream: return it and stop scanning.
				s.token = token
				s.done = true
				return true
			}
			if err != nil {
				set_err(s, err)
				return false
			}

			// Do advance, validating the count the split procedure returned.
			if advance < 0 {
				set_err(s, .Negative_Advance)
				return false
			}
			if advance > s.end-s.start {
				set_err(s, .Advanced_Too_Far)
				return false
			}
			s.start += advance

			s.token = token
			if s.token != nil {
				if s._err == nil || advance > 0 {
					s.successive_empty_token_count = 0
				} else {
					// Empty token with no input consumed: guard against the
					// split procedure stalling forever.
					s.successive_empty_token_count += 1
					if s.max_consecutive_empty_reads <= 0 {
						s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS
					}
					if s.successive_empty_token_count > s.max_consecutive_empty_reads {
						set_err(s, .No_Progress)
						return false
					}
				}
				return true
			}
		}

		// If an error is hit, no token can be created
		if s._err != nil {
			s.start = 0
			s.end = 0
			return false
		}

		// More data must be required to be read.
		// Shift unprocessed data to the front of the buffer when the buffer
		// is full or more than half of it is already-consumed prefix.
		if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
			copy(s.buf[:], s.buf[s.start:s.end])
			s.end -= s.start
			s.start = 0
		}

		could_be_too_short := false

		// Resize the buffer if full
		if s.end == len(s.buf) {
			if s.max_token_size <= 0 {
				s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE
			}
			if len(s.buf) >= s.max_token_size {
				set_err(s, .Too_Long)
				return false
			}
			// Double the buffer size, with an overflow check.
			new_size := _INIT_BUF_SIZE
			if len(s.buf) > 0 {
				overflowed: bool
				if new_size, overflowed = intrinsics.overflow_mul(len(s.buf), 2); overflowed {
					set_err(s, .Too_Long)
					return false
				}
			}
			old_size := len(s.buf)
			new_size = min(new_size, s.max_token_size)
			resize(&s.buf, new_size)
			s.end -= s.start
			s.start = 0
			// If the buffer could not actually grow (already capped), a
			// stalled reader below means the buffer is simply too short.
			could_be_too_short = old_size >= len(s.buf)
		}

		// Read data into the buffer, tolerating a bounded number of
		// zero-byte reads before giving up.
		loop := 0
		for {
			n, err := io.read(s.r, s.buf[s.end:len(s.buf)])
			if n < 0 || len(s.buf)-s.end < n {
				// Reader reported an impossible count.
				set_err(s, .Bad_Read_Count)
				break
			}
			s.end += n
			if err != nil {
				set_err(s, err)
				break
			}
			if n > 0 {
				s.successive_empty_token_count = 0
				break
			}
			loop += 1
			if s.max_consecutive_empty_reads <= 0 {
				s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS
			}
			if loop > s.max_consecutive_empty_reads {
				if could_be_too_short {
					set_err(s, .Too_Short)
				} else {
					set_err(s, .No_Progress)
				}
				break
			}
		}
	}
}
  202. scan_bytes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
  203. if at_eof && len(data) == 0 {
  204. return
  205. }
  206. return 1, data[0:1], nil, false
  207. }
// scan_runes is a Split_Proc that emits each UTF-8 encoded rune of the input
// as a token. Bytes that are not part of a valid encoding are consumed one at
// a time and reported as the U+FFFD replacement character.
scan_runes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	if at_eof && len(data) == 0 {
		// End of input: no more tokens.
		return
	}

	// Fast path: a single-byte (ASCII) rune.
	if data[0] < utf8.RUNE_SELF {
		advance = 1
		token = data[0:1]
		return
	}

	// Attempt to decode a multi-byte rune.
	_, width := utf8.decode_rune(data)
	if width > 1 {
		// Valid multi-byte encoding; emit it whole.
		advance = width
		token = data[0:width]
		return
	}

	// width <= 1 here means the encoding is invalid or incomplete.
	if !at_eof && !utf8.full_rune(data) {
		// Possibly an incomplete rune: request more data before deciding.
		return
	}

	// Definitely invalid: consume one byte and emit the UTF-8 encoding of
	// U+FFFD (0xef 0xbf 0xbd) in its place.
	@thread_local ERROR_RUNE := []byte{0xef, 0xbf, 0xbd}

	advance = 1
	token = ERROR_RUNE
	return
}
// scan_words is a Split_Proc that emits each whitespace-separated word of the
// input as a token, with surrounding whitespace removed. It never produces an
// empty token.
scan_words :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	// Reports whether r is a whitespace rune (ASCII plus the Unicode
	// space characters listed below).
	is_space :: proc "contextless" (r: rune) -> bool {
		switch r {
		// lower ones
		case ' ', '\t', '\n', '\v', '\f', '\r':
			return true
		case '\u0085', '\u00a0':
			return true
		// higher ones
		case '\u2000' ..= '\u200a':
			return true
		case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
			return true
		}
		return false
	}

	// skip spaces at the beginning
	start := 0
	for width := 0; start < len(data); start += width {
		r: rune
		r, width = utf8.decode_rune(data[start:])
		if !is_space(r) {
			break
		}
	}

	// Scan forward to the next space, which terminates the word.
	for width, i := 0, start; i < len(data); i += width {
		r: rune
		r, width = utf8.decode_rune(data[i:])
		if is_space(r) {
			advance = i+width
			token = data[start:i]
			return
		}
	}

	// At EOF, a non-empty remainder is the final word.
	if at_eof && len(data) > start {
		advance = len(data)
		token = data[start:]
		return
	}

	// Request more data, skipping the leading whitespace already seen.
	advance = start
	return
}
  273. scan_lines :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
  274. trim_carriage_return :: proc "contextless" (data: []byte) -> []byte {
  275. if len(data) > 0 && data[len(data)-1] == '\r' {
  276. return data[0:len(data)-1]
  277. }
  278. return data
  279. }
  280. if at_eof && len(data) == 0 {
  281. return
  282. }
  283. if i := bytes.index_byte(data, '\n'); i >= 0 {
  284. advance = i+1
  285. token = trim_carriage_return(data[0:i])
  286. return
  287. }
  288. if at_eof {
  289. advance = len(data)
  290. token = trim_carriage_return(data)
  291. }
  292. return
  293. }