// scanner.odin

package bufio

import "core:bytes"
import "core:io"
import "core:mem"
import "core:unicode/utf8"
import "base:intrinsics"

// Extra errors returned by scanning procedures
Scanner_Extra_Error :: enum i32 {
	None,
	Negative_Advance,
	Advanced_Too_Far,
	Bad_Read_Count,
	Too_Long,
	Too_Short,
}

Scanner_Error :: union #shared_nil {
	io.Error,
	Scanner_Extra_Error,
}

// Split_Proc is the signature of the split procedure used to tokenize the input.
// `advance` is the number of bytes of `data` that have been consumed; returning a
// nil token with a nil error asks the Scanner to read more data and call again.
Split_Proc :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool)

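/*
	A minimal sketch of a custom Split_Proc (hypothetical, not part of this
	package) that splits the input on ';' bytes, mirroring the structure of
	scan_lines below:

	scan_semicolons :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
		if at_eof && len(data) == 0 {
			return
		}
		if i := bytes.index_byte(data, ';'); i >= 0 {
			// Consume the token and its separator
			return i+1, data[0:i], nil, false
		}
		if at_eof {
			// No trailing separator: return the remainder as the final token
			return len(data), data, nil, false
		}
		// Request more data
		return
	}
*/
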
Scanner :: struct {
	r:                            io.Reader,
	split:                        Split_Proc,
	buf:                          [dynamic]byte,
	max_token_size:               int,
	start:                        int,
	end:                          int,
	token:                        []byte,
	_err:                         Scanner_Error,
	max_consecutive_empty_reads:  int,
	successive_empty_token_count: int,
	scan_called:                  bool,
	done:                         bool,
}

DEFAULT_MAX_SCAN_TOKEN_SIZE :: 1<<16

@(private)
_INIT_BUF_SIZE :: 4096

// Initializes a Scanner whose buffer is allocated with the allocator `buf_allocator`
scanner_init :: proc(s: ^Scanner, r: io.Reader, buf_allocator := context.allocator) -> ^Scanner {
	s.r = r
	s.split = scan_lines
	s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE
	s.buf.allocator = buf_allocator
	return s
}

// Initializes a Scanner whose buffer is the user-provided byte buffer `buf`
scanner_init_with_buffer :: proc(s: ^Scanner, r: io.Reader, buf: []byte) -> ^Scanner {
	s.r = r
	s.split = scan_lines
	s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE
	s.buf = mem.buffer_from_slice(buf)
	resize(&s.buf, cap(s.buf))
	return s
}

scanner_destroy :: proc(s: ^Scanner) {
	delete(s.buf)
}

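/*
	Typical usage, as a sketch: strings.Reader and strings.to_reader from
	core:strings are assumed here purely to provide an in-memory io.Reader.

	import "core:strings"

	r: strings.Reader
	s: Scanner
	scanner_init(&s, strings.to_reader(&r, "hello\nworld\n"))
	defer scanner_destroy(&s)
	for scan(&s) {
		line := scanner_text(&s) // valid only until the next call to scan
	}
	if err := scanner_error(&s); err != nil {
		// handle the first non-EOF error
	}

	scanner_init_with_buffer works the same way but tokenizes in place within
	the provided bytes rather than allocating.
*/
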
// Returns the first non-EOF error that was encountered by the scanner
scanner_error :: proc(s: ^Scanner) -> Scanner_Error {
	switch s._err {
	case .EOF, nil:
		return nil
	}
	return s._err
}

// Returns the most recent token created by 'scan'.
// The underlying array may point to data that may be overwritten
// by another call to 'scan'.
// Treat the returned value as if it is immutable.
scanner_bytes :: proc(s: ^Scanner) -> []byte {
	return s.token
}

// Returns the most recent token created by 'scan'.
// The underlying array may point to data that may be overwritten
// by another call to 'scan'.
// Treat the returned value as if it is immutable.
scanner_text :: proc(s: ^Scanner) -> string {
	return string(s.token)
}

// scanner_scan is an alias of scan
scanner_scan :: scan

// scan advances the Scanner to the next token, which is then available through
// scanner_bytes or scanner_text. It returns false when scanning stops, either
// by reaching the end of the input or on an error.
scan :: proc(s: ^Scanner) -> bool {
	set_err :: proc(s: ^Scanner, err: Scanner_Error) {
		switch s._err {
		case nil, .EOF:
			s._err = err
		}
	}

	if s.done {
		return false
	}
	s.scan_called = true

	for {
		// Check if a token is possible with what is available
		// Allow the split procedure to recover if it fails
		if s.start < s.end || s._err != nil {
			advance, token, err, final_token := s.split(s.buf[s.start:s.end], s._err != nil)
			if final_token {
				s.token = token
				s.done = true
				return true
			}
			if err != nil {
				set_err(s, err)
				return false
			}

			// Do advance
			if advance < 0 {
				set_err(s, .Negative_Advance)
				return false
			}
			if advance > s.end-s.start {
				set_err(s, .Advanced_Too_Far)
				return false
			}
			s.start += advance

			s.token = token
			if s.token != nil {
				if s._err == nil || advance > 0 {
					s.successive_empty_token_count = 0
				} else {
					s.successive_empty_token_count += 1
					if s.max_consecutive_empty_reads <= 0 {
						s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS
					}
					if s.successive_empty_token_count > s.max_consecutive_empty_reads {
						set_err(s, .No_Progress)
						return false
					}
				}
				return true
			}
		}

		// If an error is hit, no token can be created
		if s._err != nil {
			s.start = 0
			s.end = 0
			return false
		}

		// More data is required: first shift any unprocessed bytes to the
		// start of the buffer to make room
		if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
			copy(s.buf[:], s.buf[s.start:s.end])
			s.end -= s.start
			s.start = 0
		}

		could_be_too_short := false

		// Resize the buffer if full
		if s.end == len(s.buf) {
			if s.max_token_size <= 0 {
				s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE
			}
			if len(s.buf) >= s.max_token_size {
				set_err(s, .Too_Long)
				return false
			}
			// overflow check
			new_size := _INIT_BUF_SIZE
			if len(s.buf) > 0 {
				overflowed: bool
				if new_size, overflowed = intrinsics.overflow_mul(len(s.buf), 2); overflowed {
					set_err(s, .Too_Long)
					return false
				}
			}

			old_size := len(s.buf)
			new_size = min(new_size, s.max_token_size)
			resize(&s.buf, new_size)
			s.end -= s.start
			s.start = 0
			could_be_too_short = old_size >= len(s.buf)
		}

		// Read data into the buffer
		loop := 0
		for {
			n, err := io.read(s.r, s.buf[s.end:len(s.buf)])
			if n < 0 || len(s.buf)-s.end < n {
				set_err(s, .Bad_Read_Count)
				break
			}
			s.end += n
			if err != nil {
				set_err(s, err)
				break
			}
			if n > 0 {
				s.successive_empty_token_count = 0
				break
			}
			loop += 1
			if s.max_consecutive_empty_reads <= 0 {
				s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS
			}
			if loop > s.max_consecutive_empty_reads {
				if could_be_too_short {
					set_err(s, .Too_Short)
				} else {
					set_err(s, .No_Progress)
				}
				break
			}
		}
	}
}

// scan_bytes is a splitting procedure that returns each byte as a token
scan_bytes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	if at_eof && len(data) == 0 {
		return
	}
	return 1, data[0:1], nil, false
}

// scan_runes is a splitting procedure that returns each UTF-8 encoded rune as a token.
// The list of runes returned is equivalent to that of iterating over a string in a 'for in' loop, meaning any
// erroneous UTF-8 encodings will be returned as U+FFFD. Unfortunately this means it is impossible for the "client"
// to know whether a U+FFFD is an expected replacement rune or the encoding of an error.
scan_runes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	if at_eof && len(data) == 0 {
		return
	}
	// ASCII fast path
	if data[0] < utf8.RUNE_SELF {
		advance = 1
		token = data[0:1]
		return
	}
	// A width greater than one means a correctly encoded multi-byte rune
	_, width := utf8.decode_rune(data)
	if width > 1 {
		advance = width
		token = data[0:width]
		return
	}
	// The rune may merely be incomplete: wait for more data unless at EOF
	if !at_eof && !utf8.full_rune(data) {
		return
	}
	// Invalid encoding: consume a single byte and emit U+FFFD
	@(static) ERROR_RUNE := []byte{0xef, 0xbf, 0xbd} // UTF-8 encoding of U+FFFD
	advance = 1
	token = ERROR_RUNE
	return
}

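/*
	Rune-by-rune scanning sketch (hypothetical caller): override the Scanner's
	split field before the first call to scan, since scanner_init defaults it
	to scan_lines:

	s.split = scan_runes
	for scan(&s) {
		// e.g. the input "héllo" yields the tokens "h", "é", "l", "l", "o"
	}
*/
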
// scan_words is a splitting procedure that returns each Unicode-space-separated word of text, excluding the surrounding spaces.
// It will never return an empty string.
scan_words :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	is_space :: proc "contextless" (r: rune) -> bool {
		switch r {
		// lower ones
		case ' ', '\t', '\n', '\v', '\f', '\r':
			return true
		case '\u0085', '\u00a0':
			return true
		// higher ones
		case '\u2000' ..= '\u200a':
			return true
		case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
			return true
		}
		return false
	}

	// skip spaces at the beginning
	start := 0
	for width := 0; start < len(data); start += width {
		r: rune
		r, width = utf8.decode_rune(data[start:])
		if !is_space(r) {
			break
		}
	}

	// scan until the next space, which marks the end of the word
	for width, i := 0, start; i < len(data); i += width {
		r: rune
		r, width = utf8.decode_rune(data[i:])
		if is_space(r) {
			advance = i+width
			token = data[start:i]
			return
		}
	}

	// at EOF with a final, non-empty, non-terminated word: return it
	if at_eof && len(data) > start {
		advance = len(data)
		token = data[start:]
		return
	}

	// request more data
	advance = start
	return
}

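/*
	For example, with s.split = scan_words, the input " lorem\tipsum\n" yields
	the tokens "lorem" and "ipsum"; all surrounding whitespace is consumed.
*/
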
// scan_lines is a splitting procedure that returns each line of text, stripping the trailing newline and any optional preceding carriage return ("\r?\n").
// A returned line is allowed to be empty.
scan_lines :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	trim_carriage_return :: proc "contextless" (data: []byte) -> []byte {
		if len(data) > 0 && data[len(data)-1] == '\r' {
			return data[0:len(data)-1]
		}
		return data
	}

	if at_eof && len(data) == 0 {
		return
	}
	if i := bytes.index_byte(data, '\n'); i >= 0 {
		advance = i+1
		token = trim_carriage_return(data[0:i])
		return
	}
	// At EOF with a final, non-terminated line: return it
	if at_eof {
		advance = len(data)
		token = trim_carriage_return(data)
	}
	return
}
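
/*
	For example, with the default s.split = scan_lines, the input
	"one\r\ntwo\n\nfour" yields the tokens "one", "two", "" and "four":
	carriage returns are stripped, empty lines are kept, and a final
	unterminated line is still returned.
*/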