// tokenizer.odin

package xml

/*
	An XML 1.0 / 1.1 parser.

	Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
	Made available under Odin's BSD-3 license.

	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

// Called when the tokenizer encounters a problem; `pos` locates it in the source.
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  Pos,
}

Pos :: struct {
	file:   string,
	offset: int, // starting at 0
	line:   int, // starting at 1
	column: int, // starting at 1
}

Token_Kind :: enum {
	Invalid,

	Ident,
	Literal,
	Rune,
	String,

	Double_Quote,  // "
	Single_Quote,  // '
	Colon,         // :
	Eq,            // =
	Lt,            // <
	Gt,            // >
	Exclaim,       // !
	Question,      // ?
	Hash,          // #
	Slash,         // /
	Dash,          // -

	Open_Bracket,  // [
	Close_Bracket, // ]

	EOF,
}

CDATA_START   :: "<![CDATA["
CDATA_END     :: "]]>"

COMMENT_START :: "<!--"
COMMENT_END   :: "-->"

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  string,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}

init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}

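/*
	A minimal usage sketch (hypothetical proc, not part of the original file):
	`init` primes `t.ch` with the first rune of `src` and silently skips a
	leading byte order mark.
*/
@(private)
_example_init :: proc() {
	t: Tokenizer
	init(&t, "\ufeff<root/>", "bom.xml")
	assert(t.ch == '<') // The BOM was consumed during init.
}
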
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

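/*
	A sketch of a custom Error_Handler (hypothetical, not part of the original
	file): `init` accepts any proc with this signature, so diagnostics can be
	redirected or suppressed. `error_count` still increments either way.
*/
@(private)
_example_silent_handler :: proc(pos: Pos, msg: string, args: ..any) {
	// Deliberately ignore all diagnostics.
}
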
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

@(optimization_mode="speed")
advance_rune :: proc(t: ^Tokenizer) {
	#no_bounds_check {
		/*
			Bounds are checked manually below, so the implicit checks can be elided.
		*/
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			r, w := rune(t.src[t.read_offset]), 1
			switch {
			case r == 0:
				error(t, t.offset, "illegal character NUL")
			case r >= utf8.RUNE_SELF:
				r, w = #force_inline utf8.decode_rune_in_string(t.src[t.read_offset:])
				if r == utf8.RUNE_ERROR && w == 1 {
					error(t, t.offset, "illegal UTF-8 encoding")
				} else if r == utf8.RUNE_BOM && t.offset > 0 {
					error(t, t.offset, "illegal byte order mark")
				}
			}
			t.read_offset += w
			t.ch = r
		} else {
			t.offset = len(t.src)
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			t.ch = -1
		}
	}
}

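/*
	A quick sketch (hypothetical proc, not part of the original file): at the
	end of input, `advance_rune` parks the tokenizer on the sentinel rune -1,
	which every scanning loop below treats as EOF.
*/
@(private)
_example_eof_sentinel :: proc() {
	t: Tokenizer
	init(&t, "x", "inline")
	advance_rune(&t) // Step past the only rune.
	assert(t.ch == -1)
}
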
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset + offset < len(t.src) {
		#no_bounds_check return t.src[t.read_offset + offset]
	}
	return 0
}

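/*
	Sketch (hypothetical proc): `peek_byte` looks ahead without consuming, and
	returns 0 past the end of input, so callers can probe safely.
*/
@(private)
_example_peek_past_end :: proc() {
	t: Tokenizer
	init(&t, "x", "inline")
	assert(peek_byte(&t) == 0) // read_offset is already at the end.
}
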
@(optimization_mode="speed")
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\n':
			advance_rune(t)
		case:
			return
		}
	}
}

@(optimization_mode="speed")
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':        return true
		case 'A'..='Z', 'a'..='z': return true
		case '0'..='9':            return true
		case -1:                   return false
		}
	}
	if unicode.is_letter(r) || unicode.is_digit(r) {
		return true
	}
	return false
}

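/*
	A quick sketch of what the classifier accepts (hypothetical proc, for
	illustration only): XML names may contain `-` and the namespace separator
	`:`, while markup characters end the identifier.
*/
@(private)
_example_ident_runes :: proc() {
	assert(is_valid_identifier_rune('-'))
	assert(is_valid_identifier_rune(':'))
	assert(!is_valid_identifier_rune('>'))
}
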
scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset
	namespaced := false

	for is_valid_identifier_rune(t.ch) {
		advance_rune(t)
		if t.ch == ':' {
			/*
				A namespaced attr can have at most two parts, `namespace:ident`.
			*/
			if namespaced {
				break
			}
			namespaced = true
		}
	}
	return string(t.src[offset : t.offset])
}

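/*
	An illustrative sketch (hypothetical proc, not part of the original file):
	a namespaced name such as `xsi:type` scans as a single .Ident token whose
	text keeps the prefix.
*/
@(private)
_example_namespaced_ident :: proc() {
	t: Tokenizer
	init(&t, "xsi:type", "inline")
	tok := scan(&t)
	assert(tok.kind == .Ident && tok.text == "xsi:type")
}
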
/*
	A comment ends when we see `-->`, preceded by a character that's not a dash.

	"For compatibility, the string '--' (double-hyphen) must not occur within comments."
	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment

	Because the comment opener is 4 bytes long, we always have enough lookback
	for the two-byte `--` check, and the peek at the next byte asserts that at
	least one more character follows and that it's a `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
	offset := t.offset

	for {
		advance_rune(t)
		ch := t.ch
		if ch < 0 {
			error(t, offset, "[parse] Comment was not terminated\n")
			return "", .Unclosed_Comment
		}

		if string(t.src[t.offset - 1:][:2]) == "--" {
			if peek_byte(t) == '>' {
				break
			} else {
				error(t, t.offset - 1, "Invalid `--` sequence in comment.\n")
				return "", .Invalid_Sequence_In_Comment
			}
		}
	}

	expect(t, .Dash)
	expect(t, .Gt)
	return string(t.src[offset : t.offset - 1]), .None
}

/*
	Skip a CDATA section if one starts at the current offset.
*/
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.read_offset + len(CDATA_START) >= len(t.src) {
		/*
			Too little input left; this can't be the start of a CDATA tag.
		*/
		return .None
	}

	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
		t.read_offset += len(CDATA_START)
		offset := t.offset

		cdata_scan: for {
			advance_rune(t)
			if t.ch < 0 {
				error(t, offset, "[skip_cdata] CDATA was not terminated\n")
				return .Premature_EOF
			}

			/*
				Scan until the end of the CDATA tag.
			*/
			if t.read_offset + len(CDATA_END) < len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
					t.read_offset += len(CDATA_END)
					break cdata_scan
				}
			}
		}
	}
	return
}

@(optimization_mode="speed")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None

	loop: for {
		ch := t.ch
		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.\n")
			return "", .Premature_EOF

		case '<':
			if peek_byte(t) == '!' {
				if peek_byte(t, 1) == '[' {
					/*
						Might be the start of a CDATA tag.
					*/
					skip_cdata(t) or_return
				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
					/*
						Comment start. Eat comment.
					*/
					t.read_offset += 3
					_ = scan_comment(t) or_return
				}
			}

		case '\n':
			if !multiline {
				error(t, offset, string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated\n")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if t.ch == close {
			/*
				If it's not a CDATA or comment, it's the end of this body.
			*/
			break loop
		}
		advance_rune(t)
	}

	/*
		Strip trailing whitespace.
	*/
	lit := string(t.src[offset : t.offset])
	end := len(lit)

	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}

	/*
		TODO: Handle decoding escape characters and unboxing CDATA.
	*/
	return lit, err
}

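/*
	An illustrative sketch (hypothetical proc, not part of the original file):
	scanning a body with the default `close = '<'` skips over an embedded CDATA
	section instead of stopping at its `<`. Per the TODO above, the CDATA
	wrapper is left in the returned text as-is.
*/
@(private)
_example_scan_body :: proc() {
	t: Tokenizer
	init(&t, "a <![CDATA[<b>]]> c</tag>", "inline")
	body, err := scan_string(&t, 0)
	assert(err == .None && body == "a <![CDATA[<b>]]> c")
}
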
peek :: proc(t: ^Tokenizer) -> (token: Token) {
	old := t^
	token = scan(t)
	t^ = old
	return token
}

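/*
	Sketch (hypothetical proc, not part of the original file): `peek` returns
	the upcoming token while leaving the tokenizer state untouched, so a
	following `scan` yields the same token.
*/
@(private)
_example_peek :: proc() {
	t: Tokenizer
	init(&t, "<a/>", "inline")
	assert(peek(&t).kind == .Lt)
	assert(scan(&t).kind == .Lt) // The peek did not consume it.
}
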
scan :: proc(t: ^Tokenizer) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	err:  Error
	lit:  string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident

	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
		case '<': kind = .Lt
		case '>': kind = .Gt
		case '!': kind = .Exclaim
		case '?': kind = .Question
		case '=': kind = .Eq
		case '#': kind = .Hash
		case '/': kind = .Slash
		case '-': kind = .Dash
		case ':': kind = .Colon

		case '"', '\'':
			kind = .Invalid
			lit, err = scan_string(t, t.offset, ch, true, false)
			if err == .None {
				kind = .String
			}

		case '\n':
			lit = "\n"

		case:
			kind = .Invalid
		}
	}

	if kind != .String && lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}

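/*
	A minimal end-to-end sketch (hypothetical proc, not part of the original
	file): tokenize a small document and print each token until EOF.
*/
@(private)
_example_scan_all :: proc() {
	t: Tokenizer
	init(&t, `<greeting lang="en">Hi</greeting>`, "example.xml")
	for {
		tok := scan(&t)
		if tok.kind == .EOF {
			break
		}
		// e.g. Lt "<", Ident "greeting", Ident "lang", Eq "=", String "en", ...
		fmt.printf("%v %q\n", tok.kind, tok.text)
	}
}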