tokenizer.odin
package encoding_xml

/*
	An XML 1.0 / 1.1 parser.

	Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
	Made available under Odin's BSD-3 license.

	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  Pos,
}

Pos :: struct {
	file:   string,
	offset: int, // starting at 0
	line:   int, // starting at 1
	column: int, // starting at 1
}

Token_Kind :: enum {
	Invalid,

	Ident,
	Literal,
	Rune,
	String,

	Double_Quote,  // "
	Single_Quote,  // '
	Colon,         // :
	Eq,            // =
	Lt,            // <
	Gt,            // >
	Exclaim,       // !
	Question,      // ?
	Hash,          // #
	Slash,         // /
	Dash,          // -

	Open_Bracket,  // [
	Close_Bracket, // ]

	EOF,
}

CDATA_START   :: "<![CDATA["
CDATA_END     :: "]]>"

COMMENT_START :: "<!--"
COMMENT_END   :: "-->"

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  string,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}

init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src         = src
	t.err         = err
	t.ch          = ' '
	t.offset      = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count  = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path        = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		// Skip a leading byte order mark.
		advance_rune(t)
	}
}
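
/*
	Usage sketch (illustrative; the source string and filename are made up):

		t: Tokenizer
		init(&t, `<greeting name="world"/>`, "greeting.xml")
		// `t.ch` now holds the first rune of the source, with any
		// leading byte order mark already skipped.
*/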

@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line   := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

@(optimization_mode="favor_size")
advance_rune :: proc(t: ^Tokenizer) {
	#no_bounds_check {
		// Already bounds-checked here.
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			r, w := rune(t.src[t.read_offset]), 1
			switch {
			case r == 0:
				error(t, t.offset, "illegal character NUL")
			case r >= utf8.RUNE_SELF:
				r, w = #force_inline utf8.decode_rune_in_string(t.src[t.read_offset:])
				if r == utf8.RUNE_ERROR && w == 1 {
					error(t, t.offset, "illegal UTF-8 encoding")
				} else if r == utf8.RUNE_BOM && t.offset > 0 {
					error(t, t.offset, "illegal byte order mark")
				}
			}
			t.read_offset += w
			t.ch = r
		} else {
			t.offset = len(t.src)
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			t.ch = -1
		}
	}
}

peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset + offset < len(t.src) {
		#no_bounds_check return t.src[t.read_offset + offset]
	}
	return 0
}

@(optimization_mode="favor_size")
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\n':
			advance_rune(t)
		case:
			return
		}
	}
}

@(optimization_mode="favor_size")
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':        return true
		case 'A'..='Z', 'a'..='z': return true
		case '0'..='9':            return true
		case -1:                   return false
		}
	}

	if unicode.is_letter(r) || unicode.is_digit(r) {
		return true
	}
	return false
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset

	namespaced := false
	for is_valid_identifier_rune(t.ch) {
		advance_rune(t)
		if t.ch == ':' {
			// A namespaced attr can have at most two parts, `namespace:ident`.
			if namespaced {
				break
			}
			namespaced = true
		}
	}
	return string(t.src[offset : t.offset])
}
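
/*
	Example (illustrative): given the input `xlink:href="…"`, `scan_identifier`
	returns `xlink:href` as a single identifier. A second `:` ends the scan
	early, since a namespaced name has at most two parts.
*/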

/*
	A comment ends when we see `-->`, preceded by a character that's not a dash.
	"For compatibility, the string "--" (double-hyphen) must not occur within comments."
	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment

	Because the comment opener is four characters long, we always have enough
	lookback for the two-byte `--` check, and the peek at the next byte verifies
	that at least one more character follows and that it is a `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
	offset := t.offset

	for {
		advance_rune(t)
		ch := t.ch
		if ch < 0 {
			error(t, offset, "[parse] Comment was not terminated\n")
			return "", .Unclosed_Comment
		}

		if string(t.src[t.offset - 1:][:2]) == "--" {
			if peek_byte(t) == '>' {
				break
			} else {
				error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
				return "", .Invalid_Sequence_In_Comment
			}
		}
	}
	expect(t, .Dash)
	expect(t, .Gt)

	return string(t.src[offset : t.offset - 1]), .None
}

// Skip CDATA
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.read_offset + len(CDATA_START) >= len(t.src) {
		// Can't be the start of a CDATA tag.
		return .None
	}

	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
		t.read_offset += len(CDATA_START)
		offset := t.offset

		cdata_scan: for {
			advance_rune(t)
			if t.ch < 0 {
				error(t, offset, "[scan_string] CDATA was not terminated\n")
				return .Premature_EOF
			}

			// Scan until the end of a CDATA tag.
			if t.read_offset + len(CDATA_END) < len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
					t.read_offset += len(CDATA_END)
					break cdata_scan
				}
			}
		}
	}
	return
}

@(optimization_mode="favor_size")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None

	loop: for {
		ch := t.ch
		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.\n")
			return "", .Premature_EOF
		case '<':
			if peek_byte(t) == '!' {
				if peek_byte(t, 1) == '[' {
					// Might be the start of a CDATA tag.
					skip_cdata(t) or_return
				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
					// Comment start. Eat comment.
					t.read_offset += 3
					_ = scan_comment(t) or_return
				}
			}
		case '\n':
			if !multiline {
				error(t, offset, string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated\n")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if t.ch == close {
			// If it's not a CDATA or comment, it's the end of this body.
			break loop
		}
		advance_rune(t)
	}

	// Strip trailing whitespace.
	lit := string(t.src[offset : t.offset])

	end := len(lit)
	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}
	return lit, err
}

peek :: proc(t: ^Tokenizer) -> (token: Token) {
	old := t^
	token = scan(t)
	t^ = old
	return token
}
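
/*
	Note (illustrative): `peek` copies the whole tokenizer state, scans, and
	restores it, so the following `scan` yields the same token:

		next := peek(&t) // `t` is unchanged
		tok  := scan(&t) // consumes the token; `tok` matches `next`
*/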

scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	err:  Error
	lit:  string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit  = scan_identifier(t)
		kind = .Ident

	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
		case '<': kind = .Lt
		case '>': kind = .Gt
		case '!': kind = .Exclaim
		case '?': kind = .Question
		case '=': kind = .Eq
		case '#': kind = .Hash
		case '/': kind = .Slash
		case '-': kind = .Dash
		case ':': kind = .Colon
		case '"', '\'':
			kind = .Invalid
			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
			if err == .None {
				kind = .String
			}
		case '\n':
			lit = "\n"
		case:
			kind = .Invalid
		}
	}

	if kind != .String && lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}
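
/*
	Minimal driving loop (a sketch, not part of this file; assumes `src`
	holds the XML document text):

		t: Tokenizer
		init(&t, src, "example.xml")
		for {
			tok := scan(&t)
			if tok.kind == .EOF { break }
			fmt.printf("%v %q at %v:%v\n", tok.kind, tok.text, tok.pos.line, tok.pos.column)
		}
*/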