package encoding_xml

/*
	An XML 1.0 / 1.1 parser

	Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
	Made available under Odin's BSD-3 license.

	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"
import "core:strings"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  Pos,
}

Pos :: struct {
	file:   string,
	offset: int, // starting at 0
	line:   int, // starting at 1
	column: int, // starting at 1
}

Token_Kind :: enum {
	Invalid,

	Ident,
	Literal,
	Rune,
	String,

	Double_Quote,  // "
	Single_Quote,  // '
	Colon,         // :
	Eq,            // =
	Lt,            // <
	Gt,            // >
	Exclaim,       // !
	Question,      // ?
	Hash,          // #
	Slash,         // /
	Dash,          // -

	Open_Bracket,  // [
	Close_Bracket, // ]

	EOF,
}
CDATA_START   :: "<![CDATA["
CDATA_END     :: "]]>"

COMMENT_START :: "<!--"
COMMENT_END   :: "-->"

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  string,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}

init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}
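
/*
	Usage sketch (illustrative, not part of the original file):

		t: Tokenizer
		init(&t, `<?xml version="1.0"?><root/>`, "example.xml")

	After `init` returns, `t.ch` holds the first rune of the input; a
	leading byte order mark, if present, has already been skipped.
*/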
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos=pos, fmt=msg, args=args)
	}
	t.error_count += 1
}
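
// A minimal sketch, assuming you want diagnostics suppressed: any proc
// matching `Error_Handler` can be passed to `init` in place of
// `default_error_handler`. The name `silent_error_handler` is hypothetical
// and not part of the original file.
silent_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	// Intentionally empty: `error` still increments `t.error_count`,
	// but nothing is printed.
}

// Usage: init(&t, src, path, silent_error_handler)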
@(optimization_mode="favor_size")
advance_rune :: proc(t: ^Tokenizer) {
	#no_bounds_check {
		// Already bounds-checked here.
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			r, w := rune(t.src[t.read_offset]), 1
			switch {
			case r == 0:
				error(t, t.offset, "illegal character NUL")
			case r >= utf8.RUNE_SELF:
				r, w = #force_inline utf8.decode_rune_in_string(t.src[t.read_offset:])
				if r == utf8.RUNE_ERROR && w == 1 {
					error(t, t.offset, "illegal UTF-8 encoding")
				} else if r == utf8.RUNE_BOM && t.offset > 0 {
					error(t, t.offset, "illegal byte order mark")
				}
			}
			t.read_offset += w
			t.ch = r
		} else {
			t.offset = len(t.src)
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			t.ch = -1
		}
	}
}

peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset + offset < len(t.src) {
		#no_bounds_check return t.src[t.read_offset + offset]
	}
	return 0
}

@(optimization_mode="favor_size")
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\n':
			advance_rune(t)
		case:
			return
		}
	}
}

@(optimization_mode="favor_size")
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':        return true
		case 'A'..='Z', 'a'..='z': return true
		case '0'..='9':            return true
		case -1:                   return false
		}
	}
	if unicode.is_letter(r) || unicode.is_digit(r) {
		return true
	}
	return false
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset

	namespaced := false
	for is_valid_identifier_rune(t.ch) {
		advance_rune(t)
		if t.ch == ':' {
			// A namespaced attr can have at most two parts, `namespace:ident`.
			if namespaced {
				break
			}
			namespaced = true
		}
	}
	return string(t.src[offset : t.offset])
}
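
/*
	Illustrative behavior of `scan_identifier` (not from the original source):

		`href`           yields `href`
		`xlink:href="x"` yields `xlink:href`, stopping at the `=`
		`a:b:c`          yields `a:b`, stopping before the second `:`
*/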
/*
	A comment ends when we see -->, preceded by a character that's not a dash.

	"For compatibility, the string "--" (double-hyphen) must not occur within comments."
	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment

	Thanks to the length (4) of the comment start, we also have enough lookback,
	and the peek at the next byte asserts that there's at least one more character
	that's a `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
	offset := t.offset

	for {
		advance_rune(t)
		ch := t.ch
		if ch < 0 {
			error(t, offset, "[parse] Comment was not terminated\n")
			return "", .Unclosed_Comment
		}

		if string(t.src[t.offset - 1:][:2]) == "--" {
			if peek_byte(t) == '>' {
				break
			} else {
				error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
				return "", .Invalid_Sequence_In_Comment
			}
		}
	}

	expect(t, .Dash)
	expect(t, .Gt)
	return string(t.src[offset : t.offset - 1]), .None
}
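
/*
	Illustrative outcomes (not from the original source):

		`hello -->`      terminates the comment normally
		`a -- b -->`     returns .Invalid_Sequence_In_Comment, per the spec rule above
		input ending before `-->` returns .Unclosed_Comment
*/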
// Skip CDATA
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
		return .None
	}
	t.read_offset += len(CDATA_START)

	offset := t.offset
	cdata_scan: for {
		advance_rune(t)
		if t.ch < 0 {
			error(t, offset, "[skip_cdata] CDATA was not terminated\n")
			return .Premature_EOF
		}

		// Scan until the end of a CDATA tag.
		if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
			t.read_offset += len(CDATA_END)
			break cdata_scan
		}
	}
	return .None
}
@(optimization_mode="favor_size")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None

	loop: for {
		ch := t.ch
		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.\n")
			return "", .Premature_EOF

		case '<':
			if peek_byte(t) == '!' {
				if peek_byte(t, 1) == '[' {
					// Might be the start of a CDATA tag.
					skip_cdata(t) or_return
				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
					// Comment start. Eat comment.
					t.read_offset += 3
					_ = scan_comment(t) or_return
				}
			}

		case '\n':
			if !multiline {
				error(t, offset, string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated\n")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if t.ch == close {
			// If it's not a CDATA or comment, it's the end of this body.
			break loop
		}
		advance_rune(t)
	}

	// Strip trailing whitespace.
	lit := string(t.src[offset : t.offset])
	end := len(lit)

	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}
	return lit, err
}
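
// Illustrative note (not from the original source): given the remaining input
// `  two words   "` and close = '"', scan_string returns `  two words`;
// only trailing whitespace is stripped by the `eat` loop above, while
// leading whitespace is preserved.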
peek :: proc(t: ^Tokenizer) -> (token: Token) {
	old := t^
	token = scan(t)
	t^ = old
	return token
}

scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	err:  Error
	lit:  string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident

	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF

		case '<': kind = .Lt
		case '>': kind = .Gt
		case '!': kind = .Exclaim
		case '?': kind = .Question
		case '=': kind = .Eq
		case '#': kind = .Hash
		case '/': kind = .Slash
		case '-': kind = .Dash
		case ':': kind = .Colon

		case '[': kind = .Open_Bracket
		case ']': kind = .Close_Bracket

		case '"', '\'':
			kind = .Invalid
			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
			if err == .None {
				kind = .String
			}

		case '\n':
			lit = "\n"

		case:
			kind = .Invalid
		}
	}

	if kind != .String && lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}
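
// A minimal end-to-end sketch, not part of the original file: tokenize a small
// document and print every token until EOF. `example_scan_all` is a
// hypothetical helper and relies only on `init` and `scan` above.
@(private)
example_scan_all :: proc() {
	t: Tokenizer
	init(&t, `<greeting lang="en">Hi</greeting>`, "example.xml")
	for {
		tok := scan(&t)
		if tok.kind == .EOF {
			break
		}
		// e.g. `Lt "<" at 1:1`, then `Ident "greeting" at 1:2`, ...
		fmt.printf("%v %q at %v:%v\n", tok.kind, tok.text, tok.pos.line, tok.pos.column)
	}
}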