xml_reader.odin 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678
  1. package xml
  2. /*
  3. An XML 1.0 / 1.1 parser
  4. Copyright 2021 Jeroen van Rijn <[email protected]>.
  5. Made available under Odin's BSD-3 license.
  6. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
  7. Features:
  8. - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
  9. - Simple to understand and use. Small.
  10. Caveats:
  11. - We do NOT support HTML in this package, as that may or may not be valid XML.
  12. If it works, great. If it doesn't, that's not considered a bug.
  13. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
  14. - <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
  15. TODO:
  16. - Optional CDATA unboxing.
  17. - Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies.
  18. MAYBE:
  19. - XML writer?
  20. - Serialize/deserialize Odin types?
  21. List of contributors:
  22. Jeroen van Rijn: Initial implementation.
  23. */
  24. import "core:strings"
  25. import "core:mem"
  26. import "core:os"
  27. DEFAULT_Options :: Options{
  28. flags = {
  29. .Ignore_Unsupported,
  30. },
  31. expected_doctype = "",
  32. }
  33. Option_Flag :: enum {
  34. /*
  35. Document MUST start with `<?xml` prolog.
  36. */
  37. Must_Have_Prolog,
  38. /*
  39. Document MUST have a `<!DOCTYPE`.
  40. */
  41. Must_Have_DocType,
  42. /*
  43. By default we skip comments. Use this option to intern a comment on a parented Element.
  44. */
  45. Intern_Comments,
  46. /*
  47. How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
  48. */
  49. Error_on_Unsupported,
  50. Ignore_Unsupported,
  51. /*
  52. By default CDATA tags are passed-through as-is.
  53. This option unwraps them when encountered.
  54. */
  55. Unbox_CDATA,
  56. /*
  57. By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
  58. This option decodes them when encountered.
  59. */
  60. Decode_SGML_Entities,
  61. }
  62. Option_Flags :: bit_set[Option_Flag; u8]
  63. Document :: struct {
  64. root: ^Element,
  65. prolog: Attributes,
  66. encoding: Encoding,
  67. doctype: struct {
  68. /*
  69. We only scan the <!DOCTYPE IDENT part and skip the rest.
  70. */
  71. ident: string,
  72. rest: string,
  73. },
  74. /*
  75. If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
  76. Otherwise they'll be in the element tree.
  77. */
  78. comments: [dynamic]string,
  79. /*
  80. Internal
  81. */
  82. tokenizer: ^Tokenizer,
  83. allocator: mem.Allocator,
  84. intern: strings.Intern,
  85. }
  86. Element :: struct {
  87. ident: string,
  88. value: string,
  89. attribs: Attributes,
  90. kind: enum {
  91. Element = 0,
  92. Comment,
  93. },
  94. parent: ^Element,
  95. children: [dynamic]^Element,
  96. }
  97. Attr :: struct {
  98. key: string,
  99. val: string,
  100. }
  101. Attributes :: [dynamic]Attr
  102. Options :: struct {
  103. flags: Option_Flags,
  104. expected_doctype: string,
  105. }
  106. Encoding :: enum {
  107. Unknown,
  108. UTF_8,
  109. ISO_8859_1,
  110. /*
  111. Aliases
  112. */
  113. LATIN_1 = ISO_8859_1,
  114. }
  115. Error :: enum {
  116. /*
  117. General return values.
  118. */
  119. None = 0,
  120. General_Error,
  121. Unexpected_Token,
  122. Invalid_Token,
  123. /*
  124. Couldn't find, open or read file.
  125. */
  126. File_Error,
  127. /*
  128. File too short.
  129. */
  130. Premature_EOF,
  131. /*
  132. XML-specific errors.
  133. */
  134. No_Prolog,
  135. Invalid_Prolog,
  136. Too_Many_Prologs,
  137. No_DocType,
  138. Too_Many_DocTypes,
  139. DocType_Must_Proceed_Elements,
  140. /*
  141. If a DOCTYPE is present _or_ the caller
  142. asked for a specific DOCTYPE and the DOCTYPE
  143. and root tag don't match, we return `.Invalid_DocType`.
  144. */
  145. Invalid_DocType,
  146. Invalid_Tag_Value,
  147. Mismatched_Closing_Tag,
  148. Unclosed_Comment,
  149. Comment_Before_Root_Element,
  150. Invalid_Sequence_In_Comment,
  151. Unsupported_Version,
  152. Unsupported_Encoding,
  153. /*
  154. <!FOO are usually skipped.
  155. */
  156. Unhandled_Bang,
  157. Duplicate_Attribute,
  158. Conflicting_Options,
  159. /*
  160. Unhandled TODO:
  161. */
  162. Unhandled_CDATA_Unboxing,
  163. Unhandled_SGML_Entity_Decoding,
  164. }
  165. /*
  166. Implementation starts here.
  167. */
  168. parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  169. context.allocator = allocator
  170. opts := validate_options(options) or_return
  171. t := &Tokenizer{}
  172. init(t, string(data), path, error_handler)
  173. doc = new(Document)
  174. doc.allocator = allocator
  175. doc.tokenizer = t
  176. strings.intern_init(&doc.intern, allocator, allocator)
  177. err = .Unexpected_Token
  178. element, parent: ^Element
  179. tag_is_open := false
  180. /*
  181. If a DOCTYPE is present, the root tag has to match.
  182. If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
  183. */
  184. expected_doctype := options.expected_doctype
  185. loop: for {
  186. skip_whitespace(t)
  187. switch t.ch {
  188. case '<':
  189. /*
  190. Consume peeked `<`
  191. */
  192. tok := scan(t)
  193. open := scan(t)
  194. #partial switch open.kind {
  195. case .Question:
  196. /*
  197. <?xml
  198. */
  199. next := scan(t)
  200. #partial switch next.kind {
  201. case .Ident:
  202. if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
  203. parse_prolog(doc) or_return
  204. } else if len(doc.prolog) > 0 {
  205. /*
  206. We've already seen a prolog.
  207. */
  208. return doc, .Too_Many_Prologs
  209. } else {
  210. /*
  211. Could be `<?xml-stylesheet`, etc. Ignore it.
  212. */
  213. skip_element(t) or_return
  214. }
  215. case:
  216. error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)
  217. return
  218. }
  219. case .Exclaim:
  220. /*
  221. <!
  222. */
  223. next := scan(t)
  224. #partial switch next.kind {
  225. case .Ident:
  226. switch next.text {
  227. case "DOCTYPE":
  228. if len(doc.doctype.ident) > 0 {
  229. return doc, .Too_Many_DocTypes
  230. }
  231. if doc.root != nil {
  232. return doc, .DocType_Must_Proceed_Elements
  233. }
  234. parse_doctype(doc) or_return
  235. if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
  236. error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
  237. return doc, .Invalid_DocType
  238. }
  239. expected_doctype = doc.doctype.ident
  240. case:
  241. if .Error_on_Unsupported in opts.flags {
  242. error(t, t.offset, "Unhandled: <!%v\n", next.text)
  243. err = .Unhandled_Bang
  244. return
  245. }
  246. skip_element(t) or_return
  247. }
  248. case .Dash:
  249. /*
  250. Comment: <!-- -->.
  251. The grammar does not allow a comment to end in --->
  252. */
  253. expect(t, .Dash)
  254. offset := t.offset
  255. for {
  256. advance_rune(t)
  257. ch := t.ch
  258. /*
  259. A comment ends when we see -->, preceded by a character that's not a dash.
  260. "For compatibility, the string "--" (double-hyphen) must not occur within comments."
  261. See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
  262. Thanks to the length (4) of the comment start, we also have enough lookback,
  263. and the peek at the next byte asserts that there's at least one more character
  264. that's a `>`.
  265. */
  266. if ch < 0 {
  267. error(t, offset, "[parse] Comment was not terminated\n")
  268. return doc, .Unclosed_Comment
  269. }
  270. if string(t.src[t.offset - 1:][:2]) == "--" {
  271. if peek_byte(t) == '>' {
  272. break
  273. } else {
  274. error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
  275. return doc, .Invalid_Sequence_In_Comment
  276. }
  277. }
  278. }
  279. if .Intern_Comments in opts.flags {
  280. comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
  281. if doc.root == nil {
  282. append(&doc.comments, comment)
  283. } else {
  284. el := new(Element)
  285. el.parent = element
  286. el.kind = .Comment
  287. el.value = comment
  288. append(&element.children, el)
  289. }
  290. }
  291. expect(t, .Dash)
  292. expect(t, .Gt)
  293. case:
  294. error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
  295. return
  296. }
  297. case .Ident:
  298. /*
  299. e.g. <odin - Start of new element.
  300. */
  301. element = new(Element)
  302. tag_is_open = true
  303. if doc.root == nil {
  304. /*
  305. First element.
  306. */
  307. doc.root = element
  308. parent = element
  309. } else {
  310. append(&parent.children, element)
  311. }
  312. element.parent = parent
  313. element.ident = strings.intern_get(&doc.intern, open.text)
  314. parse_attributes(doc, &element.attribs) or_return
  315. /*
  316. If a DOCTYPE is present _or_ the caller
  317. asked for a specific DOCTYPE and the DOCTYPE
  318. and root tag don't match, we return .Invalid_Root_Tag.
  319. */
  320. if element == doc.root {
  321. if len(expected_doctype) > 0 && expected_doctype != open.text {
  322. error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
  323. return doc, .Invalid_DocType
  324. }
  325. }
  326. /*
  327. One of these should follow:
  328. - `>`, which means we've just opened this tag and expect a later element to close it.
  329. - `/>`, which means this is an 'empty' or self-closing tag.
  330. */
  331. end_token := scan(t)
  332. #partial switch end_token.kind {
  333. case .Gt:
  334. /*
  335. We're now the new parent.
  336. */
  337. parent = element
  338. case .Slash:
  339. /*
  340. Empty tag. Close it.
  341. */
  342. expect(t, .Gt) or_return
  343. parent = element.parent
  344. element = parent
  345. tag_is_open = false
  346. case:
  347. error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
  348. return
  349. }
  350. case .Slash:
  351. /*
  352. Close tag.
  353. */
  354. ident := expect(t, .Ident) or_return
  355. _ = expect(t, .Gt) or_return
  356. if element.ident != ident.text {
  357. error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
  358. return doc, .Mismatched_Closing_Tag
  359. }
  360. parent = element.parent
  361. element = parent
  362. tag_is_open = false
  363. case:
  364. error(t, t.offset, "Invalid Token after <: %#v\n", open)
  365. return
  366. }
  367. case -1:
  368. /*
  369. End of file.
  370. */
  371. if tag_is_open {
  372. return doc, .Premature_EOF
  373. }
  374. break loop
  375. case:
  376. /*
  377. This should be a tag's body text.
  378. */
  379. body_text := scan_string(t, t.offset) or_return
  380. element.value = strings.intern_get(&doc.intern, body_text)
  381. }
  382. }
  383. if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
  384. return doc, .No_Prolog
  385. }
  386. if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
  387. return doc, .No_DocType
  388. }
  389. return doc, .None
  390. }
  391. parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  392. context.allocator = allocator
  393. data, data_ok := os.read_entire_file(filename)
  394. defer delete(data)
  395. if !data_ok { return {}, .File_Error }
  396. return parse_from_slice(data, options, filename, error_handler, allocator)
  397. }
  398. parse :: proc { parse_from_file, parse_from_slice }
  399. free_element :: proc(element: ^Element) {
  400. if element == nil { return }
  401. for child in element.children {
  402. /*
  403. NOTE: Recursive.
  404. Could be rewritten so it adds them to a list of pointers to free.
  405. */
  406. free_element(child)
  407. }
  408. delete(element.attribs)
  409. delete(element.children)
  410. free(element)
  411. }
  412. destroy :: proc(doc: ^Document) {
  413. if doc == nil { return }
  414. free_element(doc.root)
  415. strings.intern_destroy(&doc.intern)
  416. delete(doc.prolog)
  417. delete(doc.comments)
  418. free(doc)
  419. }
  420. /*
  421. Helpers.
  422. */
  423. validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
  424. validated = options
  425. if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
  426. return options, .Conflicting_Options
  427. }
  428. if .Unbox_CDATA in validated.flags {
  429. return options, .Unhandled_CDATA_Unboxing
  430. }
  431. if .Decode_SGML_Entities in validated.flags {
  432. return options, .Unhandled_SGML_Entity_Decoding
  433. }
  434. return validated, .None
  435. }
  436. expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
  437. tok = scan(t)
  438. if tok.kind == kind { return tok, .None }
  439. error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
  440. return tok, .Unexpected_Token
  441. }
  442. parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
  443. assert(doc != nil)
  444. context.allocator = doc.allocator
  445. t := doc.tokenizer
  446. key := expect(t, .Ident) or_return
  447. offset = t.offset - len(key.text)
  448. _ = expect(t, .Eq) or_return
  449. value := expect(t, .String) or_return
  450. attr.key = strings.intern_get(&doc.intern, key.text)
  451. attr.val = strings.intern_get(&doc.intern, value.text)
  452. err = .None
  453. return
  454. }
  455. check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
  456. for a in attribs {
  457. if attr.key == a.key {
  458. error(t, offset, "Duplicate attribute: %v\n", attr.key)
  459. return .Duplicate_Attribute
  460. }
  461. }
  462. return .None
  463. }
  464. parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
  465. assert(doc != nil)
  466. context.allocator = doc.allocator
  467. t := doc.tokenizer
  468. for peek(t).kind == .Ident {
  469. attr, offset := parse_attribute(doc) or_return
  470. check_duplicate_attributes(t, attribs^, attr, offset) or_return
  471. append(attribs, attr)
  472. }
  473. skip_whitespace(t)
  474. return .None
  475. }
  476. parse_prolog :: proc(doc: ^Document) -> (err: Error) {
  477. assert(doc != nil)
  478. context.allocator = doc.allocator
  479. t := doc.tokenizer
  480. offset := t.offset
  481. parse_attributes(doc, &doc.prolog) or_return
  482. for attr in doc.prolog {
  483. switch attr.key {
  484. case "version":
  485. switch attr.val {
  486. case "1.0", "1.1":
  487. case:
  488. error(t, offset, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
  489. }
  490. case "encoding":
  491. switch strings.to_lower(attr.val, context.temp_allocator) {
  492. case "utf-8", "utf8":
  493. doc.encoding = .UTF_8
  494. case "latin-1", "latin1", "iso-8859-1":
  495. doc.encoding = .LATIN_1
  496. case:
  497. /*
  498. Unrecognized encoding, assume UTF-8.
  499. */
  500. error(t, offset, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
  501. }
  502. case:
  503. // Ignored.
  504. }
  505. }
  506. _ = expect(t, .Question) or_return
  507. _ = expect(t, .Gt) or_return
  508. return .None
  509. }
  510. skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
  511. close := 1
  512. loop: for {
  513. tok := scan(t)
  514. #partial switch tok.kind {
  515. case .EOF:
  516. error(t, t.offset, "[skip_element] Premature EOF\n")
  517. return .Premature_EOF
  518. case .Lt:
  519. close += 1
  520. case .Gt:
  521. close -= 1
  522. if close == 0 {
  523. break loop
  524. }
  525. case:
  526. }
  527. }
  528. return .None
  529. }
  530. parse_doctype :: proc(doc: ^Document) -> (err: Error) {
  531. /*
  532. <!DOCTYPE greeting SYSTEM "hello.dtd">
  533. <!DOCTYPE greeting [
  534. <!ELEMENT greeting (#PCDATA)>
  535. ]>
  536. */
  537. assert(doc != nil)
  538. context.allocator = doc.allocator
  539. t := doc.tokenizer
  540. tok := expect(t, .Ident) or_return
  541. doc.doctype.ident = strings.intern_get(&doc.intern, tok.text)
  542. skip_whitespace(t)
  543. offset := t.offset
  544. skip_element(t) or_return
  545. /*
  546. -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
  547. */
  548. doc.doctype.rest = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
  549. return .None
  550. }