// xml_reader.odin
/*
	An XML 1.0 / 1.1 parser

	Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
	Made available under Odin's BSD-3 license.

	A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).

	Features:
		- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
		- Simple to understand and use. Small.

	Caveats:
		- We do NOT support HTML in this package, as that may or may not be valid XML.
		  If it works, great. If it doesn't, that's not considered a bug.

		- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
		- `<!ELEMENT` and `<!ATTLIST` are not supported, and will be either ignored or return an error depending on the parser options.

	MAYBE:
		- XML writer?
		- Serialize/deserialize Odin types?

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/
  20. package xml
  21. // An XML 1.0 / 1.1 parser
  22. import "core:bytes"
  23. import "core:encoding/entity"
  24. import "core:intrinsics"
  25. import "core:mem"
  26. import "core:os"
  27. import "core:strings"
  28. likely :: intrinsics.expect
  29. DEFAULT_Options :: Options{
  30. flags = {
  31. .Ignore_Unsupported,
  32. },
  33. expected_doctype = "",
  34. }
  35. Option_Flag :: enum {
  36. /*
  37. If the caller says that input may be modified, we can perform in-situ parsing.
  38. If this flag isn't provided, the XML parser first duplicates the input so that it can.
  39. */
  40. Input_May_Be_Modified,
  41. /*
  42. Document MUST start with `<?xml` prolog.
  43. */
  44. Must_Have_Prolog,
  45. /*
  46. Document MUST have a `<!DOCTYPE`.
  47. */
  48. Must_Have_DocType,
  49. /*
  50. By default we skip comments. Use this option to intern a comment on a parented Element.
  51. */
  52. Intern_Comments,
  53. /*
  54. How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
  55. */
  56. Error_on_Unsupported,
  57. Ignore_Unsupported,
  58. /*
  59. By default CDATA tags are passed-through as-is.
  60. This option unwraps them when encountered.
  61. */
  62. Unbox_CDATA,
  63. /*
  64. By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
  65. This option decodes them when encountered.
  66. */
  67. Decode_SGML_Entities,
  68. /*
  69. If a tag body has a comment, it will be stripped unless this option is given.
  70. */
  71. Keep_Tag_Body_Comments,
  72. }
  73. Option_Flags :: bit_set[Option_Flag; u16]
  74. Document :: struct {
  75. elements: [dynamic]Element,
  76. element_count: Element_ID,
  77. prolog: Attributes,
  78. encoding: Encoding,
  79. doctype: struct {
  80. /*
  81. We only scan the <!DOCTYPE IDENT part and skip the rest.
  82. */
  83. ident: string,
  84. rest: string,
  85. },
  86. /*
  87. If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
  88. Otherwise they'll be in the element tree.
  89. */
  90. comments: [dynamic]string,
  91. /*
  92. Internal
  93. */
  94. tokenizer: ^Tokenizer,
  95. allocator: mem.Allocator,
  96. /*
  97. Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
  98. */
  99. input: []u8,
  100. strings_to_free: [dynamic]string,
  101. }
  102. Element :: struct {
  103. ident: string,
  104. value: string,
  105. attribs: Attributes,
  106. kind: enum {
  107. Element = 0,
  108. Comment,
  109. },
  110. parent: Element_ID,
  111. children: [dynamic]Element_ID,
  112. }
  113. Attr :: struct {
  114. key: string,
  115. val: string,
  116. }
  117. Attributes :: [dynamic]Attr
  118. Options :: struct {
  119. flags: Option_Flags,
  120. expected_doctype: string,
  121. }
  122. Encoding :: enum {
  123. Unknown,
  124. UTF_8,
  125. ISO_8859_1,
  126. /*
  127. Aliases
  128. */
  129. LATIN_1 = ISO_8859_1,
  130. }
  131. Error :: enum {
  132. /*
  133. General return values.
  134. */
  135. None = 0,
  136. General_Error,
  137. Unexpected_Token,
  138. Invalid_Token,
  139. /*
  140. Couldn't find, open or read file.
  141. */
  142. File_Error,
  143. /*
  144. File too short.
  145. */
  146. Premature_EOF,
  147. /*
  148. XML-specific errors.
  149. */
  150. No_Prolog,
  151. Invalid_Prolog,
  152. Too_Many_Prologs,
  153. No_DocType,
  154. Too_Many_DocTypes,
  155. DocType_Must_Preceed_Elements,
  156. /*
  157. If a DOCTYPE is present _or_ the caller
  158. asked for a specific DOCTYPE and the DOCTYPE
  159. and root tag don't match, we return `.Invalid_DocType`.
  160. */
  161. Invalid_DocType,
  162. Invalid_Tag_Value,
  163. Mismatched_Closing_Tag,
  164. Unclosed_Comment,
  165. Comment_Before_Root_Element,
  166. Invalid_Sequence_In_Comment,
  167. Unsupported_Version,
  168. Unsupported_Encoding,
  169. /*
  170. <!FOO are usually skipped.
  171. */
  172. Unhandled_Bang,
  173. Duplicate_Attribute,
  174. Conflicting_Options,
  175. }
  176. /*
  177. Implementation starts here.
  178. */
  179. parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  180. data := data
  181. context.allocator = allocator
  182. opts := validate_options(options) or_return
  183. /*
  184. If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
  185. */
  186. if .Input_May_Be_Modified not_in opts.flags {
  187. data = bytes.clone(data)
  188. }
  189. t := &Tokenizer{}
  190. init(t, string(data), path, error_handler)
  191. doc = new(Document)
  192. doc.allocator = allocator
  193. doc.tokenizer = t
  194. doc.input = data
  195. doc.elements = make([dynamic]Element, 1024, 1024, allocator)
  196. // strings.intern_init(&doc.intern, allocator, allocator)
  197. err = .Unexpected_Token
  198. element, parent: Element_ID
  199. tag_is_open := false
  200. first_element := true
  201. open: Token
  202. /*
  203. If a DOCTYPE is present, the root tag has to match.
  204. If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
  205. */
  206. expected_doctype := options.expected_doctype
  207. loop: for {
  208. skip_whitespace(t)
  209. // NOTE(Jeroen): This is faster as a switch.
  210. switch t.ch {
  211. case '<':
  212. /*
  213. Consume peeked `<`
  214. */
  215. advance_rune(t)
  216. open = scan(t)
  217. // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
  218. if likely(open.kind, Token_Kind.Ident) == .Ident {
  219. /*
  220. e.g. <odin - Start of new element.
  221. */
  222. element = new_element(doc)
  223. tag_is_open = true
  224. if first_element {
  225. /*
  226. First element.
  227. */
  228. parent = element
  229. first_element = false
  230. } else {
  231. append(&doc.elements[parent].children, element)
  232. }
  233. doc.elements[element].parent = parent
  234. doc.elements[element].ident = open.text
  235. parse_attributes(doc, &doc.elements[element].attribs) or_return
  236. /*
  237. If a DOCTYPE is present _or_ the caller
  238. asked for a specific DOCTYPE and the DOCTYPE
  239. and root tag don't match, we return .Invalid_Root_Tag.
  240. */
  241. if element == 0 { // Root tag?
  242. if len(expected_doctype) > 0 && expected_doctype != open.text {
  243. error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
  244. return doc, .Invalid_DocType
  245. }
  246. }
  247. /*
  248. One of these should follow:
  249. - `>`, which means we've just opened this tag and expect a later element to close it.
  250. - `/>`, which means this is an 'empty' or self-closing tag.
  251. */
  252. end_token := scan(t)
  253. #partial switch end_token.kind {
  254. case .Gt:
  255. /*
  256. We're now the new parent.
  257. */
  258. parent = element
  259. case .Slash:
  260. /*
  261. Empty tag. Close it.
  262. */
  263. expect(t, .Gt) or_return
  264. parent = doc.elements[element].parent
  265. element = parent
  266. tag_is_open = false
  267. case:
  268. error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
  269. return
  270. }
  271. } else if open.kind == .Slash {
  272. /*
  273. Close tag.
  274. */
  275. ident := expect(t, .Ident) or_return
  276. _ = expect(t, .Gt) or_return
  277. if doc.elements[element].ident != ident.text {
  278. error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
  279. return doc, .Mismatched_Closing_Tag
  280. }
  281. parent = doc.elements[element].parent
  282. element = parent
  283. tag_is_open = false
  284. } else if open.kind == .Exclaim {
  285. /*
  286. <!
  287. */
  288. next := scan(t)
  289. #partial switch next.kind {
  290. case .Ident:
  291. switch next.text {
  292. case "DOCTYPE":
  293. if len(doc.doctype.ident) > 0 {
  294. return doc, .Too_Many_DocTypes
  295. }
  296. if doc.element_count > 0 {
  297. return doc, .DocType_Must_Preceed_Elements
  298. }
  299. parse_doctype(doc) or_return
  300. if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
  301. error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
  302. return doc, .Invalid_DocType
  303. }
  304. expected_doctype = doc.doctype.ident
  305. case:
  306. if .Error_on_Unsupported in opts.flags {
  307. error(t, t.offset, "Unhandled: <!%v\n", next.text)
  308. return doc, .Unhandled_Bang
  309. }
  310. skip_element(t) or_return
  311. }
  312. case .Dash:
  313. /*
  314. Comment: <!-- -->.
  315. The grammar does not allow a comment to end in --->
  316. */
  317. expect(t, .Dash)
  318. comment := scan_comment(t) or_return
  319. if .Intern_Comments in opts.flags {
  320. if len(doc.elements) == 0 {
  321. append(&doc.comments, comment)
  322. } else {
  323. el := new_element(doc)
  324. doc.elements[el].parent = element
  325. doc.elements[el].kind = .Comment
  326. doc.elements[el].value = comment
  327. append(&doc.elements[element].children, el)
  328. }
  329. }
  330. case:
  331. error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
  332. return
  333. }
  334. } else if open.kind == .Question {
  335. /*
  336. <?xml
  337. */
  338. next := scan(t)
  339. #partial switch next.kind {
  340. case .Ident:
  341. if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
  342. parse_prolog(doc) or_return
  343. } else if len(doc.prolog) > 0 {
  344. /*
  345. We've already seen a prolog.
  346. */
  347. return doc, .Too_Many_Prologs
  348. } else {
  349. /*
  350. Could be `<?xml-stylesheet`, etc. Ignore it.
  351. */
  352. skip_element(t) or_return
  353. }
  354. case:
  355. error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
  356. return
  357. }
  358. } else {
  359. error(t, t.offset, "Invalid Token after <: %#v\n", open)
  360. return
  361. }
  362. case -1:
  363. /*
  364. End of file.
  365. */
  366. if tag_is_open {
  367. return doc, .Premature_EOF
  368. }
  369. break loop
  370. case:
  371. /*
  372. This should be a tag's body text.
  373. */
  374. body_text := scan_string(t, t.offset) or_return
  375. needs_processing := .Unbox_CDATA in opts.flags
  376. needs_processing |= .Decode_SGML_Entities in opts.flags
  377. if !needs_processing {
  378. doc.elements[element].value = body_text
  379. continue
  380. }
  381. decode_opts := entity.XML_Decode_Options{}
  382. if .Keep_Tag_Body_Comments not_in opts.flags {
  383. decode_opts += { .Comment_Strip }
  384. }
  385. if .Decode_SGML_Entities not_in opts.flags {
  386. decode_opts += { .No_Entity_Decode }
  387. }
  388. if .Unbox_CDATA in opts.flags {
  389. decode_opts += { .Unbox_CDATA }
  390. if .Decode_SGML_Entities in opts.flags {
  391. decode_opts += { .Decode_CDATA }
  392. }
  393. }
  394. decoded, decode_err := entity.decode_xml(body_text, decode_opts)
  395. if decode_err == .None {
  396. doc.elements[element].value = decoded
  397. append(&doc.strings_to_free, decoded)
  398. } else {
  399. doc.elements[element].value = body_text
  400. }
  401. }
  402. }
  403. if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
  404. return doc, .No_Prolog
  405. }
  406. if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
  407. return doc, .No_DocType
  408. }
  409. resize(&doc.elements, int(doc.element_count))
  410. return doc, .None
  411. }
  412. parse_from_string :: proc(data: string, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  413. _data := transmute([]u8)data
  414. return parse_from_slice(_data, options, path, error_handler, allocator)
  415. }
  416. parse :: proc { parse_from_string, parse_from_slice }
  417. // Load an XML file
  418. load_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  419. context.allocator = allocator
  420. options := options
  421. data, data_ok := os.read_entire_file(filename)
  422. if !data_ok { return {}, .File_Error }
  423. options.flags += { .Input_May_Be_Modified }
  424. return parse_from_slice(data, options, filename, error_handler, allocator)
  425. }
  426. destroy :: proc(doc: ^Document) {
  427. if doc == nil { return }
  428. for el in doc.elements {
  429. delete(el.attribs)
  430. delete(el.children)
  431. }
  432. delete(doc.elements)
  433. delete(doc.prolog)
  434. delete(doc.comments)
  435. delete(doc.input)
  436. for s in doc.strings_to_free {
  437. delete(s)
  438. }
  439. delete(doc.strings_to_free)
  440. free(doc)
  441. }
  442. /*
  443. Helpers.
  444. */
  445. validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
  446. validated = options
  447. if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
  448. return options, .Conflicting_Options
  449. }
  450. return validated, .None
  451. }
  452. expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
  453. tok = scan(t)
  454. if tok.kind == kind { return tok, .None }
  455. error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
  456. return tok, .Unexpected_Token
  457. }
  458. parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
  459. assert(doc != nil)
  460. context.allocator = doc.allocator
  461. t := doc.tokenizer
  462. key := expect(t, .Ident) or_return
  463. offset = t.offset - len(key.text)
  464. _ = expect(t, .Eq) or_return
  465. value := expect(t, .String) or_return
  466. attr.key = key.text
  467. attr.val = value.text
  468. err = .None
  469. return
  470. }
  471. check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
  472. for a in attribs {
  473. if attr.key == a.key {
  474. error(t, offset, "Duplicate attribute: %v\n", attr.key)
  475. return .Duplicate_Attribute
  476. }
  477. }
  478. return .None
  479. }
  480. parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
  481. assert(doc != nil)
  482. context.allocator = doc.allocator
  483. t := doc.tokenizer
  484. for peek(t).kind == .Ident {
  485. attr, offset := parse_attribute(doc) or_return
  486. check_duplicate_attributes(t, attribs^, attr, offset) or_return
  487. append(attribs, attr)
  488. }
  489. skip_whitespace(t)
  490. return .None
  491. }
  492. parse_prolog :: proc(doc: ^Document) -> (err: Error) {
  493. assert(doc != nil)
  494. context.allocator = doc.allocator
  495. t := doc.tokenizer
  496. offset := t.offset
  497. parse_attributes(doc, &doc.prolog) or_return
  498. for attr in doc.prolog {
  499. switch attr.key {
  500. case "version":
  501. switch attr.val {
  502. case "1.0", "1.1":
  503. case:
  504. error(t, offset, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
  505. }
  506. case "encoding":
  507. switch strings.to_lower(attr.val, context.temp_allocator) {
  508. case "utf-8", "utf8":
  509. doc.encoding = .UTF_8
  510. case "latin-1", "latin1", "iso-8859-1":
  511. doc.encoding = .LATIN_1
  512. case:
  513. /*
  514. Unrecognized encoding, assume UTF-8.
  515. */
  516. error(t, offset, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
  517. }
  518. case:
  519. // Ignored.
  520. }
  521. }
  522. _ = expect(t, .Question) or_return
  523. _ = expect(t, .Gt) or_return
  524. return .None
  525. }
  526. skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
  527. close := 1
  528. loop: for {
  529. tok := scan(t)
  530. #partial switch tok.kind {
  531. case .EOF:
  532. error(t, t.offset, "[skip_element] Premature EOF\n")
  533. return .Premature_EOF
  534. case .Lt:
  535. close += 1
  536. case .Gt:
  537. close -= 1
  538. if close == 0 {
  539. break loop
  540. }
  541. case:
  542. }
  543. }
  544. return .None
  545. }
  546. parse_doctype :: proc(doc: ^Document) -> (err: Error) {
  547. /*
  548. <!DOCTYPE greeting SYSTEM "hello.dtd">
  549. <!DOCTYPE greeting [
  550. <!ELEMENT greeting (#PCDATA)>
  551. ]>
  552. */
  553. assert(doc != nil)
  554. context.allocator = doc.allocator
  555. t := doc.tokenizer
  556. tok := expect(t, .Ident) or_return
  557. doc.doctype.ident = tok.text
  558. skip_whitespace(t)
  559. offset := t.offset
  560. skip_element(t) or_return
  561. /*
  562. -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
  563. */
  564. doc.doctype.rest = string(t.src[offset : t.offset - 1])
  565. return .None
  566. }
  567. Element_ID :: u32
  568. new_element :: proc(doc: ^Document) -> (id: Element_ID) {
  569. element_space := len(doc.elements)
  570. // Need to resize
  571. if int(doc.element_count) + 1 > element_space {
  572. if element_space < 65536 {
  573. element_space *= 2
  574. } else {
  575. element_space += 65536
  576. }
  577. resize(&doc.elements, element_space)
  578. }
  579. cur := doc.element_count
  580. doc.element_count += 1
  581. return cur
  582. }