xml_reader.odin 16 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713
  1. /*
  2. An XML 1.0 / 1.1 parser
  3. Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
  4. Made available under Odin's BSD-3 license.
  5. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
  6. Features:
  7. - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
  8. - Simple to understand and use. Small.
  9. Caveats:
  10. - We do NOT support HTML in this package, as that may or may not be valid XML.
  11. If it works, great. If it doesn't, that's not considered a bug.
  12. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
  13. - <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
  14. MAYBE:
  15. - XML writer?
  16. - Serialize/deserialize Odin types?
  17. List of contributors:
  18. Jeroen van Rijn: Initial implementation.
  19. */
  20. package xml
  21. // An XML 1.0 / 1.1 parser
  22. import "core:bytes"
  23. import "core:encoding/entity"
  24. import "core:intrinsics"
  25. import "core:mem"
  26. import "core:os"
  27. import "core:strings"
  // Branch-prediction hint: alias for the compiler intrinsic `expect`.
  likely :: intrinsics.expect
  // Options used when the caller passes none: unsupported constructs are skipped
  // and no particular DOCTYPE is expected (`expected_doctype` stays empty).
  DEFAULT_OPTIONS :: Options{
      flags = {.Ignore_Unsupported},
  }
  // Individual parser switches; combined into an `Option_Flags` bit set.
  Option_Flag :: enum {
      // The caller allows the input to be modified, so it can be parsed in-situ.
      // Without this flag the parser duplicates the input first.
      Input_May_Be_Modified,

      // The document MUST start with an `<?xml` prologue.
      Must_Have_Prolog,

      // The document MUST have a `<!DOCTYPE`.
      Must_Have_DocType,

      // Comments are skipped by default. With this flag a comment is interned
      // on its parent Element.
      Intern_Comments,

      // How to treat unsupported parts of the specification,
      // i.e. `<!` other than `<!DOCTYPE` and `<![CDATA[`.
      // These two flags are mutually exclusive.
      Error_on_Unsupported,
      Ignore_Unsupported,

      // CDATA tags are passed through as-is by default.
      // With this flag they are unwrapped when encountered.
      Unbox_CDATA,

      // SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed through as-is by default.
      // With this flag they are decoded when encountered.
      Decode_SGML_Entities,

      // A comment inside a tag body is stripped unless this flag is given.
      Keep_Tag_Body_Comments,
  }
  // Set of `Option_Flag` values, backed by a 16-bit integer.
  Option_Flags :: bit_set[Option_Flag; u16]
  // A parsed XML document. Created by `parse_bytes`, released with `destroy`.
  Document :: struct {
      // Flat element storage; an `Element_ID` is an index into this array.
      elements:      [dynamic]Element,
      // Number of slots of `elements` that are actually in use.
      element_count: Element_ID,

      // Attributes of the `<?xml ... ?>` prologue.
      prologue: Attributes,
      // Encoding named by the prologue, if it was recognised.
      encoding: Encoding,

      doctype: struct {
          // Only the `<!DOCTYPE IDENT` part is scanned; everything after it
          // is kept verbatim in `rest`.
          ident: string,
          rest:  string,
      },

      // Comments encountered before the root node live here when the option
      // to intern comments is given. All other comments end up in the element tree.
      comments: [dynamic]string,

      // Internal.
      tokenizer: ^Tokenizer,
      allocator: mem.Allocator,

      // The input: either the original buffer, or a copy of it
      // when `.Input_May_Be_Modified` isn't specified.
      input:           []u8,
      // Strings produced while decoding tag bodies; released by `destroy`.
      strings_to_free: [dynamic]string,
  }
  // One node of the document: a regular element, or an interned comment.
  Element :: struct {
      // Tag name.
      ident:   string,
      // Body text for an element, comment text for a comment.
      value:   string,
      attribs: Attributes,

      kind: enum {
          Element = 0,
          Comment,
      },

      // Index of the parent in `Document.elements`; the root element is its own parent.
      parent:   Element_ID,
      // Indices of the children, in the order they were encountered.
      children: [dynamic]Element_ID,
  }
  // A single `key="val"` pair as found on a tag or in the prologue.
  Attribute :: struct {
      key: string,
      val: string,
  }
  // Attribute list, in the order the attributes were parsed.
  Attributes :: [dynamic]Attribute
  // Parser configuration passed to the `parse_*` / `load_from_file` procedures.
  Options :: struct {
      // Behavioural switches, see `Option_Flag`.
      flags:            Option_Flags,
      // When non-empty, the DOCTYPE (if present) and the root tag have to match this name.
      expected_doctype: string,
  }
  // Document encodings recognised in the prologue.
  Encoding :: enum {
      Unknown,
      UTF_8,
      ISO_8859_1,

      // Aliases.
      LATIN_1 = ISO_8859_1,
  }
  // Error codes returned by this package. `.None` means success.
  Error :: enum {
      // General return values.
      None = 0,
      General_Error,
      Unexpected_Token,
      Invalid_Token,

      // Couldn't find, open or read file.
      File_Error,

      // File too short.
      Premature_EOF,

      // XML-specific errors.
      No_Prolog,
      Invalid_Prolog,
      Too_Many_Prologs,

      No_DocType,
      Too_Many_DocTypes,
      DocType_Must_Preceed_Elements,

      // Returned when a DOCTYPE is present _or_ the caller asked for a specific
      // DOCTYPE, and the DOCTYPE and root tag don't match.
      Invalid_DocType,

      Invalid_Tag_Value,
      Mismatched_Closing_Tag,

      Unclosed_Comment,
      Comment_Before_Root_Element,
      Invalid_Sequence_In_Comment,

      Unsupported_Version,
      Unsupported_Encoding,

      // `<!FOO` constructs are usually skipped.
      Unhandled_Bang,

      Duplicate_Attribute,
      Conflicting_Options,
  }
  174. /*
  175. Implementation starts here.
  176. */
  /*
      Parse an XML document held in a byte buffer.

      Inputs:
      - data:          The document. It is cloned first unless `.Input_May_Be_Modified` is set.
      - options:       Parser flags and the optional expected DOCTYPE name.
      - path:          Handed to the tokenizer's `init` together with `error_handler`.
      - error_handler: Handed to the tokenizer's `init`.
      - allocator:     Allocates the Document and everything that hangs off it.

      Returns:
      - doc: The document. It is also returned (partially built) alongside parse errors,
             so the caller can still `destroy` it. It is nil when the options fail validation.
      - err: `.None` on success, otherwise the reason parsing stopped.
  */
  parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
      data := data
      context.allocator = allocator

      opts := validate_options(options) or_return

      // Without permission to modify the caller's buffer we work on a private copy.
      if .Input_May_Be_Modified not_in opts.flags {
          data = bytes.clone(data)
      }

      t := &Tokenizer{}
      init(t, string(data), path, error_handler)

      doc = new(Document)
      doc.allocator = allocator
      doc.tokenizer = t
      doc.input     = data
      doc.elements  = make([dynamic]Element, 1024, 1024, allocator)

      // The tokenizer lives in this procedure's stack frame. The helper procedures reach it
      // through `doc.tokenizer` while we parse, but the pointer must not outlive this call.
      defer { doc.tokenizer = nil }

      // Bare `return`s below report this unless a more specific error was assigned.
      err = .Unexpected_Token
      element, parent: Element_ID

      tag_is_open   := false
      first_element := true
      open: Token

      /*
          If a DOCTYPE is present, the root tag has to match.
          If an expected DOCTYPE is given in options (i.e. it's non-empty),
          the DOCTYPE (if present) and root tag have to match.
      */
      expected_doctype := options.expected_doctype

      loop: for {
          skip_whitespace(t)
          // NOTE(Jeroen): This is faster as a switch.
          switch t.ch {
          case '<':
              // Consume peeked `<`
              advance_rune(t)

              open = scan(t)
              // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
              if likely(open.kind, Token_Kind.Ident) == .Ident {
                  // e.g. <odin - Start of new element.
                  element = new_element(doc)
                  tag_is_open = true

                  if first_element {
                      // The root element is its own parent.
                      parent        = element
                      first_element = false
                  } else {
                      append(&doc.elements[parent].children, element)
                  }

                  doc.elements[element].parent = parent
                  doc.elements[element].ident  = open.text

                  parse_attributes(doc, &doc.elements[element].attribs) or_return

                  /*
                      If a DOCTYPE is present _or_ the caller asked for a specific DOCTYPE
                      and the DOCTYPE and root tag don't match, we return `.Invalid_DocType`.
                  */
                  if element == 0 { // Root tag?
                      if len(expected_doctype) > 0 && expected_doctype != open.text {
                          error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
                          return doc, .Invalid_DocType
                      }
                  }

                  /*
                      One of these should follow:
                      - `>`,  which means we've just opened this tag and expect a later element to close it.
                      - `/>`, which means this is an 'empty' or self-closing tag.
                  */
                  end_token := scan(t)
                  #partial switch end_token.kind {
                  case .Gt:
                      // We're now the new parent.
                      parent = element

                  case .Slash:
                      // Empty tag. Close it.
                      expect(t, .Gt) or_return
                      parent      = doc.elements[element].parent
                      element     = parent
                      tag_is_open = false

                  case:
                      error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
                      return
                  }

              } else if open.kind == .Slash {
                  // Close tag.
                  ident := expect(t, .Ident) or_return
                  _ = expect(t, .Gt) or_return

                  if doc.elements[element].ident != ident.text {
                      error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
                      return doc, .Mismatched_Closing_Tag
                  }
                  parent      = doc.elements[element].parent
                  element     = parent
                  tag_is_open = false

              } else if open.kind == .Exclaim {
                  // <!
                  next := scan(t)
                  #partial switch next.kind {
                  case .Ident:
                      switch next.text {
                      case "DOCTYPE":
                          if len(doc.doctype.ident) > 0 {
                              return doc, .Too_Many_DocTypes
                          }
                          if doc.element_count > 0 {
                              return doc, .DocType_Must_Preceed_Elements
                          }
                          parse_doctype(doc) or_return

                          if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
                              error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
                              return doc, .Invalid_DocType
                          }
                          expected_doctype = doc.doctype.ident

                      case:
                          if .Error_on_Unsupported in opts.flags {
                              error(t, t.offset, "Unhandled: <!%v\n", next.text)
                              return doc, .Unhandled_Bang
                          }
                          skip_element(t) or_return
                      }

                  case .Dash:
                      /*
                          Comment: <!-- -->.
                          The grammar does not allow a comment to end in --->
                      */
                      // `<!-` must be followed by a second dash; propagate the error if it isn't.
                      expect(t, .Dash) or_return
                      comment := scan_comment(t) or_return

                      if .Intern_Comments in opts.flags {
                          // `doc.elements` is pre-sized, so its length says nothing about
                          // whether a root element exists yet; `element_count` does.
                          if doc.element_count == 0 {
                              append(&doc.comments, comment)
                          } else {
                              el := new_element(doc)
                              doc.elements[el].parent = element
                              doc.elements[el].kind   = .Comment
                              doc.elements[el].value  = comment
                              append(&doc.elements[element].children, el)
                          }
                      }

                  case:
                      error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
                      return
                  }

              } else if open.kind == .Question {
                  // <?xml
                  next := scan(t)
                  #partial switch next.kind {
                  case .Ident:
                      is_xml_declaration := len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml"
                      if is_xml_declaration {
                          if len(doc.prologue) > 0 {
                              // We've already seen a prologue.
                              return doc, .Too_Many_Prologs
                          }
                          parse_prologue(doc) or_return
                      } else {
                          // Could be `<?xml-stylesheet`, etc. Ignore it, whether or not a prologue preceded it.
                          skip_element(t) or_return
                      }
                  case:
                      error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
                      return
                  }

              } else {
                  error(t, t.offset, "Invalid Token after <: %#v\n", open)
                  return
              }

          case -1:
              // End of file.
              if tag_is_open {
                  return doc, .Premature_EOF
              }
              break loop

          case:
              // This should be a tag's body text.
              body_text := scan_string(t, t.offset) or_return

              needs_processing := .Unbox_CDATA in opts.flags || .Decode_SGML_Entities in opts.flags
              if !needs_processing {
                  doc.elements[element].value = body_text
                  continue
              }

              decode_opts := entity.XML_Decode_Options{}
              if .Keep_Tag_Body_Comments not_in opts.flags {
                  decode_opts += { .Comment_Strip }
              }
              if .Decode_SGML_Entities not_in opts.flags {
                  decode_opts += { .No_Entity_Decode }
              }
              if .Unbox_CDATA in opts.flags {
                  decode_opts += { .Unbox_CDATA }
                  if .Decode_SGML_Entities in opts.flags {
                      decode_opts += { .Decode_CDATA }
                  }
              }

              decoded, decode_err := entity.decode_xml(body_text, decode_opts)
              if decode_err == .None {
                  // The decoded copy is owned by the document and released in `destroy`.
                  doc.elements[element].value = decoded
                  append(&doc.strings_to_free, decoded)
              } else {
                  // Decoding failed: fall back to the raw body text.
                  doc.elements[element].value = body_text
              }
          }
      }

      if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 {
          return doc, .No_Prolog
      }

      if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
          return doc, .No_DocType
      }

      // Trim the pre-sized element array down to the slots actually used.
      resize(&doc.elements, int(doc.element_count))
      return doc, .None
  }
  // Same as `parse_bytes`, for callers holding the document as a `string`.
  // The string's bytes are reinterpreted as `[]u8` without copying.
  parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
      return parse_bytes(transmute([]u8)data, options, path, error_handler, allocator)
  }
  // Overload set: `parse` accepts the document either as a `string` or as `[]u8`.
  parse :: proc { parse_string, parse_bytes }
  /*
      Load an XML file and parse it.

      The file is read in full and parsed in place: `.Input_May_Be_Modified` is added to
      the options because the buffer is ours. The returned Document takes ownership of
      that buffer via `doc.input`.

      Returns `.File_Error` when the file can't be read, otherwise whatever `parse_bytes` returns.
  */
  load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
      context.allocator = allocator
      options := options

      data, data_ok := os.read_entire_file(filename)
      if !data_ok { return nil, .File_Error }

      options.flags += { .Input_May_Be_Modified }

      doc, err = parse_bytes(data, options, filename, error_handler, allocator)
      if doc == nil {
          // No Document was created (e.g. conflicting options), so nothing owns the
          // file buffer. Release it here instead of leaking it.
          delete(data)
      }
      return
  }
  /*
      Release a Document and everything it owns. Passing nil is a no-op.

      The input buffer, the decoded strings and the Document itself are allocated with
      the allocator handed to the parser, so they are freed through `doc.allocator`
      rather than whatever `context.allocator` happens to be at the call site.

      NOTE(review): `doc.input` is deleted unconditionally, also when the parser was told
      `.Input_May_Be_Modified` and is using the caller's own buffer. Confirm that callers expect this.
  */
  destroy :: proc(doc: ^Document) {
      if doc == nil { return }

      // A Document not produced by the parser may carry a zero allocator;
      // in that case keep using the ambient one.
      if doc.allocator.procedure != nil {
          context.allocator = doc.allocator
      }

      for el in doc.elements {
          delete(el.attribs)
          delete(el.children)
      }
      delete(doc.elements)

      delete(doc.prologue)
      delete(doc.comments)
      delete(doc.input)

      for s in doc.strings_to_free {
          delete(s)
      }
      delete(doc.strings_to_free)

      free(doc)
  }
  440. /*
  441. Helpers.
  442. */
  // Checks the options for contradictions. The options are returned unchanged;
  // `err` is `.Conflicting_Options` when both `.Error_on_Unsupported` and
  // `.Ignore_Unsupported` are requested.
  validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
      mutually_exclusive := Option_Flags{.Error_on_Unsupported, .Ignore_Unsupported}

      // Superset test: both conflicting flags are present.
      if options.flags >= mutually_exclusive {
          return options, .Conflicting_Options
      }
      return options, .None
  }
  // Scans the next token and requires it to be of `kind`.
  // The scanned token is returned either way; on a mismatch the error is reported
  // through `error` and `err` is `.Unexpected_Token`.
  expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
      tok = scan(t)
      if tok.kind != kind {
          error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
          err = .Unexpected_Token
      }
      return
  }
  // Parses one `key="value"` attribute from the document's tokenizer.
  // `offset` is the tokenizer offset minus the key's length, for use in error reports.
  parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      t := doc.tokenizer

      name := expect(t, .Ident) or_return
      offset = t.offset - len(name.text)

      expect(t, .Eq) or_return
      quoted := expect(t, .String) or_return

      attr = Attribute{key = name.text, val = quoted.text}
      return attr, offset, .None
  }
  // Returns `.Duplicate_Attribute` (after reporting it at `offset`) when `attribs`
  // already holds an attribute with the same key as `attr`.
  check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
      for i in 0..<len(attribs) {
          if attribs[i].key != attr.key {
              continue
          }
          error(t, offset, "Duplicate attribute: %v\n", attr.key)
          return .Duplicate_Attribute
      }
      return .None
  }
  // Parses attributes into `attribs` for as long as the next token is an identifier,
  // rejecting duplicate keys, then skips trailing whitespace.
  parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      t := doc.tokenizer

      for {
          if peek(t).kind != .Ident {
              break
          }
          attr, offset := parse_attribute(doc) or_return
          check_duplicate_attributes(t, attribs^, attr, offset) or_return
          append(attribs, attr)
      }

      skip_whitespace(t)
      return .None
  }
  // Parses the attributes of the `<?xml ... ?>` prologue into `doc.prologue`,
  // records a recognised encoding and consumes the closing `?>`.
  // Unknown versions and encodings only produce a warning through `error`.
  parse_prologue :: proc(doc: ^Document) -> (err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      t := doc.tokenizer

      // Warnings are reported at the position where the attribute list started.
      offset := t.offset
      parse_attributes(doc, &doc.prologue) or_return

      for attr in doc.prologue {
          if attr.key == "version" {
              if attr.val != "1.0" && attr.val != "1.1" {
                  error(t, offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
              }
          } else if attr.key == "encoding" {
              switch strings.to_lower(attr.val, context.temp_allocator) {
              case "utf-8", "utf8":
                  doc.encoding = .UTF_8

              case "latin-1", "latin1", "iso-8859-1":
                  doc.encoding = .LATIN_1

              case:
                  // Unrecognized encoding, assume UTF-8.
                  error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
              }
          }
          // Any other attribute is ignored.
      }

      expect(t, .Question) or_return
      expect(t, .Gt) or_return

      return .None
  }
  // Skips tokens up to and including the `>` that closes the construct we're in.
  // Nested `<` ... `>` pairs are balanced along the way; hitting EOF first is an error.
  skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
      // We start inside one open `<`.
      depth := 1

      for depth > 0 {
          #partial switch scan(t).kind {
          case .EOF:
              error(t, t.offset, "[skip_element] Premature EOF\n")
              return .Premature_EOF

          case .Lt:
              depth += 1

          case .Gt:
              depth -= 1
          }
      }
      return .None
  }
  /*
      Parses what follows `<!DOCTYPE`, e.g.

          <!DOCTYPE greeting SYSTEM "hello.dtd">
          <!DOCTYPE greeting [
              <!ELEMENT greeting (#PCDATA)>
          ]>

      The identifier is stored in `doc.doctype.ident`; everything between it and the
      closing `>` is stored unparsed in `doc.doctype.rest`.
  */
  parse_doctype :: proc(doc: ^Document) -> (err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      t := doc.tokenizer

      name := expect(t, .Ident) or_return
      doc.doctype.ident = name.text

      skip_whitespace(t)
      rest_start := t.offset
      skip_element(t) or_return

      // The tokenizer now sits just past the closing `>`, so the rest of the
      // DOCTYPE ends one byte before the current offset.
      rest_end := t.offset - 1
      doc.doctype.rest = string(t.src[rest_start:rest_end])
      return .None
  }
  // Index of an element in `Document.elements`.
  Element_ID :: u32
  /*
      Reserves the next free slot in `doc.elements` and returns its ID.

      The backing array is grown when it is full: from empty to 1024 slots, doubling
      below 65536 slots, and by 65536 slots at a time after that.
  */
  new_element :: proc(doc: ^Document) -> (id: Element_ID) {
      element_space := len(doc.elements)

      // Need to resize
      if int(doc.element_count) + 1 > element_space {
          if element_space == 0 {
              // Doubling zero would stay at zero and hand out an out-of-bounds ID,
              // e.g. for a Document that was trimmed to zero elements after parsing.
              element_space = 1024
          } else if element_space < 65536 {
              element_space *= 2
          } else {
              element_space += 65536
          }
          resize(&doc.elements, element_space)
      }

      cur := doc.element_count
      doc.element_count += 1
      return cur
  }