xml_reader.odin 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646
  1. package xml
  2. /*
  3. An XML 1.0 / 1.1 parser
  4. Copyright 2021 Jeroen van Rijn <[email protected]>.
  5. Made available under Odin's BSD-3 license.
  6. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
  7. Features:
  8. - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
  9. - Simple to understand and use. Small.
  10. Caveats:
  11. - We do NOT support HTML in this package, as that may or may not be valid XML.
  12. If it works, great. If it doesn't, that's not considered a bug.
  13. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
  14. - <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
  15. TODO:
  16. - Optional CDATA unboxing.
  17. - Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies.
  18. MAYBE:
  19. - XML writer?
  20. - Serialize/deserialize Odin types?
  21. List of contributors:
  22. Jeroen van Rijn: Initial implementation.
  23. */
  24. import "core:strings"
  25. import "core:mem"
  26. import "core:os"
  27. DEFAULT_Options :: Options{
  28. flags = {
  29. .Ignore_Unsupported,
  30. },
  31. expected_doctype = "",
  32. }
  33. Option_Flag :: enum {
  34. /*
  35. Document MUST start with `<?xml` prolog.
  36. */
  37. Must_Have_Prolog,
  38. /*
  39. Document MUST have a `<!DOCTYPE`.
  40. */
  41. Must_Have_DocType,
  42. /*
  43. By default we skip comments. Use this option to intern a comment on a parented Element.
  44. */
  45. Intern_Comments,
  46. /*
  47. How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
  48. */
  49. Error_on_Unsupported,
  50. Ignore_Unsupported,
  51. /*
  52. By default CDATA tags are passed-through as-is.
  53. This option unwraps them when encountered.
  54. */
  55. Unbox_CDATA,
  56. /*
  57. By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
  58. This option decodes them when encountered.
  59. */
  60. Decode_SGML_Entities,
  61. }
  62. Option_Flags :: bit_set[Option_Flag; u8]
  63. Document :: struct {
  64. root: ^Element,
  65. prolog: Attributes,
  66. encoding: Encoding,
  67. doctype: struct {
  68. /*
  69. We only scan the <!DOCTYPE IDENT part and skip the rest.
  70. */
  71. ident: string,
  72. rest: string,
  73. },
  74. /*
  75. If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
  76. Otherwise they'll be in the element tree.
  77. */
  78. comments: [dynamic]string,
  79. /*
  80. Internal
  81. */
  82. tokenizer: ^Tokenizer,
  83. allocator: mem.Allocator,
  84. intern: strings.Intern,
  85. }
  86. Element :: struct {
  87. ident: string,
  88. value: string,
  89. attribs: Attributes,
  90. kind: enum {
  91. Element = 0,
  92. Comment,
  93. },
  94. parent: ^Element,
  95. children: [dynamic]^Element,
  96. }
  97. Attr :: struct {
  98. key: string,
  99. val: string,
  100. }
  101. Attributes :: [dynamic]Attr
  102. Options :: struct {
  103. flags: Option_Flags,
  104. expected_doctype: string,
  105. }
  106. Encoding :: enum {
  107. Unknown,
  108. UTF_8,
  109. ISO_8859_1,
  110. /*
  111. Aliases
  112. */
  113. LATIN_1 = ISO_8859_1,
  114. }
  115. Error :: enum {
  116. /*
  117. General return values.
  118. */
  119. None = 0,
  120. General_Error,
  121. Unexpected_Token,
  122. Invalid_Token,
  123. /*
  124. Couldn't find, open or read file.
  125. */
  126. File_Error,
  127. /*
  128. File too short.
  129. */
  130. Premature_EOF,
  131. /*
  132. XML-specific errors.
  133. */
  134. No_Prolog,
  135. Invalid_Prolog,
  136. Too_Many_Prologs,
  137. No_DocType,
  138. Too_Many_DocTypes,
  139. DocType_Must_Proceed_Elements,
  140. /*
  141. If a DOCTYPE is present _or_ the caller
  142. asked for a specific DOCTYPE and the DOCTYPE
  143. and root tag don't match, we return `.Invalid_DocType`.
  144. */
  145. Invalid_DocType,
  146. Invalid_Tag_Value,
  147. Mismatched_Closing_Tag,
  148. Unclosed_Comment,
  149. Comment_Before_Root_Element,
  150. Invalid_Sequence_In_Comment,
  151. Unsupported_Version,
  152. Unsupported_Encoding,
  153. /*
  154. <!FOO are usually skipped.
  155. */
  156. Unhandled_Bang,
  157. Duplicate_Attribute,
  158. Conflicting_Options,
  159. /*
  160. Unhandled TODO:
  161. */
  162. Unhandled_CDATA_Unboxing,
  163. Unhandled_SGML_Entity_Decoding,
  164. }
  165. /*
  166. Implementation starts here.
  167. */
  168. parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  169. context.allocator = allocator
  170. opts := validate_options(options) or_return
  171. t := &Tokenizer{}
  172. init(t, string(data), path, error_handler)
  173. doc = new(Document)
  174. doc.allocator = allocator
  175. doc.tokenizer = t
  176. strings.intern_init(&doc.intern, allocator, allocator)
  177. err = .Unexpected_Token
  178. element, parent: ^Element
  179. tag_is_open := false
  180. /*
  181. If a DOCTYPE is present, the root tag has to match.
  182. If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
  183. */
  184. expected_doctype := options.expected_doctype
  185. loop: for {
  186. skip_whitespace(t)
  187. switch t.ch {
  188. case '<':
  189. /*
  190. Consume peeked `<`
  191. */
  192. advance_rune(t)
  193. open := scan(t)
  194. #partial switch open.kind {
  195. case .Question:
  196. /*
  197. <?xml
  198. */
  199. next := scan(t)
  200. #partial switch next.kind {
  201. case .Ident:
  202. if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
  203. parse_prolog(doc) or_return
  204. } else if len(doc.prolog) > 0 {
  205. /*
  206. We've already seen a prolog.
  207. */
  208. return doc, .Too_Many_Prologs
  209. } else {
  210. /*
  211. Could be `<?xml-stylesheet`, etc. Ignore it.
  212. */
  213. skip_element(t) or_return
  214. }
  215. case:
  216. error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
  217. return
  218. }
  219. case .Exclaim:
  220. /*
  221. <!
  222. */
  223. next := scan(t)
  224. #partial switch next.kind {
  225. case .Ident:
  226. switch next.text {
  227. case "DOCTYPE":
  228. if len(doc.doctype.ident) > 0 {
  229. return doc, .Too_Many_DocTypes
  230. }
  231. if doc.root != nil {
  232. return doc, .DocType_Must_Proceed_Elements
  233. }
  234. parse_doctype(doc) or_return
  235. if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
  236. error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
  237. return doc, .Invalid_DocType
  238. }
  239. expected_doctype = doc.doctype.ident
  240. case:
  241. if .Error_on_Unsupported in opts.flags {
  242. error(t, t.offset, "Unhandled: <!%v\n", next.text)
  243. err = .Unhandled_Bang
  244. return
  245. }
  246. skip_element(t) or_return
  247. }
  248. case .Dash:
  249. /*
  250. Comment: <!-- -->.
  251. The grammar does not allow a comment to end in --->
  252. */
  253. expect(t, .Dash)
  254. comment := scan_comment(t) or_return
  255. if .Intern_Comments in opts.flags {
  256. comment = strings.intern_get(&doc.intern, comment)
  257. if doc.root == nil {
  258. append(&doc.comments, comment)
  259. } else {
  260. el := new(Element)
  261. el.parent = element
  262. el.kind = .Comment
  263. el.value = comment
  264. append(&element.children, el)
  265. }
  266. }
  267. case:
  268. error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
  269. return
  270. }
  271. case .Ident:
  272. /*
  273. e.g. <odin - Start of new element.
  274. */
  275. element = new(Element)
  276. tag_is_open = true
  277. if doc.root == nil {
  278. /*
  279. First element.
  280. */
  281. doc.root = element
  282. parent = element
  283. } else {
  284. append(&parent.children, element)
  285. }
  286. element.parent = parent
  287. element.ident = strings.intern_get(&doc.intern, open.text)
  288. parse_attributes(doc, &element.attribs) or_return
  289. /*
  290. If a DOCTYPE is present _or_ the caller
  291. asked for a specific DOCTYPE and the DOCTYPE
  292. and root tag don't match, we return .Invalid_Root_Tag.
  293. */
  294. if element == doc.root {
  295. if len(expected_doctype) > 0 && expected_doctype != open.text {
  296. error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
  297. return doc, .Invalid_DocType
  298. }
  299. }
  300. /*
  301. One of these should follow:
  302. - `>`, which means we've just opened this tag and expect a later element to close it.
  303. - `/>`, which means this is an 'empty' or self-closing tag.
  304. */
  305. end_token := scan(t)
  306. #partial switch end_token.kind {
  307. case .Gt:
  308. /*
  309. We're now the new parent.
  310. */
  311. parent = element
  312. case .Slash:
  313. /*
  314. Empty tag. Close it.
  315. */
  316. expect(t, .Gt) or_return
  317. parent = element.parent
  318. element = parent
  319. tag_is_open = false
  320. case:
  321. error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
  322. return
  323. }
  324. case .Slash:
  325. /*
  326. Close tag.
  327. */
  328. ident := expect(t, .Ident) or_return
  329. _ = expect(t, .Gt) or_return
  330. if element.ident != ident.text {
  331. error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
  332. return doc, .Mismatched_Closing_Tag
  333. }
  334. parent = element.parent
  335. element = parent
  336. tag_is_open = false
  337. case:
  338. error(t, t.offset, "Invalid Token after <: %#v\n", open)
  339. return
  340. }
  341. case -1:
  342. /*
  343. End of file.
  344. */
  345. if tag_is_open {
  346. return doc, .Premature_EOF
  347. }
  348. break loop
  349. case:
  350. /*
  351. This should be a tag's body text.
  352. */
  353. body_text := scan_string(t, t.offset) or_return
  354. element.value = strings.intern_get(&doc.intern, body_text)
  355. }
  356. }
  357. if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
  358. return doc, .No_Prolog
  359. }
  360. if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
  361. return doc, .No_DocType
  362. }
  363. return doc, .None
  364. }
  365. parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  366. context.allocator = allocator
  367. data, data_ok := os.read_entire_file(filename)
  368. defer delete(data)
  369. if !data_ok { return {}, .File_Error }
  370. return parse_from_slice(data, options, filename, error_handler, allocator)
  371. }
  372. parse :: proc { parse_from_file, parse_from_slice }
  373. free_element :: proc(element: ^Element) {
  374. if element == nil { return }
  375. for child in element.children {
  376. /*
  377. NOTE: Recursive.
  378. Could be rewritten so it adds them to a list of pointers to free.
  379. */
  380. free_element(child)
  381. }
  382. delete(element.attribs)
  383. delete(element.children)
  384. free(element)
  385. }
  386. destroy :: proc(doc: ^Document) {
  387. if doc == nil { return }
  388. free_element(doc.root)
  389. strings.intern_destroy(&doc.intern)
  390. delete(doc.prolog)
  391. delete(doc.comments)
  392. free(doc)
  393. }
  394. /*
  395. Helpers.
  396. */
  397. validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
  398. validated = options
  399. if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
  400. return options, .Conflicting_Options
  401. }
  402. if .Unbox_CDATA in validated.flags {
  403. return options, .Unhandled_CDATA_Unboxing
  404. }
  405. if .Decode_SGML_Entities in validated.flags {
  406. return options, .Unhandled_SGML_Entity_Decoding
  407. }
  408. return validated, .None
  409. }
  410. expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
  411. tok = scan(t)
  412. if tok.kind == kind { return tok, .None }
  413. error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
  414. return tok, .Unexpected_Token
  415. }
  416. parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
  417. assert(doc != nil)
  418. context.allocator = doc.allocator
  419. t := doc.tokenizer
  420. key := expect(t, .Ident) or_return
  421. offset = t.offset - len(key.text)
  422. _ = expect(t, .Eq) or_return
  423. value := expect(t, .String) or_return
  424. attr.key = strings.intern_get(&doc.intern, key.text)
  425. attr.val = strings.intern_get(&doc.intern, value.text)
  426. err = .None
  427. return
  428. }
  429. check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
  430. for a in attribs {
  431. if attr.key == a.key {
  432. error(t, offset, "Duplicate attribute: %v\n", attr.key)
  433. return .Duplicate_Attribute
  434. }
  435. }
  436. return .None
  437. }
  438. parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
  439. assert(doc != nil)
  440. context.allocator = doc.allocator
  441. t := doc.tokenizer
  442. for peek(t).kind == .Ident {
  443. attr, offset := parse_attribute(doc) or_return
  444. check_duplicate_attributes(t, attribs^, attr, offset) or_return
  445. append(attribs, attr)
  446. }
  447. skip_whitespace(t)
  448. return .None
  449. }
  450. parse_prolog :: proc(doc: ^Document) -> (err: Error) {
  451. assert(doc != nil)
  452. context.allocator = doc.allocator
  453. t := doc.tokenizer
  454. offset := t.offset
  455. parse_attributes(doc, &doc.prolog) or_return
  456. for attr in doc.prolog {
  457. switch attr.key {
  458. case "version":
  459. switch attr.val {
  460. case "1.0", "1.1":
  461. case:
  462. error(t, offset, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
  463. }
  464. case "encoding":
  465. switch strings.to_lower(attr.val, context.temp_allocator) {
  466. case "utf-8", "utf8":
  467. doc.encoding = .UTF_8
  468. case "latin-1", "latin1", "iso-8859-1":
  469. doc.encoding = .LATIN_1
  470. case:
  471. /*
  472. Unrecognized encoding, assume UTF-8.
  473. */
  474. error(t, offset, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
  475. }
  476. case:
  477. // Ignored.
  478. }
  479. }
  480. _ = expect(t, .Question) or_return
  481. _ = expect(t, .Gt) or_return
  482. return .None
  483. }
  484. skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
  485. close := 1
  486. loop: for {
  487. tok := scan(t)
  488. #partial switch tok.kind {
  489. case .EOF:
  490. error(t, t.offset, "[skip_element] Premature EOF\n")
  491. return .Premature_EOF
  492. case .Lt:
  493. close += 1
  494. case .Gt:
  495. close -= 1
  496. if close == 0 {
  497. break loop
  498. }
  499. case:
  500. }
  501. }
  502. return .None
  503. }
  504. parse_doctype :: proc(doc: ^Document) -> (err: Error) {
  505. /*
  506. <!DOCTYPE greeting SYSTEM "hello.dtd">
  507. <!DOCTYPE greeting [
  508. <!ELEMENT greeting (#PCDATA)>
  509. ]>
  510. */
  511. assert(doc != nil)
  512. context.allocator = doc.allocator
  513. t := doc.tokenizer
  514. tok := expect(t, .Ident) or_return
  515. doc.doctype.ident = strings.intern_get(&doc.intern, tok.text)
  516. skip_whitespace(t)
  517. offset := t.offset
  518. skip_element(t) or_return
  519. /*
  520. -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
  521. */
  522. doc.doctype.rest = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
  523. return .None
  524. }