xml_reader.odin 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. /*
  2. XML 1.0 / 1.1 parser
  3. 2021-2022 Jeroen van Rijn <[email protected]>.
  4. available under Odin's BSD-3 license.
  5. from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
  6. Features:
  7. - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
  8. - Simple to understand and use. Small.
  9. Caveats:
  10. - We do NOT support HTML in this package, as that may or may not be valid XML.
  11. If it works, great. If it doesn't, that's not considered a bug.
  12. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
  13. - <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
  14. MAYBE:
  15. - XML writer?
  16. - Serialize/deserialize Odin types?
  17. List of contributors:
  18. - Jeroen van Rijn: Initial implementation.
  19. */
  20. package xml
  21. // An XML 1.0 / 1.1 parser
  22. import "core:bytes"
  23. import "core:encoding/entity"
  24. import "base:intrinsics"
  25. import "core:mem"
  26. import "core:os"
  27. import "core:strings"
  28. import "base:runtime"
  // Short alias for `intrinsics.expect`, used in `parse_bytes` to mark the most likely token kind.
  likely :: intrinsics.expect
  // Options used when the caller passes none:
  // unsupported `<!...>` constructs are skipped and no particular DOCTYPE is expected.
  DEFAULT_OPTIONS :: Options{
      flags = {.Ignore_Unsupported},
      expected_doctype = "",
  }
  // Individual parser switches. Combine them in an `Option_Flags` bit set.
  Option_Flag :: enum {
      // If the caller says that input may be modified, we can perform in-situ parsing.
      // If this flag isn't provided, the XML parser first duplicates the input so that it can.
      Input_May_Be_Modified,
      // Document MUST start with `<?xml` prologue.
      Must_Have_Prolog,
      // Document MUST have a `<!DOCTYPE`.
      Must_Have_DocType,
      // By default we skip comments. Use this option to intern a comment on a parented Element.
      Intern_Comments,
      // How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
      // These two flags are mutually exclusive: `validate_options` rejects a set containing both
      // with `.Conflicting_Options`.
      Error_on_Unsupported,
      Ignore_Unsupported,
      // By default CDATA tags are passed-through as-is.
      // This option unwraps them when encountered.
      Unbox_CDATA,
      // By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
      // This option decodes them when encountered.
      Decode_SGML_Entities,
      // If a tag body has a comment, it will be stripped unless this option is given.
      // Only consulted when `.Unbox_CDATA` or `.Decode_SGML_Entities` is set; otherwise
      // `parse_bytes` stores the body text untouched.
      Keep_Tag_Body_Comments,
  }
  // Set of `Option_Flag` values, backed by a `u16`.
  Option_Flags :: bit_set[Option_Flag; u16]
  // A parsed XML document, as returned by `parse_bytes`, `parse_string` and `load_from_file`.
  // Release it with `destroy`.
  Document :: struct {
      // Element storage, indexed by `Element_ID`. `parse_bytes` pre-sizes it and trims it
      // to `element_count` entries when parsing succeeds.
      elements: [dynamic]Element,
      // Number of entries of `elements` handed out by `new_element` so far.
      element_count: Element_ID,
      // Attributes of the `<?xml ... ?>` prologue, if one was seen.
      prologue: Attributes,
      // Encoding named by the prologue's `encoding` attribute, if it was recognized.
      encoding: Encoding,
      doctype: struct {
          // We only scan the <!DOCTYPE IDENT part and skip the rest.
          ident: string,
          rest: string,
      },
      // If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
      // Otherwise they'll be in the element tree.
      comments: [dynamic]string,
      // Internal
      tokenizer: ^Tokenizer,
      allocator: mem.Allocator,
      // Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
      // `destroy` deletes it in both cases.
      input: []u8,
      // Decoded body strings that were allocated separately from `input`; freed by `destroy`.
      strings_to_free: [dynamic]string,
  }
  // One node of the document tree: either a regular element or an interned comment.
  Element :: struct {
      // Tag name. Left empty for comment nodes created by `parse_bytes`.
      ident: string,
      // Children in document order: body text as `string`, child nodes as `Element_ID`.
      value: [dynamic]Value,
      attribs: Attributes,
      kind: enum {
          Element = 0,
          Comment,
      },
      // Index of the parent in `Document.elements`. The root element is its own parent (id 0).
      parent: Element_ID,
  }
  // A single child of an `Element`: a run of text, or the id of a nested element/comment.
  Value :: union {
      string,
      Element_ID,
  }
  // A `key="val"` pair, as produced by `parse_attribute`.
  Attribute :: struct {
      key: string,
      val: string,
  }
  // Attribute list of an element or of the prologue, in the order the attributes were parsed.
  Attributes :: [dynamic]Attribute
  // Parser configuration passed to `parse_bytes`, `parse_string` and `load_from_file`.
  Options :: struct {
      flags: Option_Flags,
      // When non-empty, the DOCTYPE identifier (if present) and the root tag must both equal this.
      expected_doctype: string,
  }
  // Document encodings recognized by `parse_prologue`.
  Encoding :: enum {
      // No prologue encoding was seen, or it was not one of the recognized names.
      Unknown,
      UTF_8,
      ISO_8859_1,
      // Aliases
      LATIN_1 = ISO_8859_1,
  }
  // Error codes returned by the procedures in this package. `.None` means success.
  Error :: enum {
      // General return values.
      None = 0,
      General_Error,
      Unexpected_Token,
      Invalid_Token,
      // Couldn't find, open or read file.
      File_Error,
      // File too short.
      Premature_EOF,
      // XML-specific errors.
      No_Prolog,
      Invalid_Prolog,
      Too_Many_Prologs,
      No_DocType,
      Too_Many_DocTypes,
      DocType_Must_Preceed_Elements,
      // If a DOCTYPE is present _or_ the caller
      // asked for a specific DOCTYPE and the DOCTYPE
      // and root tag don't match, we return `.Invalid_DocType`.
      Invalid_DocType,
      Invalid_Tag_Value,
      Mismatched_Closing_Tag,
      Unclosed_Comment,
      Comment_Before_Root_Element,
      Invalid_Sequence_In_Comment,
      Unsupported_Version,
      Unsupported_Encoding,
      // <!FOO are usually skipped.
      Unhandled_Bang,
      Duplicate_Attribute,
      // Both `.Error_on_Unsupported` and `.Ignore_Unsupported` were requested.
      Conflicting_Options,
  }
  // Parses the XML document held in `data` and returns it as a `Document`.
  //
  // Inputs:
  // - data:          The document bytes. They are cloned first unless `.Input_May_Be_Modified` is set,
  //                  in which case the document keeps (and `destroy` later deletes) the caller's buffer.
  // - options:       Parser flags plus an optional expected DOCTYPE / root tag name.
  // - path:          Handed to the tokenizer's `init` together with `error_handler`.
  // - error_handler: Handed to the tokenizer's `init`.
  // - allocator:     Used for the document and everything it owns.
  //
  // Returns:
  // - doc: The document. It is nil only if option validation failed; on any later error it is
  //        returned partially filled, so callers should still pass it to `destroy`.
  // - err: `.None` on success, otherwise the first error encountered.
  parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
      data := data
      context.allocator = allocator

      opts := validate_options(options) or_return

      // If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
      if .Input_May_Be_Modified not_in opts.flags {
          data = bytes.clone(data)
      }

      // NOTE(review): `t` looks like it points at a Tokenizer local to this procedure, so
      // `doc.tokenizer` should presumably not be used once `parse_bytes` has returned - confirm.
      t := &Tokenizer{}
      init(t, string(data), path, error_handler)

      doc = new(Document)
      doc.allocator = allocator
      doc.tokenizer = t
      doc.input     = data

      doc.elements = make([dynamic]Element, 1024, 1024, allocator)

      // strings.intern_init(&doc.intern, allocator, allocator)

      // Bare `return`s below report this error unless a more specific one was set.
      err = .Unexpected_Token
      element, parent: Element_ID
      open: Token

      // If a DOCTYPE is present, the root tag has to match.
      // If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
      expected_doctype := options.expected_doctype

      loop: for {
          skip_whitespace(t)
          // NOTE(Jeroen): This is faster as a switch.
          switch t.ch {
          case '<':
              // Consume peeked `<`
              advance_rune(t)

              open = scan(t)
              // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
              if likely(open.kind, Token_Kind.Ident) == .Ident {
                  // e.g. <odin - Start of new element.
                  element = new_element(doc)
                  if element == 0 { // First Element
                      parent = element
                  } else {
                      append(&doc.elements[parent].value, element)
                  }

                  doc.elements[element].parent = parent
                  doc.elements[element].ident  = open.text

                  parse_attributes(doc, &doc.elements[element].attribs) or_return

                  // If a DOCTYPE is present _or_ the caller
                  // asked for a specific DOCTYPE and the DOCTYPE
                  // and root tag don't match, we return `.Invalid_DocType`.
                  if element == 0 { // Root tag?
                      if len(expected_doctype) > 0 && expected_doctype != open.text {
                          error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
                          return doc, .Invalid_DocType
                      }
                  }

                  // One of these should follow:
                  // - `>`,  which means we've just opened this tag and expect a later element to close it.
                  // - `/>`, which means this is an 'empty' or self-closing tag.
                  end_token := scan(t)
                  #partial switch end_token.kind {
                  case .Gt:
                      // We're now the new parent.
                      parent = element

                  case .Slash:
                      // Empty tag. Close it.
                      expect(t, .Gt) or_return
                      parent  = doc.elements[element].parent
                      element = parent

                  case:
                      error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
                      return
                  }

              } else if open.kind == .Slash {
                  // Close tag.
                  ident := expect(t, .Ident) or_return
                  _      = expect(t, .Gt)    or_return

                  if doc.elements[element].ident != ident.text {
                      error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
                      return doc, .Mismatched_Closing_Tag
                  }
                  parent  = doc.elements[element].parent
                  element = parent

              } else if open.kind == .Exclaim {
                  // <!
                  next := scan(t)
                  #partial switch next.kind {
                  case .Ident:
                      switch next.text {
                      case "DOCTYPE":
                          if len(doc.doctype.ident) > 0 {
                              return doc, .Too_Many_DocTypes
                          }
                          if doc.element_count > 0 {
                              return doc, .DocType_Must_Preceed_Elements
                          }
                          parse_doctype(doc) or_return

                          if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
                              error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
                              return doc, .Invalid_DocType
                          }
                          // From here on the root tag has to match the DOCTYPE we just read.
                          expected_doctype = doc.doctype.ident

                      case:
                          if .Error_on_Unsupported in opts.flags {
                              error(t, t.offset, "Unhandled: <!%v\n", next.text)
                              return doc, .Unhandled_Bang
                          }
                          skip_element(t) or_return
                      }

                  case .Dash:
                      // Comment: <!-- -->.
                      // The grammar does not allow a comment to end in --->
                      // A comment has to open with two dashes; anything else is an error.
                      _ = expect(t, .Dash) or_return
                      comment := scan_comment(t) or_return

                      if .Intern_Comments in opts.flags {
                          // `doc.elements` is pre-sized, so its length says nothing about whether a
                          // root element exists yet; `element_count` does.
                          if doc.element_count == 0 {
                              append(&doc.comments, comment)
                          } else {
                              el := new_element(doc)
                              doc.elements[el].parent = element
                              doc.elements[el].kind   = .Comment
                              append(&doc.elements[el].value, comment)
                              append(&doc.elements[element].value, el)
                          }
                      }

                  case:
                      error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
                      return
                  }

              } else if open.kind == .Question {
                  // <?xml
                  next := scan(t)
                  #partial switch next.kind {
                  case .Ident:
                      if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
                          if len(doc.prologue) > 0 {
                              // We've already seen a prologue.
                              return doc, .Too_Many_Prologs
                          }
                          parse_prologue(doc) or_return
                      } else {
                          // Could be `<?xml-stylesheet`, etc. Ignore it.
                          skip_element(t) or_return
                      }
                  case:
                      error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
                      return
                  }

              } else {
                  error(t, t.offset, "Invalid Token after <: %#v\n", open)
                  return
              }

          case -1:
              // End of file.
              break loop

          case:
              // This should be a tag's body text.
              body_text := scan_string(t, t.offset) or_return

              needs_processing := .Unbox_CDATA          in opts.flags
              needs_processing |= .Decode_SGML_Entities in opts.flags

              if !needs_processing {
                  append(&doc.elements[element].value, body_text)
                  continue
              }

              decode_opts := entity.XML_Decode_Options{}
              if .Keep_Tag_Body_Comments not_in opts.flags {
                  decode_opts += { .Comment_Strip }
              }

              if .Decode_SGML_Entities not_in opts.flags {
                  decode_opts += { .No_Entity_Decode }
              }

              if .Unbox_CDATA in opts.flags {
                  decode_opts += { .Unbox_CDATA }
                  if .Decode_SGML_Entities in opts.flags {
                      decode_opts += { .Decode_CDATA }
                  }
              }

              decoded, decode_err := entity.decode_xml(body_text, decode_opts)
              if decode_err == .None {
                  // The decoded text is tracked separately so that `destroy` can free it.
                  append(&doc.elements[element].value, decoded)
                  append(&doc.strings_to_free, decoded)
              } else {
                  // Decoding failed: fall back to the raw body text.
                  append(&doc.elements[element].value, body_text)
              }
          }
      }

      if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 {
          return doc, .No_Prolog
      }

      if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
          return doc, .No_DocType
      }

      // Trim the pre-sized element storage down to the elements actually used.
      resize(&doc.elements, int(doc.element_count))
      return doc, .None
  }
  // Same as `parse_bytes`, but takes the document as a `string`.
  // The string's bytes are reinterpreted as `[]u8` without copying and forwarded unchanged,
  // together with every other argument.
  parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
      return parse_bytes(transmute([]u8)data, options, path, error_handler, allocator)
  }
  // Overload set: `parse` accepts the document either as a `string` or as a `[]u8`.
  parse :: proc { parse_string, parse_bytes }
  // Load an XML file
  //
  // Reads `filename` in full and parses it with `parse_bytes`, using the filename as the path.
  // The file buffer is handed to the parser as-is (`.Input_May_Be_Modified` is added to the
  // flags), so no second copy is made and the returned document owns the buffer.
  //
  // Returns `.File_Error` with a nil document if the file could not be read; otherwise
  // whatever `parse_bytes` returns.
  load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
      context.allocator = allocator
      options := options

      data, data_ok := os.read_entire_file(filename)
      if !data_ok { return {}, .File_Error }

      options.flags += { .Input_May_Be_Modified }

      doc, err = parse_bytes(data, options, filename, error_handler, allocator)
      if doc == nil {
          // No document took ownership of the buffer (e.g. the options were rejected),
          // so nothing else would ever release it.
          delete(data)
      }
      return
  }
  // Releases a document and everything it owns: the per-element attribute and value lists,
  // the element storage, prologue, comments, the input buffer, separately allocated decoded
  // strings, and finally the `Document` itself. A nil `doc` is a no-op.
  //
  // The input buffer, the decoded strings and the document are freed with the allocator the
  // document was parsed with (`doc.allocator`) rather than whatever `context.allocator`
  // happens to be at the call site.
  destroy :: proc(doc: ^Document) {
      if doc == nil { return }

      // Documents that never had an allocator recorded keep using the ambient one.
      if doc.allocator.procedure != nil {
          context.allocator = doc.allocator
      }

      for el in doc.elements {
          delete(el.attribs)
          delete(el.value)
      }
      delete(doc.elements)

      delete(doc.prologue)
      delete(doc.comments)
      delete(doc.input)

      for s in doc.strings_to_free {
          delete(s)
      }
      delete(doc.strings_to_free)

      free(doc)
  }
  359. /*
  360. Helpers.
  361. */
  // Checks `options` for flag combinations that contradict each other.
  //
  // Returns the options unchanged. `err` is `.Conflicting_Options` when both
  // `.Error_on_Unsupported` and `.Ignore_Unsupported` are set, `.None` otherwise.
  validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
      conflict :: Option_Flags{.Error_on_Unsupported, .Ignore_Unsupported}

      validated = options
      if validated.flags & conflict == conflict {
          err = .Conflicting_Options
      }
      return
  }
  // Scans the next token and checks that it is of the given `kind`.
  //
  // The scanned token is returned either way. On a mismatch the tokenizer's error handler
  // is invoked and `err` is `.Unexpected_Token`.
  expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
      tok = scan(t)
      if tok.kind != kind {
          error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
          err = .Unexpected_Token
      }
      return
  }
  // Parses a single `key="value"` attribute at the tokenizer's current position.
  //
  // Returns the attribute, plus an offset computed as the tokenizer offset right after
  // the key minus the key's length (used by callers for error reporting).
  // Fails with the error from `expect` if the identifier, `=` or string is missing.
  parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      tokenizer := doc.tokenizer

      name := expect(tokenizer, .Ident) or_return
      offset = tokenizer.offset - len(name.text)

      _      = expect(tokenizer, .Eq)     or_return
      quoted := expect(tokenizer, .String) or_return

      attr = Attribute{key = name.text, val = quoted.text}
      return attr, offset, .None
  }
  // Reports `.Duplicate_Attribute` if `attribs` already contains an attribute with the same
  // key as `attr`. `offset` is passed to the error handler as the position of the duplicate.
  check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
      for existing in attribs {
          if existing.key != attr.key {
              continue
          }
          error(t, offset, "Duplicate attribute: %v\n", attr.key)
          return .Duplicate_Attribute
      }
      return .None
  }
  // Parses attributes for as long as the next token is an identifier, appending each one
  // to `attribs`, then skips trailing whitespace.
  //
  // Stops with the first error from `parse_attribute`, or with `.Duplicate_Attribute`
  // if a key repeats.
  parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      tokenizer := doc.tokenizer

      for {
          if peek(tokenizer).kind != .Ident {
              break
          }
          parsed, at := parse_attribute(doc) or_return
          check_duplicate_attributes(tokenizer, attribs^, parsed, at) or_return
          append(attribs, parsed)
      }

      skip_whitespace(tokenizer)
      return .None
  }
  // Parses the remainder of an `<?xml ... ?>` prologue into `doc.prologue`.
  //
  // Every attribute is stored. `version` and `encoding` are also inspected: an unknown
  // version or encoding only produces a warning through the error handler, and a
  // recognized encoding is recorded in `doc.encoding`. The prologue must end in `?>`.
  parse_prologue :: proc(doc: ^Document) -> (err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      tokenizer := doc.tokenizer
      start     := tokenizer.offset

      parse_attributes(doc, &doc.prologue) or_return

      for attr in doc.prologue {
          if attr.key == "version" {
              if attr.val != "1.0" && attr.val != "1.1" {
                  error(tokenizer, start, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
              }
          } else if attr.key == "encoding" {
              // The lower-cased copy lives in the temp allocator and is released when this scope ends.
              runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
              encoding_name := strings.to_lower(attr.val, context.temp_allocator)

              switch encoding_name {
              case "utf-8", "utf8":
                  doc.encoding = .UTF_8
              case "latin-1", "latin1", "iso-8859-1":
                  doc.encoding = .LATIN_1
              case:
                  // Unrecognized encoding, assume UTF-8.
                  error(tokenizer, start, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
              }
          }
          // Any other attribute is ignored.
      }

      _ = expect(tokenizer, .Question) or_return
      _ = expect(tokenizer, .Gt)       or_return
      return .None
  }
  // Skips tokens up to and including the `>` that closes the construct we are currently in.
  //
  // Nesting is tracked: every `<` seen on the way requires one more `>`.
  // Returns `.Premature_EOF` if the input ends first.
  skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
      // We start inside one already-opened `<`.
      depth := 1

      for depth > 0 {
          kind := scan(t).kind
          if kind == .EOF {
              error(t, t.offset, "[skip_element] Premature EOF\n")
              return .Premature_EOF
          } else if kind == .Lt {
              depth += 1
          } else if kind == .Gt {
              depth -= 1
          }
      }
      return .None
  }
  // Parses what follows `<!DOCTYPE`.
  //
  // Examples of accepted input:
  //     <!DOCTYPE greeting SYSTEM "hello.dtd">
  //     <!DOCTYPE greeting [
  //         <!ELEMENT greeting (#PCDATA)>
  //     ]>
  //
  // The identifier is stored in `doc.doctype.ident`. Everything between it and the closing
  // `>` is skipped and kept verbatim in `doc.doctype.rest`.
  parse_doctype :: proc(doc: ^Document) -> (err: Error) {
      assert(doc != nil)
      context.allocator = doc.allocator
      tokenizer := doc.tokenizer

      name := expect(tokenizer, .Ident) or_return
      doc.doctype.ident = name.text

      skip_whitespace(tokenizer)
      rest_start := tokenizer.offset
      skip_element(tokenizer) or_return

      // -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
      rest_end := tokenizer.offset - 1
      doc.doctype.rest = string(tokenizer.src[rest_start:rest_end])
      return .None
  }
  // Index of an `Element` within `Document.elements`.
  Element_ID :: u32
  // Reserves the next slot in `doc.elements` and returns its id.
  //
  // If all slots are in use, the storage is grown first: an empty array gets an initial
  // 1024 slots, below 65536 slots it doubles, and from then on it grows by 65536 at a time.
  new_element :: proc(doc: ^Document) -> (id: Element_ID) {
      element_space := len(doc.elements)

      // Need to resize
      if int(doc.element_count) + 1 > element_space {
          switch {
          case element_space == 0:
              // Doubling zero would leave no room, and the returned id would be out of bounds.
              element_space = 1024
          case element_space < 65536:
              element_space *= 2
          case:
              element_space += 65536
          }
          resize(&doc.elements, element_space)
      }

      cur := doc.element_count
      doc.element_count += 1
      return cur
  }