xml_reader.odin 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. /*
  2. XML 1.0 / 1.1 parser
  3. 2021-2022 Jeroen van Rijn <[email protected]>.
  4. available under Odin's BSD-3 license.
  5. from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
  6. Features:
  7. - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
  8. - Simple to understand and use. Small.
  9. Caveats:
  10. - We do NOT support HTML in this package, as that may or may not be valid XML.
  11. If it works, great. If it doesn't, that's not considered a bug.
  12. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
  13. - <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
  14. MAYBE:
  15. - XML writer?
  16. - Serialize/deserialize Odin types?
  17. List of contributors:
  18. - Jeroen van Rijn: Initial implementation.
  19. */
  20. package encoding_xml
  21. // An XML 1.0 / 1.1 parser
  22. import "core:bytes"
  23. import "core:encoding/entity"
  24. import "base:intrinsics"
  25. import "core:mem"
  26. import "core:os"
  27. import "core:strings"
  28. import "base:runtime"
  29. likely :: intrinsics.expect
  30. DEFAULT_OPTIONS :: Options{
  31. flags = {.Ignore_Unsupported},
  32. expected_doctype = "",
  33. }
  34. Option_Flag :: enum {
  35. // If the caller says that input may be modified, we can perform in-situ parsing.
  36. // If this flag isn't provided, the XML parser first duplicates the input so that it can.
  37. Input_May_Be_Modified,
  38. // Document MUST start with `<?xml` prologue.
  39. Must_Have_Prolog,
  40. // Document MUST have a `<!DOCTYPE`.
  41. Must_Have_DocType,
  42. // By default we skip comments. Use this option to intern a comment on a parented Element.
  43. Intern_Comments,
  44. // How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
  45. Error_on_Unsupported,
  46. Ignore_Unsupported,
  47. // By default CDATA tags are passed-through as-is.
  48. // This option unwraps them when encountered.
  49. Unbox_CDATA,
  50. // By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
  51. // This option decodes them when encountered.
  52. Decode_SGML_Entities,
  53. // If a tag body has a comment, it will be stripped unless this option is given.
  54. Keep_Tag_Body_Comments,
  55. }
  56. Option_Flags :: bit_set[Option_Flag; u16]
  57. Document :: struct {
  58. elements: [dynamic]Element,
  59. element_count: Element_ID,
  60. prologue: Attributes,
  61. encoding: Encoding,
  62. doctype: struct {
  63. // We only scan the <!DOCTYPE IDENT part and skip the rest.
  64. ident: string,
  65. rest: string,
  66. },
  67. // If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
  68. // Otherwise they'll be in the element tree.
  69. comments: [dynamic]string,
  70. // Internal
  71. tokenizer: ^Tokenizer,
  72. allocator: mem.Allocator,
  73. // Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
  74. input: []u8,
  75. strings_to_free: [dynamic]string,
  76. }
  77. Element :: struct {
  78. ident: string,
  79. value: [dynamic]Value,
  80. attribs: Attributes,
  81. kind: enum {
  82. Element = 0,
  83. Comment,
  84. },
  85. parent: Element_ID,
  86. }
  87. Value :: union {
  88. string,
  89. Element_ID,
  90. }
  91. Attribute :: struct {
  92. key: string,
  93. val: string,
  94. }
  95. Attributes :: [dynamic]Attribute
  96. Options :: struct {
  97. flags: Option_Flags,
  98. expected_doctype: string,
  99. }
  100. Encoding :: enum {
  101. Unknown,
  102. UTF_8,
  103. ISO_8859_1,
  104. // Aliases
  105. LATIN_1 = ISO_8859_1,
  106. }
  107. Error :: enum {
  108. // General return values.
  109. None = 0,
  110. General_Error,
  111. Unexpected_Token,
  112. Invalid_Token,
  113. // Couldn't find, open or read file.
  114. File_Error,
  115. // File too short.
  116. Premature_EOF,
  117. // XML-specific errors.
  118. No_Prolog,
  119. Invalid_Prolog,
  120. Too_Many_Prologs,
  121. No_DocType,
  122. Too_Many_DocTypes,
  123. DocType_Must_Preceed_Elements,
  124. // If a DOCTYPE is present _or_ the caller
  125. // asked for a specific DOCTYPE and the DOCTYPE
  126. // and root tag don't match, we return `.Invalid_DocType`.
  127. Invalid_DocType,
  128. Invalid_Tag_Value,
  129. Mismatched_Closing_Tag,
  130. Unclosed_Comment,
  131. Comment_Before_Root_Element,
  132. Invalid_Sequence_In_Comment,
  133. Unsupported_Version,
  134. Unsupported_Encoding,
  135. // <!FOO are usually skipped.
  136. Unhandled_Bang,
  137. Duplicate_Attribute,
  138. Conflicting_Options,
  139. }
  140. parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  141. data := data
  142. context.allocator = allocator
  143. opts := validate_options(options) or_return
  144. // If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
  145. if .Input_May_Be_Modified not_in opts.flags {
  146. data = bytes.clone(data)
  147. }
  148. t := &Tokenizer{}
  149. init(t, string(data), path, error_handler)
  150. doc = new(Document)
  151. doc.allocator = allocator
  152. doc.tokenizer = t
  153. doc.input = data
  154. doc.elements = make([dynamic]Element, 1024, 1024, allocator)
  155. err = .Unexpected_Token
  156. element, parent: Element_ID
  157. open: Token
  158. // If a DOCTYPE is present, the root tag has to match.
  159. // If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
  160. expected_doctype := options.expected_doctype
  161. loop: for {
  162. skip_whitespace(t)
  163. // NOTE(Jeroen): This is faster as a switch.
  164. switch t.ch {
  165. case '<':
  166. // Consume peeked `<`
  167. advance_rune(t)
  168. open = scan(t)
  169. // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
  170. if likely(open.kind, Token_Kind.Ident) == .Ident {
  171. // e.g. <odin - Start of new element.
  172. element = new_element(doc)
  173. if element == 0 { // First Element
  174. parent = element
  175. } else {
  176. append(&doc.elements[parent].value, element)
  177. }
  178. doc.elements[element].parent = parent
  179. doc.elements[element].ident = open.text
  180. parse_attributes(doc, &doc.elements[element].attribs) or_return
  181. // If a DOCTYPE is present _or_ the caller
  182. // asked for a specific DOCTYPE and the DOCTYPE
  183. // and root tag don't match, we return .Invalid_Root_Tag.
  184. if element == 0 { // Root tag?
  185. if len(expected_doctype) > 0 && expected_doctype != open.text {
  186. error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
  187. return doc, .Invalid_DocType
  188. }
  189. }
  190. // One of these should follow:
  191. // - `>`, which means we've just opened this tag and expect a later element to close it.
  192. // - `/>`, which means this is an 'empty' or self-closing tag.
  193. end_token := scan(t)
  194. #partial switch end_token.kind {
  195. case .Gt:
  196. // We're now the new parent.
  197. parent = element
  198. case .Slash:
  199. // Empty tag. Close it.
  200. expect(t, .Gt) or_return
  201. parent = doc.elements[element].parent
  202. element = parent
  203. case:
  204. error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
  205. return
  206. }
  207. } else if open.kind == .Slash {
  208. // Close tag.
  209. ident := expect(t, .Ident) or_return
  210. _ = expect(t, .Gt) or_return
  211. if doc.elements[element].ident != ident.text {
  212. error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
  213. return doc, .Mismatched_Closing_Tag
  214. }
  215. parent = doc.elements[element].parent
  216. element = parent
  217. } else if open.kind == .Exclaim {
  218. // <!
  219. next := scan(t)
  220. #partial switch next.kind {
  221. case .Ident:
  222. switch next.text {
  223. case "DOCTYPE":
  224. if len(doc.doctype.ident) > 0 {
  225. return doc, .Too_Many_DocTypes
  226. }
  227. if doc.element_count > 0 {
  228. return doc, .DocType_Must_Preceed_Elements
  229. }
  230. parse_doctype(doc) or_return
  231. if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
  232. error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
  233. return doc, .Invalid_DocType
  234. }
  235. expected_doctype = doc.doctype.ident
  236. case:
  237. if .Error_on_Unsupported in opts.flags {
  238. error(t, t.offset, "Unhandled: <!%v\n", next.text)
  239. return doc, .Unhandled_Bang
  240. }
  241. skip_element(t) or_return
  242. }
  243. case .Dash:
  244. // Comment: <!-- -->.
  245. // The grammar does not allow a comment to end in --->
  246. expect(t, .Dash)
  247. comment := scan_comment(t) or_return
  248. if .Intern_Comments in opts.flags {
  249. if len(doc.elements) == 0 {
  250. append(&doc.comments, comment)
  251. } else {
  252. el := new_element(doc)
  253. doc.elements[el].parent = element
  254. doc.elements[el].kind = .Comment
  255. append(&doc.elements[el].value, comment)
  256. append(&doc.elements[element].value, el)
  257. }
  258. }
  259. case:
  260. error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
  261. return
  262. }
  263. } else if open.kind == .Question {
  264. // <?xml
  265. next := scan(t)
  266. #partial switch next.kind {
  267. case .Ident:
  268. if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
  269. parse_prologue(doc) or_return
  270. } else if len(doc.prologue) > 0 {
  271. // We've already seen a prologue.
  272. return doc, .Too_Many_Prologs
  273. } else {
  274. // Could be `<?xml-stylesheet`, etc. Ignore it.
  275. skip_element(t) or_return
  276. }
  277. case:
  278. error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
  279. return
  280. }
  281. } else {
  282. error(t, t.offset, "Invalid Token after <: %#v\n", open)
  283. return
  284. }
  285. case -1:
  286. // End of file.
  287. break loop
  288. case:
  289. // This should be a tag's body text.
  290. body_text := scan_string(t, t.offset) or_return
  291. needs_processing := .Unbox_CDATA in opts.flags
  292. needs_processing |= .Decode_SGML_Entities in opts.flags
  293. if !needs_processing {
  294. append(&doc.elements[element].value, body_text)
  295. continue
  296. }
  297. decode_opts := entity.XML_Decode_Options{}
  298. if .Keep_Tag_Body_Comments not_in opts.flags {
  299. decode_opts += { .Comment_Strip }
  300. }
  301. if .Decode_SGML_Entities not_in opts.flags {
  302. decode_opts += { .No_Entity_Decode }
  303. }
  304. if .Unbox_CDATA in opts.flags {
  305. decode_opts += { .Unbox_CDATA }
  306. if .Decode_SGML_Entities in opts.flags {
  307. decode_opts += { .Decode_CDATA }
  308. }
  309. }
  310. decoded, decode_err := entity.decode_xml(body_text, decode_opts)
  311. if decode_err == .None {
  312. append(&doc.elements[element].value, decoded)
  313. append(&doc.strings_to_free, decoded)
  314. } else {
  315. append(&doc.elements[element].value, body_text)
  316. }
  317. }
  318. }
  319. if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 {
  320. return doc, .No_Prolog
  321. }
  322. if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
  323. return doc, .No_DocType
  324. }
  325. resize(&doc.elements, int(doc.element_count))
  326. return doc, .None
  327. }
  328. parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  329. _data := transmute([]u8)data
  330. return parse_bytes(_data, options, path, error_handler, allocator)
  331. }
  332. parse :: proc { parse_string, parse_bytes }
  333. // Load an XML file
  334. load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  335. context.allocator = allocator
  336. options := options
  337. data, data_ok := os.read_entire_file(filename)
  338. if !data_ok { return {}, .File_Error }
  339. options.flags += { .Input_May_Be_Modified }
  340. return parse_bytes(data, options, filename, error_handler, allocator)
  341. }
  342. destroy :: proc(doc: ^Document) {
  343. if doc == nil { return }
  344. for el in doc.elements {
  345. delete(el.attribs)
  346. delete(el.value)
  347. }
  348. delete(doc.elements)
  349. delete(doc.prologue)
  350. delete(doc.comments)
  351. delete(doc.input)
  352. for s in doc.strings_to_free {
  353. delete(s)
  354. }
  355. delete(doc.strings_to_free)
  356. free(doc)
  357. }
  358. /*
  359. Helpers.
  360. */
  361. validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
  362. validated = options
  363. if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
  364. return options, .Conflicting_Options
  365. }
  366. return validated, .None
  367. }
  368. expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
  369. tok = scan(t, multiline_string=multiline_string)
  370. if tok.kind == kind { return tok, .None }
  371. error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
  372. return tok, .Unexpected_Token
  373. }
  374. parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
  375. assert(doc != nil)
  376. context.allocator = doc.allocator
  377. t := doc.tokenizer
  378. key := expect(t, .Ident) or_return
  379. offset = t.offset - len(key.text)
  380. _ = expect(t, .Eq) or_return
  381. value := expect(t, .String, multiline_string=true) or_return
  382. normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
  383. if normalize_err == .None {
  384. append(&doc.strings_to_free, normalized)
  385. value.text = normalized
  386. }
  387. attr.key = key.text
  388. attr.val = value.text
  389. err = .None
  390. return
  391. }
  392. check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
  393. for a in attribs {
  394. if attr.key == a.key {
  395. error(t, offset, "Duplicate attribute: %v\n", attr.key)
  396. return .Duplicate_Attribute
  397. }
  398. }
  399. return .None
  400. }
  401. parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
  402. assert(doc != nil)
  403. context.allocator = doc.allocator
  404. t := doc.tokenizer
  405. for peek(t).kind == .Ident {
  406. attr, offset := parse_attribute(doc) or_return
  407. check_duplicate_attributes(t, attribs^, attr, offset) or_return
  408. append(attribs, attr)
  409. }
  410. skip_whitespace(t)
  411. return .None
  412. }
  413. parse_prologue :: proc(doc: ^Document) -> (err: Error) {
  414. assert(doc != nil)
  415. context.allocator = doc.allocator
  416. t := doc.tokenizer
  417. offset := t.offset
  418. parse_attributes(doc, &doc.prologue) or_return
  419. for attr in doc.prologue {
  420. switch attr.key {
  421. case "version":
  422. switch attr.val {
  423. case "1.0", "1.1":
  424. case:
  425. error(t, offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
  426. }
  427. case "encoding":
  428. runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
  429. switch strings.to_lower(attr.val, context.temp_allocator) {
  430. case "utf-8", "utf8":
  431. doc.encoding = .UTF_8
  432. case "latin-1", "latin1", "iso-8859-1":
  433. doc.encoding = .LATIN_1
  434. case:
  435. // Unrecognized encoding, assume UTF-8.
  436. error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
  437. }
  438. case:
  439. // Ignored.
  440. }
  441. }
  442. _ = expect(t, .Question) or_return
  443. _ = expect(t, .Gt) or_return
  444. return .None
  445. }
  446. skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
  447. close := 1
  448. loop: for {
  449. tok := scan(t)
  450. #partial switch tok.kind {
  451. case .EOF:
  452. error(t, t.offset, "[skip_element] Premature EOF\n")
  453. return .Premature_EOF
  454. case .Lt:
  455. close += 1
  456. case .Gt:
  457. close -= 1
  458. if close == 0 {
  459. break loop
  460. }
  461. case:
  462. }
  463. }
  464. return .None
  465. }
  466. parse_doctype :: proc(doc: ^Document) -> (err: Error) {
  467. /*
  468. <!DOCTYPE greeting SYSTEM "hello.dtd">
  469. <!DOCTYPE greeting [
  470. <!ELEMENT greeting (#PCDATA)>
  471. ]>
  472. */
  473. assert(doc != nil)
  474. context.allocator = doc.allocator
  475. t := doc.tokenizer
  476. tok := expect(t, .Ident) or_return
  477. doc.doctype.ident = tok.text
  478. skip_whitespace(t)
  479. offset := t.offset
  480. skip_element(t) or_return
  481. // -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
  482. doc.doctype.rest = string(t.src[offset : t.offset - 1])
  483. return .None
  484. }
  485. Element_ID :: u32
  486. new_element :: proc(doc: ^Document) -> (id: Element_ID) {
  487. element_space := len(doc.elements)
  488. // Need to resize
  489. if int(doc.element_count) + 1 > element_space {
  490. if element_space < 65536 {
  491. element_space *= 2
  492. } else {
  493. element_space += 65536
  494. }
  495. resize(&doc.elements, element_space)
  496. }
  497. cur := doc.element_count
  498. doc.element_count += 1
  499. return cur
  500. }