xml_reader.odin 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. /*
  2. 2021-2022 Jeroen van Rijn <[email protected]>.
  3. available under Odin's BSD-3 license.
  4. List of contributors:
  5. - Jeroen van Rijn: Initial implementation.
  6. */
  7. package encoding_xml
  8. // An XML 1.0 / 1.1 parser
  9. import "core:bytes"
  10. import "core:encoding/entity"
  11. import "base:intrinsics"
  12. import "core:mem"
  13. import "core:os"
  14. import "core:strings"
  15. import "base:runtime"
// Shorthand for `intrinsics.expect`. `parse_bytes` uses it to mark `.Ident` as the
// token kind it expects to see most often after a `<`.
likely :: intrinsics.expect
// Options used by the `parse_*` / `load_from_file` procedures when the caller passes none:
// only `.Ignore_Unsupported` is set, and no particular DOCTYPE is expected.
DEFAULT_OPTIONS :: Options{
	flags = {.Ignore_Unsupported},
	expected_doctype = "",
}
// Individual switches that tune how a document is parsed. Combine them in an `Option_Flags` set.
Option_Flag :: enum {
	// If the caller says that input may be modified, we can perform in-situ parsing.
	// If this flag isn't provided, the XML parser first duplicates the input so that it can.
	Input_May_Be_Modified,

	// Document MUST start with `<?xml` prologue.
	// `parse_bytes` returns `.No_Prolog` at the end of parsing if none was seen.
	Must_Have_Prolog,

	// Document MUST have a `<!DOCTYPE`.
	// `parse_bytes` returns `.No_DocType` at the end of parsing if none was seen.
	Must_Have_DocType,

	// By default we skip comments. Use this option to intern a comment on a parented Element.
	Intern_Comments,

	// How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
	// These two are mutually exclusive: `validate_options` rejects the combination with `.Conflicting_Options`.
	// `parse_bytes` only tests for `.Error_on_Unsupported`; without it, the unsupported tag is skipped.
	Error_on_Unsupported,
	Ignore_Unsupported,

	// By default CDATA tags are passed-through as-is.
	// This option unwraps them when encountered.
	Unbox_CDATA,

	// By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
	// This option decodes them when encountered.
	Decode_SGML_Entities,

	// If a tag body has a comment, it will be stripped unless this option is given.
	// NOTE: `parse_body` only runs the decoder (and so only requests comment stripping)
	// when `.Unbox_CDATA` or `.Decode_SGML_Entities` is set as well.
	Keep_Tag_Body_Comments,
}
// A set of `Option_Flag`s, stored in a `u16`.
Option_Flags :: bit_set[Option_Flag; u16]
// The result of parsing: a flat array of elements plus document-level metadata.
// Release it with `destroy`.
Document :: struct {
	// All elements, indexed by `Element_ID`. Element 0 is the first element created.
	// While parsing, this array can be longer than `element_count`; `parse_bytes`
	// shrinks it to `element_count` when it finishes successfully.
	elements:      [dynamic]Element `fmt:"v,element_count"`,
	// Number of entries of `elements` that are in use.
	element_count: Element_ID,

	// Attributes of the `<?xml ... ?>` prologue, e.g. `version` and `encoding`.
	prologue: Attributes,
	// Set by `parse_prologue` from the prologue's `encoding` attribute, if recognized.
	encoding: Encoding,

	doctype: struct {
		// We only scan the <!DOCTYPE IDENT part and skip the rest.
		ident: string,
		// The unparsed remainder of the DOCTYPE tag, as a slice of the input.
		rest:  string,
	},

	// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
	// Otherwise they'll be in the element tree.
	comments: [dynamic]string `fmt:"-"`,

	// Internal
	tokenizer: ^Tokenizer    `fmt:"-"`,
	// The allocator that was passed to `parse_bytes`.
	allocator: mem.Allocator `fmt:"-"`,

	// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
	input:           []u8            `fmt:"-"`,
	// Strings produced by entity decoding; they are separately allocated and freed by `destroy`.
	strings_to_free: [dynamic]string `fmt:"-"`,
}
// One node of the document: either a regular element or an interned comment.
Element :: struct {
	// Tag name. Left empty for comment nodes.
	ident:   string,
	// Body, in document order: text runs (`string`) and child nodes (`Element_ID`).
	// For a comment node this holds the comment text.
	value:   [dynamic]Value,
	// The tag's attributes, in the order they were parsed.
	attribs: Attributes,

	kind: enum {
		Element = 0,
		Comment,
	},
	// Index of the enclosing element in `Document.elements`.
	// The first element created is its own parent (0).
	parent: Element_ID,
}
// A single entry in an element's body: either text or the ID of a child element.
Value :: union {
	string,
	Element_ID,
}
// A `key="val"` pair from a tag or from the prologue.
Attribute :: struct {
	key: string,
	// The attribute value. `parse_attribute` stores the whitespace-normalized
	// form when decoding succeeds, and the raw token text otherwise.
	val: string,
}
// An ordered list of attributes.
Attributes :: [dynamic]Attribute
// Parser configuration passed to `parse_bytes`, `parse_string` and `load_from_file`.
Options :: struct {
	flags:            Option_Flags,
	// If non-empty, the DOCTYPE (when present) and the root tag must both equal this name;
	// otherwise `parse_bytes` returns `.Invalid_DocType`.
	expected_doctype: string,
}
// Document encoding as declared in the prologue's `encoding` attribute.
// Stays `.Unknown` when the attribute is absent or not recognized by `parse_prologue`.
Encoding :: enum {
	Unknown,

	UTF_8,
	ISO_8859_1,

	// Aliases
	LATIN_1 = ISO_8859_1,
}
// Error codes returned by the procedures in this package. `.None` (0) means success.
Error :: enum {
	// General return values.
	None = 0,
	General_Error,
	Unexpected_Token,
	Invalid_Token,

	// Couldn't find, open or read file.
	File_Error,

	// File too short.
	Premature_EOF,

	// XML-specific errors.
	No_Prolog,
	Invalid_Prolog,
	Too_Many_Prologs,

	No_DocType,
	Too_Many_DocTypes,
	DocType_Must_Preceed_Elements,

	// If a DOCTYPE is present _or_ the caller
	// asked for a specific DOCTYPE and the DOCTYPE
	// and root tag don't match, we return `.Invalid_DocType`.
	Invalid_DocType,

	Invalid_Tag_Value,
	Mismatched_Closing_Tag,

	Unclosed_Comment,
	Comment_Before_Root_Element,
	Invalid_Sequence_In_Comment,

	Unsupported_Version,
	Unsupported_Encoding,

	// <!FOO are usually skipped.
	// Returned for them when `.Error_on_Unsupported` is set.
	Unhandled_Bang,

	Duplicate_Attribute,
	// `.Error_on_Unsupported` and `.Ignore_Unsupported` were both given.
	Conflicting_Options,
}
  127. parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  128. data := data
  129. context.allocator = allocator
  130. opts := validate_options(options) or_return
  131. // If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
  132. if .Input_May_Be_Modified not_in opts.flags {
  133. data = bytes.clone(data)
  134. }
  135. t := new(Tokenizer)
  136. init(t, string(data), path, error_handler)
  137. doc = new(Document)
  138. doc.allocator = allocator
  139. doc.tokenizer = t
  140. doc.input = data
  141. doc.elements = make([dynamic]Element, 1024, 1024, allocator)
  142. err = .Unexpected_Token
  143. element, parent: Element_ID
  144. open: Token
  145. // If a DOCTYPE is present, the root tag has to match.
  146. // If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
  147. expected_doctype := options.expected_doctype
  148. loop: for {
  149. skip_whitespace(t)
  150. switch t.ch {
  151. case '<':
  152. // Consume peeked `<`
  153. advance_rune(t)
  154. open = scan(t)
  155. // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
  156. if likely(open.kind, Token_Kind.Ident) == .Ident {
  157. // e.g. <odin - Start of new element.
  158. element = new_element(doc)
  159. if element == 0 { // First Element
  160. parent = element
  161. } else {
  162. append(&doc.elements[parent].value, element)
  163. }
  164. doc.elements[element].parent = parent
  165. doc.elements[element].ident = open.text
  166. parse_attributes(doc, &doc.elements[element].attribs) or_return
  167. // If a DOCTYPE is present _or_ the caller
  168. // asked for a specific DOCTYPE and the DOCTYPE
  169. // and root tag don't match, we return .Invalid_Root_Tag.
  170. if element == 0 { // Root tag?
  171. if len(expected_doctype) > 0 && expected_doctype != open.text {
  172. error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
  173. return doc, .Invalid_DocType
  174. }
  175. }
  176. // One of these should follow:
  177. // - `>`, which means we've just opened this tag and expect a later element to close it.
  178. // - `/>`, which means this is an 'empty' or self-closing tag.
  179. end_token := scan(t)
  180. #partial switch end_token.kind {
  181. case .Gt:
  182. // We're now the new parent.
  183. parent = element
  184. case .Slash:
  185. // Empty tag. Close it.
  186. expect(t, .Gt) or_return
  187. parent = doc.elements[element].parent
  188. element = parent
  189. case:
  190. error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
  191. return
  192. }
  193. } else if open.kind == .Slash {
  194. // Close tag.
  195. ident := expect(t, .Ident) or_return
  196. _ = expect(t, .Gt) or_return
  197. if doc.elements[element].ident != ident.text {
  198. error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
  199. return doc, .Mismatched_Closing_Tag
  200. }
  201. parent = doc.elements[element].parent
  202. element = parent
  203. } else if open.kind == .Exclaim {
  204. // <!
  205. next := scan(t)
  206. #partial switch next.kind {
  207. case .Ident:
  208. switch next.text {
  209. case "DOCTYPE":
  210. if len(doc.doctype.ident) > 0 {
  211. return doc, .Too_Many_DocTypes
  212. }
  213. if doc.element_count > 0 {
  214. return doc, .DocType_Must_Preceed_Elements
  215. }
  216. parse_doctype(doc) or_return
  217. if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
  218. error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
  219. return doc, .Invalid_DocType
  220. }
  221. expected_doctype = doc.doctype.ident
  222. case:
  223. if .Error_on_Unsupported in opts.flags {
  224. error(t, t.offset, "Unhandled: <!%v\n", next.text)
  225. return doc, .Unhandled_Bang
  226. }
  227. skip_element(t) or_return
  228. }
  229. case .Dash:
  230. // Comment: <!-- -->.
  231. // The grammar does not allow a comment to end in --->
  232. expect(t, .Dash)
  233. comment := scan_comment(t) or_return
  234. if .Intern_Comments in opts.flags {
  235. if len(doc.elements) == 0 {
  236. append(&doc.comments, comment)
  237. } else {
  238. el := new_element(doc)
  239. doc.elements[el].parent = element
  240. doc.elements[el].kind = .Comment
  241. append(&doc.elements[el].value, comment)
  242. append(&doc.elements[element].value, el)
  243. }
  244. }
  245. case .Open_Bracket:
  246. // This could be a CDATA tag part of a tag's body. Unread the `<![`
  247. t.offset -= 3
  248. // Instead of calling `parse_body` here, we could also `continue loop`
  249. // and fall through to the `case:` at the bottom of the outer loop.
  250. // This makes the intent clearer.
  251. parse_body(doc, element, opts) or_return
  252. case:
  253. error(t, t.offset, "Unexpected Token after <!: %#v", next)
  254. }
  255. } else if open.kind == .Question {
  256. // <?xml
  257. next := scan(t)
  258. #partial switch next.kind {
  259. case .Ident:
  260. if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
  261. parse_prologue(doc) or_return
  262. } else if len(doc.prologue) > 0 {
  263. // We've already seen a prologue.
  264. return doc, .Too_Many_Prologs
  265. } else {
  266. // Could be `<?xml-stylesheet`, etc. Ignore it.
  267. skip_element(t) or_return
  268. }
  269. case:
  270. error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
  271. return
  272. }
  273. } else {
  274. error(t, t.offset, "Invalid Token after <: %#v\n", open)
  275. return
  276. }
  277. case -1:
  278. // End of file.
  279. break loop
  280. case:
  281. // This should be a tag's body text.
  282. parse_body(doc, element, opts) or_return
  283. }
  284. }
  285. if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 {
  286. return doc, .No_Prolog
  287. }
  288. if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
  289. return doc, .No_DocType
  290. }
  291. resize(&doc.elements, int(doc.element_count))
  292. return doc, .None
  293. }
  294. parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  295. _data := transmute([]u8)data
  296. return parse_bytes(_data, options, path, error_handler, allocator)
  297. }
  298. parse :: proc { parse_string, parse_bytes }
  299. // Load an XML file
  300. load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
  301. context.allocator = allocator
  302. options := options
  303. data, data_ok := os.read_entire_file(filename)
  304. if !data_ok { return {}, .File_Error }
  305. options.flags += { .Input_May_Be_Modified }
  306. return parse_bytes(data, options, filename, error_handler, allocator)
  307. }
  308. destroy :: proc(doc: ^Document) {
  309. if doc == nil { return }
  310. for el in doc.elements {
  311. delete(el.attribs)
  312. delete(el.value)
  313. }
  314. delete(doc.elements)
  315. delete(doc.prologue)
  316. delete(doc.comments)
  317. delete(doc.input)
  318. for s in doc.strings_to_free {
  319. delete(s)
  320. }
  321. delete(doc.strings_to_free)
  322. free(doc.tokenizer)
  323. free(doc)
  324. }
  325. /*
  326. Helpers.
  327. */
  328. validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
  329. validated = options
  330. if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
  331. return options, .Conflicting_Options
  332. }
  333. return validated, .None
  334. }
  335. expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
  336. tok = scan(t, multiline_string=multiline_string)
  337. if tok.kind == kind { return tok, .None }
  338. error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
  339. return tok, .Unexpected_Token
  340. }
  341. parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
  342. assert(doc != nil)
  343. context.allocator = doc.allocator
  344. t := doc.tokenizer
  345. key := expect(t, .Ident) or_return
  346. _ = expect(t, .Eq) or_return
  347. value := expect(t, .String, multiline_string=true) or_return
  348. normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
  349. if normalize_err == .None {
  350. append(&doc.strings_to_free, normalized)
  351. value.text = normalized
  352. }
  353. attr.key = key.text
  354. attr.val = value.text
  355. err = .None
  356. return
  357. }
  358. check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
  359. for a in attribs {
  360. if attr.key == a.key {
  361. error(t, offset, "Duplicate attribute: %v\n", attr.key)
  362. return .Duplicate_Attribute
  363. }
  364. }
  365. return .None
  366. }
  367. parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
  368. assert(doc != nil)
  369. context.allocator = doc.allocator
  370. t := doc.tokenizer
  371. for peek(t).kind == .Ident {
  372. attr, offset := parse_attribute(doc) or_return
  373. check_duplicate_attributes(t, attribs^, attr, offset) or_return
  374. append(attribs, attr)
  375. }
  376. skip_whitespace(t)
  377. return .None
  378. }
  379. parse_prologue :: proc(doc: ^Document) -> (err: Error) {
  380. assert(doc != nil)
  381. context.allocator = doc.allocator
  382. t := doc.tokenizer
  383. offset := t.offset
  384. parse_attributes(doc, &doc.prologue) or_return
  385. for attr in doc.prologue {
  386. switch attr.key {
  387. case "version":
  388. switch attr.val {
  389. case "1.0", "1.1":
  390. case:
  391. error(t, offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
  392. }
  393. case "encoding":
  394. runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
  395. switch strings.to_lower(attr.val, context.temp_allocator) {
  396. case "utf-8", "utf8":
  397. doc.encoding = .UTF_8
  398. case "latin-1", "latin1", "iso-8859-1":
  399. doc.encoding = .LATIN_1
  400. case:
  401. // Unrecognized encoding, assume UTF-8.
  402. error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
  403. }
  404. case:
  405. // Ignored.
  406. }
  407. }
  408. _ = expect(t, .Question) or_return
  409. _ = expect(t, .Gt) or_return
  410. return .None
  411. }
  412. skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
  413. close := 1
  414. loop: for {
  415. tok := scan(t)
  416. #partial switch tok.kind {
  417. case .EOF:
  418. error(t, t.offset, "[skip_element] Premature EOF\n")
  419. return .Premature_EOF
  420. case .Lt:
  421. close += 1
  422. case .Gt:
  423. close -= 1
  424. if close == 0 {
  425. break loop
  426. }
  427. case:
  428. }
  429. }
  430. return .None
  431. }
  432. parse_doctype :: proc(doc: ^Document) -> (err: Error) {
  433. /*
  434. <!DOCTYPE greeting SYSTEM "hello.dtd">
  435. <!DOCTYPE greeting [
  436. <!ELEMENT greeting (#PCDATA)>
  437. ]>
  438. */
  439. assert(doc != nil)
  440. context.allocator = doc.allocator
  441. t := doc.tokenizer
  442. tok := expect(t, .Ident) or_return
  443. doc.doctype.ident = tok.text
  444. skip_whitespace(t)
  445. offset := t.offset
  446. skip_element(t) or_return
  447. // -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
  448. doc.doctype.rest = string(t.src[offset : t.offset - 1])
  449. return .None
  450. }
  451. parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) {
  452. assert(doc != nil)
  453. context.allocator = doc.allocator
  454. t := doc.tokenizer
  455. body_text := scan_string(t, t.offset) or_return
  456. needs_processing := .Unbox_CDATA in opts.flags
  457. needs_processing |= .Decode_SGML_Entities in opts.flags
  458. if !needs_processing {
  459. append(&doc.elements[element].value, body_text)
  460. return
  461. }
  462. decode_opts := entity.XML_Decode_Options{}
  463. if .Keep_Tag_Body_Comments not_in opts.flags {
  464. decode_opts += { .Comment_Strip }
  465. }
  466. if .Decode_SGML_Entities not_in opts.flags {
  467. decode_opts += { .No_Entity_Decode }
  468. }
  469. if .Unbox_CDATA in opts.flags {
  470. decode_opts += { .Unbox_CDATA }
  471. if .Decode_SGML_Entities in opts.flags {
  472. decode_opts += { .Decode_CDATA }
  473. }
  474. }
  475. decoded, decode_err := entity.decode_xml(body_text, decode_opts)
  476. if decode_err == .None {
  477. append(&doc.elements[element].value, decoded)
  478. append(&doc.strings_to_free, decoded)
  479. } else {
  480. append(&doc.elements[element].value, body_text)
  481. }
  482. return
  483. }
  484. Element_ID :: u32
  485. new_element :: proc(doc: ^Document) -> (id: Element_ID) {
  486. element_space := len(doc.elements)
  487. // Need to resize
  488. if int(doc.element_count) + 1 > element_space {
  489. if element_space < 65536 {
  490. element_space *= 2
  491. } else {
  492. element_space += 65536
  493. }
  494. resize(&doc.elements, element_space)
  495. }
  496. cur := doc.element_count
  497. doc.element_count += 1
  498. return cur
  499. }