/* XML 1.0 / 1.1 parser 2021-2022 Jeroen van Rijn . available under Odin's BSD-3 license. from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). Features: - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage. - Simple to understand and use. Small. Caveats: - We do NOT support HTML in this package, as that may or may not be valid XML. If it works, great. If it doesn't, that's not considered a bug. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences. - <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options. MAYBE: - XML writer? - Serialize/deserialize Odin types? List of contributors: - Jeroen van Rijn: Initial implementation. */ package encoding_xml // An XML 1.0 / 1.1 parser import "core:bytes" import "core:encoding/entity" import "base:intrinsics" import "core:mem" import "core:os" import "core:strings" import "base:runtime" likely :: intrinsics.expect DEFAULT_OPTIONS :: Options{ flags = {.Ignore_Unsupported}, expected_doctype = "", } Option_Flag :: enum { // If the caller says that input may be modified, we can perform in-situ parsing. // If this flag isn't provided, the XML parser first duplicates the input so that it can. Input_May_Be_Modified, // Document MUST start with ` (doc: ^Document, err: Error) { data := data context.allocator = allocator opts := validate_options(options) or_return // If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place. 
if .Input_May_Be_Modified not_in opts.flags { data = bytes.clone(data) } t := &Tokenizer{} init(t, string(data), path, error_handler) doc = new(Document) doc.allocator = allocator doc.tokenizer = t doc.input = data doc.elements = make([dynamic]Element, 1024, 1024, allocator) err = .Unexpected_Token element, parent: Element_ID open: Token // If a DOCTYPE is present, the root tag has to match. // If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match. expected_doctype := options.expected_doctype loop: for { skip_whitespace(t) // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': // Consume peeked `<` advance_rune(t) open = scan(t) // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed. if likely(open.kind, Token_Kind.Ident) == .Ident { // e.g. 0 && expected_doctype != open.text { error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text) return doc, .Invalid_DocType } } // One of these should follow: // - `>`, which means we've just opened this tag and expect a later element to close it. // - `/>`, which means this is an 'empty' or self-closing tag. end_token := scan(t) #partial switch end_token.kind { case .Gt: // We're now the new parent. parent = element case .Slash: // Empty tag. Close it. expect(t, .Gt) or_return parent = doc.elements[element].parent element = parent case: error(t, t.offset, "Expected close tag, got: %#v\n", end_token) return } } else if open.kind == .Slash { // Close tag. ident := expect(t, .Ident) or_return _ = expect(t, .Gt) or_return if doc.elements[element].ident != ident.text { error(t, t.offset, "Mismatched Closing Tag. 
Expected %v, got %v\n", doc.elements[element].ident, ident.text) return doc, .Mismatched_Closing_Tag } parent = doc.elements[element].parent element = parent } else if open.kind == .Exclaim { // 0 { return doc, .Too_Many_DocTypes } if doc.element_count > 0 { return doc, .DocType_Must_Preceed_Elements } parse_doctype(doc) or_return if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) return doc, .Invalid_DocType } expected_doctype = doc.doctype.ident case: if .Error_on_Unsupported in opts.flags { error(t, t.offset, "Unhandled: . // The grammar does not allow a comment to end in ---> expect(t, .Dash) comment := scan_comment(t) or_return if .Intern_Comments in opts.flags { if len(doc.elements) == 0 { append(&doc.comments, comment) } else { el := new_element(doc) doc.elements[el].parent = element doc.elements[el].kind = .Comment append(&doc.elements[el].value, comment) append(&doc.elements[element].value, el) } } case: error(t, t.offset, "Invalid Token after 0 { // We've already seen a prologue. 
// NOTE(review): the fragment below is GARBLED in this copy of the file — text between
// "Could be `" and "(doc: ^Document, err: Error)" is missing, most likely the tail of
// `parse_bytes` and the declaration line of `parse_string :: proc(data: string, ...)`.
// It is preserved verbatim; TODO: restore it from the upstream source.
				return doc, .Too_Many_Prologs } else { // Could be ` (doc: ^Document, err: Error) { _data := transmute([]u8)data return parse_bytes(_data, options, path, error_handler, allocator) }

// Overload set: `parse` accepts either a `string` or a `[]u8` input.
parse :: proc { parse_string, parse_bytes }

// Load an XML file from `filename` and parse it.
// The file's contents are owned by the returned document and released by `destroy`,
// so `.Input_May_Be_Modified` is added to let the parser decode entities in place.
// Returns `.File_Error` if the file cannot be read.
load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
	context.allocator = allocator
	options := options

	data, data_ok := os.read_entire_file(filename)
	if !data_ok { return {}, .File_Error }

	// We own `data`, so in-situ modification is safe.
	options.flags += { .Input_May_Be_Modified }

	return parse_bytes(data, options, filename, error_handler, allocator)
}

// Free the document and everything it owns: per-element attribute and value arrays,
// the element pool, prologue attributes, top-level comments, the input buffer, and
// any strings allocated during entity decoding. Safe to call with `nil`.
destroy :: proc(doc: ^Document) {
	if doc == nil { return }

	for el in doc.elements {
		delete(el.attribs)
		delete(el.value)
	}
	delete(doc.elements)
	delete(doc.prologue)
	delete(doc.comments)
	delete(doc.input)

	for s in doc.strings_to_free {
		delete(s)
	}
	delete(doc.strings_to_free)
	free(doc)
}

/*
	Helpers.
*/

// Reject option combinations that contradict each other:
// `.Error_on_Unsupported` and `.Ignore_Unsupported` are mutually exclusive.
validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
	validated = options

	if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
		return options, .Conflicting_Options
	}
	return validated, .None
}

// Scan the next token and require it to be of `kind`.
// Reports a tokenizer error and returns `.Unexpected_Token` on a mismatch.
expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
	tok = scan(t, multiline_string=multiline_string)
	if tok.kind == kind { return tok, .None }

	error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
	return tok, .Unexpected_Token
}

// Parse one `key="value"` attribute pair.
// `offset` is the input offset of the start of the key, used for error reporting by
// the caller. The value has XML entities decoded with whitespace normalization; the
// decoded string is tracked in `doc.strings_to_free` so `destroy` can release it.
parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	key := expect(t, .Ident) or_return
	offset = t.offset - len(key.text)
	_ = expect(t, .Eq) or_return
	value := expect(t, .String, multiline_string=true) or_return

	normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
	if normalize_err == .None {
		append(&doc.strings_to_free,
// (continuation of `parse_attribute`: the `append` argument list resumes here.)
			normalized)
		value.text = normalized
	}

	attr.key = key.text
	attr.val = value.text

	err = .None
	return
}

// Linear scan of the already-collected attributes for a key collision.
// Reports an error at `offset` and returns `.Duplicate_Attribute` if `attr.key`
// is already present in `attribs`.
check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
	for a in attribs {
		if attr.key == a.key {
			error(t, offset, "Duplicate attribute: %v\n", attr.key)
			return .Duplicate_Attribute
		}
	}
	return .None
}

// Parse zero or more attributes into `attribs` for as long as the next token is an
// identifier, rejecting duplicates. Trailing whitespace is consumed so the caller
// sees the tag-closing token next.
parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	for peek(t).kind == .Ident {
		attr, offset := parse_attribute(doc) or_return
		check_duplicate_attributes(t, attribs^, attr, offset) or_return
		append(attribs, attr)
	}
	skip_whitespace(t)
	return .None
}

// Parse the `<?xml ... ?>` prologue's attributes into `doc.prologue` and interpret
// the well-known ones:
// - `version`:  warns (does not fail) on anything other than "1.0" / "1.1".
// - `encoding`: sets `doc.encoding`; unrecognized encodings warn and are treated
//   as UTF-8.
// All other attributes are ignored.
parse_prologue :: proc(doc: ^Document) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	offset := t.offset
	parse_attributes(doc, &doc.prologue) or_return

	for attr in doc.prologue {
		switch attr.key {
		case "version":
			switch attr.val {
			case "1.0", "1.1":
			case:
				error(t, offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
			}

		case "encoding":
			runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
			// Case-insensitive match via a temp-allocated lowered copy.
			switch strings.to_lower(attr.val, context.temp_allocator) {
			case "utf-8", "utf8":
				doc.encoding = .UTF_8
			case "latin-1", "latin1", "iso-8859-1":
				doc.encoding = .LATIN_1
			case:
				// Unrecognized encoding, assume UTF-8.
				error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
			}

		case:
			// Ignored.
// (tail of `parse_prologue`: close the per-attribute switch and loop, then require `?>`.)
		}
	}

	_ = expect(t, .Question) or_return
	_ = expect(t, .Gt)       or_return
	return .None
}

// Skip to the `>` that closes the current element, balancing any nested
// `<` ... `>` pairs encountered along the way.
// Returns `.Premature_EOF` if the input runs out before the tag is closed.
skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
	depth := 1
	for depth > 0 {
		#partial switch scan(t).kind {
		case .EOF:
			error(t, t.offset, "[skip_element] Premature EOF\n")
			return .Premature_EOF

		case .Lt:
			depth += 1

		case .Gt:
			depth -= 1
		}
	}
	return .None
}

// Parse a `<!DOCTYPE ident ...>` declaration. The caller has already consumed `<!DOCTYPE`.
// The identifier is stored in `doc.doctype.ident`; everything between it and the
// closing `>` is kept verbatim (unparsed) in `doc.doctype.rest`.
parse_doctype :: proc(doc: ^Document) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	t := doc.tokenizer

	name := expect(t, .Ident) or_return
	doc.doctype.ident = name.text

	skip_whitespace(t)
	rest_start := t.offset
	skip_element(t) or_return

	// -1 because the current offset is that of the closing tag,
	// so the rest of the DOCTYPE ends just before it.
	doc.doctype.rest = string(t.src[rest_start : t.offset - 1])
	return .None
}

// Index of an element within the document's element pool.
Element_ID :: u32

// Reserve a slot in `doc.elements` for a new element and return its ID.
// The pool doubles in size while below 65536 entries and grows by 65536 thereafter.
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
	capacity := len(doc.elements)

	if int(doc.element_count) + 1 > capacity {
		// Geometric growth while small, linear growth once large.
		capacity = capacity * 2 if capacity < 65536 else capacity + 65536
		resize(&doc.elements, capacity)
	}

	id = doc.element_count
	doc.element_count += 1
	return
}