|
@@ -1,28 +1,28 @@
|
|
|
/*
|
|
|
- An XML 1.0 / 1.1 parser
|
|
|
+ XML 1.0 / 1.1 parser
|
|
|
|
|
|
- Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
|
|
|
- Made available under Odin's BSD-3 license.
|
|
|
+ 2021-2022 Jeroen van Rijn <[email protected]>.
|
|
|
+ available under Odin's BSD-3 license.
|
|
|
|
|
|
- A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
|
|
|
+ from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
|
|
|
|
|
|
- Features:
|
|
|
- - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
|
|
|
- - Simple to understand and use. Small.
|
|
|
+Features:
|
|
|
+- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
|
|
|
+- Simple to understand and use. Small.
|
|
|
|
|
|
- Caveats:
|
|
|
- - We do NOT support HTML in this package, as that may or may not be valid XML.
|
|
|
- If it works, great. If it doesn't, that's not considered a bug.
|
|
|
+Caveats:
|
|
|
+- We do NOT support HTML in this package, as that may or may not be valid XML.
|
|
|
+ If it works, great. If it doesn't, that's not considered a bug.
|
|
|
|
|
|
- - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
|
|
|
- - <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
|
|
|
+- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
|
|
|
+- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
|
|
|
|
|
|
- MAYBE:
|
|
|
- - XML writer?
|
|
|
- - Serialize/deserialize Odin types?
|
|
|
+MAYBE:
|
|
|
+- XML writer?
|
|
|
+- Serialize/deserialize Odin types?
|
|
|
|
|
|
- List of contributors:
|
|
|
- Jeroen van Rijn: Initial implementation.
|
|
|
+List of contributors:
|
|
|
+- Jeroen van Rijn: Initial implementation.
|
|
|
*/
|
|
|
package xml
|
|
|
// An XML 1.0 / 1.1 parser
|
|
@@ -43,48 +43,32 @@ DEFAULT_OPTIONS :: Options{
|
|
|
}
|
|
|
|
|
|
Option_Flag :: enum {
|
|
|
- /*
|
|
|
- If the caller says that input may be modified, we can perform in-situ parsing.
|
|
|
- If this flag isn't provided, the XML parser first duplicates the input so that it can.
|
|
|
- */
|
|
|
+ // If the caller says that input may be modified, we can perform in-situ parsing.
|
|
|
+ // If this flag isn't provided, the XML parser first duplicates the input so that it can.
|
|
|
Input_May_Be_Modified,
|
|
|
|
|
|
- /*
|
|
|
- Document MUST start with `<?xml` prologue.
|
|
|
- */
|
|
|
+ // Document MUST start with `<?xml` prologue.
|
|
|
Must_Have_Prolog,
|
|
|
|
|
|
- /*
|
|
|
- Document MUST have a `<!DOCTYPE`.
|
|
|
- */
|
|
|
+ // Document MUST have a `<!DOCTYPE`.
|
|
|
Must_Have_DocType,
|
|
|
|
|
|
- /*
|
|
|
- By default we skip comments. Use this option to intern a comment on a parented Element.
|
|
|
- */
|
|
|
+ // By default we skip comments. Use this option to intern a comment on a parented Element.
|
|
|
Intern_Comments,
|
|
|
|
|
|
- /*
|
|
|
- How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
|
|
|
- */
|
|
|
+ // How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
|
|
|
Error_on_Unsupported,
|
|
|
Ignore_Unsupported,
|
|
|
|
|
|
- /*
|
|
|
- By default CDATA tags are passed-through as-is.
|
|
|
- This option unwraps them when encountered.
|
|
|
- */
|
|
|
+ // By default CDATA tags are passed-through as-is.
|
|
|
+ // This option unwraps them when encountered.
|
|
|
Unbox_CDATA,
|
|
|
|
|
|
- /*
|
|
|
- By default SGML entities like `>`, ` ` and ` ` are passed-through as-is.
|
|
|
- This option decodes them when encountered.
|
|
|
- */
|
|
|
+ // By default SGML entities like `>`, ` ` and ` ` are passed-through as-is.
|
|
|
+ // This option decodes them when encountered.
|
|
|
Decode_SGML_Entities,
|
|
|
|
|
|
- /*
|
|
|
- If a tag body has a comment, it will be stripped unless this option is given.
|
|
|
- */
|
|
|
+ // If a tag body has a comment, it will be stripped unless this option is given.
|
|
|
Keep_Tag_Body_Comments,
|
|
|
}
|
|
|
Option_Flags :: bit_set[Option_Flag; u16]
|
|
@@ -97,28 +81,20 @@ Document :: struct {
|
|
|
encoding: Encoding,
|
|
|
|
|
|
doctype: struct {
|
|
|
- /*
|
|
|
- We only scan the <!DOCTYPE IDENT part and skip the rest.
|
|
|
- */
|
|
|
+ // We only scan the <!DOCTYPE IDENT part and skip the rest.
|
|
|
ident: string,
|
|
|
rest: string,
|
|
|
},
|
|
|
|
|
|
- /*
|
|
|
- If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
|
|
|
- Otherwise they'll be in the element tree.
|
|
|
- */
|
|
|
+ // If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
|
|
|
+ // Otherwise they'll be in the element tree.
|
|
|
comments: [dynamic]string,
|
|
|
|
|
|
- /*
|
|
|
- Internal
|
|
|
- */
|
|
|
+ // Internal
|
|
|
tokenizer: ^Tokenizer,
|
|
|
allocator: mem.Allocator,
|
|
|
|
|
|
- /*
|
|
|
- Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
|
|
|
- */
|
|
|
+ // Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
|
|
|
input: []u8,
|
|
|
strings_to_free: [dynamic]string,
|
|
|
}
|
|
@@ -158,34 +134,24 @@ Encoding :: enum {
|
|
|
UTF_8,
|
|
|
ISO_8859_1,
|
|
|
|
|
|
- /*
|
|
|
- Aliases
|
|
|
- */
|
|
|
+ // Aliases
|
|
|
LATIN_1 = ISO_8859_1,
|
|
|
}
|
|
|
|
|
|
Error :: enum {
|
|
|
- /*
|
|
|
- General return values.
|
|
|
- */
|
|
|
+ // General return values.
|
|
|
None = 0,
|
|
|
General_Error,
|
|
|
Unexpected_Token,
|
|
|
Invalid_Token,
|
|
|
|
|
|
- /*
|
|
|
- Couldn't find, open or read file.
|
|
|
- */
|
|
|
+ // Couldn't find, open or read file.
|
|
|
File_Error,
|
|
|
|
|
|
- /*
|
|
|
- File too short.
|
|
|
- */
|
|
|
+ // File too short.
|
|
|
Premature_EOF,
|
|
|
|
|
|
- /*
|
|
|
- XML-specific errors.
|
|
|
- */
|
|
|
+ // XML-specific errors.
|
|
|
No_Prolog,
|
|
|
Invalid_Prolog,
|
|
|
Too_Many_Prologs,
|
|
@@ -194,11 +160,9 @@ Error :: enum {
|
|
|
Too_Many_DocTypes,
|
|
|
DocType_Must_Preceed_Elements,
|
|
|
|
|
|
- /*
|
|
|
- If a DOCTYPE is present _or_ the caller
|
|
|
- asked for a specific DOCTYPE and the DOCTYPE
|
|
|
- and root tag don't match, we return `.Invalid_DocType`.
|
|
|
- */
|
|
|
+ // If a DOCTYPE is present _or_ the caller
|
|
|
+ // asked for a specific DOCTYPE and the DOCTYPE
|
|
|
+ // and root tag don't match, we return `.Invalid_DocType`.
|
|
|
Invalid_DocType,
|
|
|
|
|
|
Invalid_Tag_Value,
|
|
@@ -211,27 +175,20 @@ Error :: enum {
|
|
|
Unsupported_Version,
|
|
|
Unsupported_Encoding,
|
|
|
|
|
|
- /*
|
|
|
- <!FOO are usually skipped.
|
|
|
- */
|
|
|
+ // <!FOO are usually skipped.
|
|
|
Unhandled_Bang,
|
|
|
|
|
|
Duplicate_Attribute,
|
|
|
Conflicting_Options,
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- Implementation starts here.
|
|
|
-*/
|
|
|
parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
|
|
|
data := data
|
|
|
context.allocator = allocator
|
|
|
|
|
|
opts := validate_options(options) or_return
|
|
|
|
|
|
- /*
|
|
|
- If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
|
|
|
- */
|
|
|
+ // If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
|
|
|
if .Input_May_Be_Modified not_in opts.flags {
|
|
|
data = bytes.clone(data)
|
|
|
}
|
|
@@ -252,10 +209,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
element, parent: Element_ID
|
|
|
open: Token
|
|
|
|
|
|
- /*
|
|
|
- If a DOCTYPE is present, the root tag has to match.
|
|
|
- If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
|
|
|
- */
|
|
|
+ // If a DOCTYPE is present, the root tag has to match.
|
|
|
+ // If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
|
|
|
expected_doctype := options.expected_doctype
|
|
|
|
|
|
loop: for {
|
|
@@ -263,17 +218,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
// NOTE(Jeroen): This is faster as a switch.
|
|
|
switch t.ch {
|
|
|
case '<':
|
|
|
- /*
|
|
|
- Consume peeked `<`
|
|
|
- */
|
|
|
+ // Consume peeked `<`
|
|
|
advance_rune(t)
|
|
|
|
|
|
open = scan(t)
|
|
|
// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
|
|
|
if likely(open.kind, Token_Kind.Ident) == .Ident {
|
|
|
- /*
|
|
|
- e.g. <odin - Start of new element.
|
|
|
- */
|
|
|
+ // e.g. <odin - Start of new element.
|
|
|
element = new_element(doc)
|
|
|
if element == 0 { // First Element
|
|
|
parent = element
|
|
@@ -286,11 +237,9 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
|
|
|
parse_attributes(doc, &doc.elements[element].attribs) or_return
|
|
|
|
|
|
- /*
|
|
|
- If a DOCTYPE is present _or_ the caller
|
|
|
- asked for a specific DOCTYPE and the DOCTYPE
|
|
|
- and root tag don't match, we return .Invalid_Root_Tag.
|
|
|
- */
|
|
|
+ // If a DOCTYPE is present _or_ the caller
|
|
|
+ // asked for a specific DOCTYPE and the DOCTYPE
|
|
|
+ // and root tag don't match, we return .Invalid_Root_Tag.
|
|
|
if element == 0 { // Root tag?
|
|
|
if len(expected_doctype) > 0 && expected_doctype != open.text {
|
|
|
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
|
|
@@ -298,23 +247,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
- One of these should follow:
|
|
|
- - `>`, which means we've just opened this tag and expect a later element to close it.
|
|
|
- - `/>`, which means this is an 'empty' or self-closing tag.
|
|
|
- */
|
|
|
+ // One of these should follow:
|
|
|
+ // - `>`, which means we've just opened this tag and expect a later element to close it.
|
|
|
+ // - `/>`, which means this is an 'empty' or self-closing tag.
|
|
|
end_token := scan(t)
|
|
|
#partial switch end_token.kind {
|
|
|
case .Gt:
|
|
|
- /*
|
|
|
- We're now the new parent.
|
|
|
- */
|
|
|
+ // We're now the new parent.
|
|
|
parent = element
|
|
|
|
|
|
case .Slash:
|
|
|
- /*
|
|
|
- Empty tag. Close it.
|
|
|
- */
|
|
|
+ // Empty tag. Close it.
|
|
|
expect(t, .Gt) or_return
|
|
|
parent = doc.elements[element].parent
|
|
|
element = parent
|
|
@@ -325,9 +268,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
}
|
|
|
|
|
|
} else if open.kind == .Slash {
|
|
|
- /*
|
|
|
- Close tag.
|
|
|
- */
|
|
|
+ // Close tag.
|
|
|
ident := expect(t, .Ident) or_return
|
|
|
_ = expect(t, .Gt) or_return
|
|
|
|
|
@@ -339,9 +280,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
element = parent
|
|
|
|
|
|
} else if open.kind == .Exclaim {
|
|
|
- /*
|
|
|
- <!
|
|
|
- */
|
|
|
+ // <!
|
|
|
next := scan(t)
|
|
|
#partial switch next.kind {
|
|
|
case .Ident:
|
|
@@ -370,10 +309,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
}
|
|
|
|
|
|
case .Dash:
|
|
|
- /*
|
|
|
- Comment: <!-- -->.
|
|
|
- The grammar does not allow a comment to end in --->
|
|
|
- */
|
|
|
+ // Comment: <!-- -->.
|
|
|
+ // The grammar does not allow a comment to end in --->
|
|
|
expect(t, .Dash)
|
|
|
comment := scan_comment(t) or_return
|
|
|
|
|
@@ -395,23 +332,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
}
|
|
|
|
|
|
} else if open.kind == .Question {
|
|
|
- /*
|
|
|
- <?xml
|
|
|
- */
|
|
|
+ // <?xml
|
|
|
next := scan(t)
|
|
|
#partial switch next.kind {
|
|
|
case .Ident:
|
|
|
if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
|
|
|
parse_prologue(doc) or_return
|
|
|
} else if len(doc.prologue) > 0 {
|
|
|
- /*
|
|
|
- We've already seen a prologue.
|
|
|
- */
|
|
|
+ // We've already seen a prologue.
|
|
|
return doc, .Too_Many_Prologs
|
|
|
} else {
|
|
|
- /*
|
|
|
- Could be `<?xml-stylesheet`, etc. Ignore it.
|
|
|
- */
|
|
|
+ // Could be `<?xml-stylesheet`, etc. Ignore it.
|
|
|
skip_element(t) or_return
|
|
|
}
|
|
|
case:
|
|
@@ -425,15 +356,11 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
|
|
}
|
|
|
|
|
|
case -1:
|
|
|
- /*
|
|
|
- End of file.
|
|
|
- */
|
|
|
+ // End of file.
|
|
|
break loop
|
|
|
|
|
|
case:
|
|
|
- /*
|
|
|
- This should be a tag's body text.
|
|
|
- */
|
|
|
+ // This should be a tag's body text.
|
|
|
body_text := scan_string(t, t.offset) or_return
|
|
|
needs_processing := .Unbox_CDATA in opts.flags
|
|
|
needs_processing |= .Decode_SGML_Entities in opts.flags
|
|
@@ -613,9 +540,7 @@ parse_prologue :: proc(doc: ^Document) -> (err: Error) {
|
|
|
doc.encoding = .LATIN_1
|
|
|
|
|
|
case:
|
|
|
- /*
|
|
|
- Unrecognized encoding, assume UTF-8.
|
|
|
- */
|
|
|
+ // Unrecognized encoding, assume UTF-8.
|
|
|
error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
|
|
|
}
|
|
|
|
|
@@ -658,11 +583,11 @@ skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
|
|
|
|
|
|
parse_doctype :: proc(doc: ^Document) -> (err: Error) {
|
|
|
/*
|
|
|
- <!DOCTYPE greeting SYSTEM "hello.dtd">
|
|
|
+ <!DOCTYPE greeting SYSTEM "hello.dtd">
|
|
|
|
|
|
- <!DOCTYPE greeting [
|
|
|
- <!ELEMENT greeting (#PCDATA)>
|
|
|
- ]>
|
|
|
+ <!DOCTYPE greeting [
|
|
|
+ <!ELEMENT greeting (#PCDATA)>
|
|
|
+ ]>
|
|
|
*/
|
|
|
assert(doc != nil)
|
|
|
context.allocator = doc.allocator
|
|
@@ -675,9 +600,7 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
|
|
|
offset := t.offset
|
|
|
skip_element(t) or_return
|
|
|
|
|
|
- /*
|
|
|
- -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
|
|
|
- */
|
|
|
+ // -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
|
|
|
doc.doctype.rest = string(t.src[offset : t.offset - 1])
|
|
|
return .None
|
|
|
}
|
|
@@ -700,4 +623,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
|
|
|
cur := doc.element_count
|
|
|
doc.element_count += 1
|
|
|
return cur
|
|
|
-}
|
|
|
+}
|