1 year ago · ea43c030aa
--- a/core/encoding/xml/debug_print.odin
+++ b/core/encoding/xml/debug_print.odin
@@ -1,3 +1,5 @@
 
				+package xml
			
 
				+
			
 
				 /*
			
 
				 	An XML 1.0 / 1.1 parser
			
 
				 
			
@@ -9,7 +11,7 @@
 
				 	List of contributors:
			
 
				 		Jeroen van Rijn: Initial implementation.
			
 
				 */
			
 
				-package xml
			
 
				+
			
 
				 
			
 
				 import "core:io"
			
 
				 import "core:fmt"
			
@@ -81,4 +83,4 @@ print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID,
 
				 	}
			
 
				 
			
 
				 	return written, .None
			
 
				-}
			
 
				+}
			
--- a/core/encoding/xml/example/xml_example.odin
+++ b/core/encoding/xml/example/xml_example.odin
@@ -20,7 +20,7 @@ example :: proc() {
 
				 		xml.destroy(docs[round])
			
 
				 	}
			
 
				 
			
 
				-	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
			
 
				+	DOC :: #load("../../../../tests/core/assets/XML/utf8.xml")
			
 
				 	input := DOC
			
 
				 
			
 
				 	for round in 0..<N {
			
@@ -109,4 +109,4 @@ main :: proc() {
 
				 		}
			
 
				 	}
			
 
				 	println("Done and cleaned up!")
			
 
				-}
			
 
				+}
			
--- a/core/encoding/xml/helpers.odin
+++ b/core/encoding/xml/helpers.odin
@@ -1,3 +1,5 @@
 
				+package xml
			
 
				+
			
 
				 /*
			
 
				 	An XML 1.0 / 1.1 parser
			
 
				 
			
@@ -6,7 +8,7 @@
 
				 
			
 
				 	This file contains helper functions.
			
 
				 */
			
 
				-package xml
			
 
				+
			
 
				 
			
 
				 // Find parent's nth child with a given ident.
			
 
				 find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
			
@@ -47,4 +49,4 @@ find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: st
 
				 		if attr.key == key { return attr.val, true }
			
 
				 	}
			
 
				 	return "", false
			
 
				-}
			
 
				+}
			
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -1,3 +1,5 @@
 
				+package xml
			
 
				+
			
 
				 /*
			
 
				 	An XML 1.0 / 1.1 parser
			
 
				 
			
@@ -9,7 +11,7 @@
 
				 	List of contributors:
			
 
				 		Jeroen van Rijn: Initial implementation.
			
 
				 */
			
 
				-package xml
			
 
				+
			
 
				 
			
 
				 import "core:fmt"
			
 
				 import "core:unicode"
			
@@ -433,4 +435,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
 
				 		lit = string(t.src[offset : t.offset])
			
 
				 	}
			
 
				 	return Token{kind, lit, pos}
			
 
				-}
			
 
				+}
			
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -1,28 +1,28 @@
 
				 /*
			
 
				-	An XML 1.0 / 1.1 parser
			
 
				+ XML 1.0 / 1.1 parser
			
 
				 
			
 
				-	Copyright 2021-2022 Jeroen van Rijn <[email protected]>.
			
 
				-	Made available under Odin's BSD-3 license.
			
 
				+ 2021-2022 Jeroen van Rijn <[email protected]>.
			
 
				+ available under Odin's BSD-3 license.
			
 
				 
			
 
				-	A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
			
 
				+ from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
			
 
				 
			
 
				-	Features:
			
 
				-		- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
			
 
				-		- Simple to understand and use. Small.
			
 
				+Features:
			
 
				+- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
			
 
				+- Simple to understand and use. Small.
			
 
				 
			
 
				-	Caveats:
			
 
				-		- We do NOT support HTML in this package, as that may or may not be valid XML.
			
 
				-		  If it works, great. If it doesn't, that's not considered a bug.
			
 
				+Caveats:
			
 
				+- We do NOT support HTML in this package, as that may or may not be valid XML.
			
 
				+  If it works, great. If it doesn't, that's not considered a bug.
			
 
				 
			
 
				-		- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
			
 
				-		- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
			
 
				+- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
			
 
				+- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
			
 
				 
			
 
				-	MAYBE:
			
 
				-	- XML writer?
			
 
				-	- Serialize/deserialize Odin types?
			
 
				+MAYBE:
			
 
				+- XML writer?
			
 
				+- Serialize/deserialize Odin types?
			
 
				 
			
 
				-	List of contributors:
			
 
				-		Jeroen van Rijn: Initial implementation.
			
 
				+List of contributors:
			
 
				+- Jeroen van Rijn: Initial implementation.
			
 
				 */
			
 
				 package xml
			
 
				 // An XML 1.0 / 1.1 parser
			
@@ -43,48 +43,32 @@ DEFAULT_OPTIONS :: Options{
 
				 }
			
 
				 
			
 
				 Option_Flag :: enum {
			
 
				-	/*
			
 
				-		If the caller says that input may be modified, we can perform in-situ parsing.
			
 
				-		If this flag isn't provided, the XML parser first duplicates the input so that it can.
			
 
				-	*/
			
 
				+	// If the caller says that input may be modified, we can perform in-situ parsing.
			
 
				+	// If this flag isn't provided, the XML parser first duplicates the input so that it can.
			
 
				 	Input_May_Be_Modified,
			
 
				 
			
 
				-	/*
			
 
				-		Document MUST start with `<?xml` prologue.
			
 
				-	*/
			
 
				+	// Document MUST start with `<?xml` prologue.
			
 
				 	Must_Have_Prolog,
			
 
				 
			
 
				-	/*
			
 
				-		Document MUST have a `<!DOCTYPE`.
			
 
				-	*/
			
 
				+	// Document MUST have a `<!DOCTYPE`.
			
 
				 	Must_Have_DocType,
			
 
				 
			
 
				-	/*
			
 
				-		By default we skip comments. Use this option to intern a comment on a parented Element.
			
 
				-	*/
			
 
				+	// By default we skip comments. Use this option to intern a comment on a parented Element.
			
 
				 	Intern_Comments,
			
 
				 
			
 
				-	/*
			
 
				-		How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
			
 
				-	*/
			
 
				+	// How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
			
 
				 	Error_on_Unsupported,
			
 
				 	Ignore_Unsupported,
			
 
				 
			
 
				-	/*
			
 
				-		By default CDATA tags are passed-through as-is.
			
 
				-		This option unwraps them when encountered.
			
 
				-	*/
			
 
				+	// By default CDATA tags are passed-through as-is.
			
 
				+	// This option unwraps them when encountered.
			
 
				 	Unbox_CDATA,
			
 
				 
			
 
				-	/*
			
 
				-		By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
			
 
				-		This option decodes them when encountered.
			
 
				-	*/
			
 
				+	// By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
			
 
				+	// This option decodes them when encountered.
			
 
				 	Decode_SGML_Entities,
			
 
				 
			
 
				-	/*
			
 
				-		If a tag body has a comment, it will be stripped unless this option is given.
			
 
				-	*/
			
 
				+	// If a tag body has a comment, it will be stripped unless this option is given.
			
 
				 	Keep_Tag_Body_Comments,
			
 
				 }
			
 
				 Option_Flags :: bit_set[Option_Flag; u16]
			
@@ -97,28 +81,20 @@ Document :: struct {
 
				 	encoding: Encoding,
			
 
				 
			
 
				 	doctype: struct {
			
 
				-		/*
			
 
				-			We only scan the <!DOCTYPE IDENT part and skip the rest.
			
 
				-		*/
			
 
				+		// We only scan the <!DOCTYPE IDENT part and skip the rest.
			
 
				 		ident:   string,
			
 
				 		rest:    string,
			
 
				 	},
			
 
				 
			
 
				-	/*
			
 
				-		If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
			
 
				-		Otherwise they'll be in the element tree.
			
 
				-	*/
			
 
				+	// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
			
 
				+	// Otherwise they'll be in the element tree.
			
 
				 	comments: [dynamic]string,
			
 
				 
			
 
				-	/*
			
 
				-		Internal
			
 
				-	*/
			
 
				+	// Internal
			
 
				 	tokenizer: ^Tokenizer,
			
 
				 	allocator: mem.Allocator,
			
 
				 
			
 
				-	/*
			
 
				-		Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
			
 
				-	*/
			
 
				+	// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
			
 
				 	input:           []u8,
			
 
				 	strings_to_free: [dynamic]string,
			
 
				 }
			
@@ -158,34 +134,24 @@ Encoding :: enum {
 
				 	UTF_8,
			
 
				 	ISO_8859_1,
			
 
				 
			
 
				-	/*
			
 
				-		Aliases
			
 
				-	*/
			
 
				+	// Aliases
			
 
				 	LATIN_1 = ISO_8859_1,
			
 
				 }
			
 
				 
			
 
				 Error :: enum {
			
 
				-	/*
			
 
				-		General return values.
			
 
				-	*/
			
 
				+	// General return values.
			
 
				 	None = 0,
			
 
				 	General_Error,
			
 
				 	Unexpected_Token,
			
 
				 	Invalid_Token,
			
 
				 
			
 
				-	/*
			
 
				-		Couldn't find, open or read file.
			
 
				-	*/
			
 
				+	// Couldn't find, open or read file.
			
 
				 	File_Error,
			
 
				 
			
 
				-	/*
			
 
				-		File too short.
			
 
				-	*/
			
 
				+	// File too short.
			
 
				 	Premature_EOF,
			
 
				 
			
 
				-	/*
			
 
				-		XML-specific errors.
			
 
				-	*/
			
 
				+	// XML-specific errors.
			
 
				 	No_Prolog,
			
 
				 	Invalid_Prolog,
			
 
				 	Too_Many_Prologs,
			
@@ -194,11 +160,9 @@ Error :: enum {
 
				 	Too_Many_DocTypes,
			
 
				 	DocType_Must_Preceed_Elements,
			
 
				 
			
 
				-	/*
			
 
				-		If a DOCTYPE is present _or_ the caller
			
 
				-		asked for a specific DOCTYPE and the DOCTYPE
			
 
				-		and root tag don't match, we return `.Invalid_DocType`.
			
 
				-	*/
			
 
				+	// If a DOCTYPE is present _or_ the caller
			
 
				+	// asked for a specific DOCTYPE and the DOCTYPE
			
 
				+	// and root tag don't match, we return `.Invalid_DocType`.
			
 
				 	Invalid_DocType,
			
 
				 
			
 
				 	Invalid_Tag_Value,
			
@@ -211,27 +175,20 @@ Error :: enum {
 
				 	Unsupported_Version,
			
 
				 	Unsupported_Encoding,
			
 
				 
			
 
				-	/*
			
 
				-		<!FOO are usually skipped.
			
 
				-	*/
			
 
				+	// <!FOO are usually skipped.
			
 
				 	Unhandled_Bang,
			
 
				 
			
 
				 	Duplicate_Attribute,
			
 
				 	Conflicting_Options,
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				-	Implementation starts here.
			
 
				-*/
			
 
				 parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
			
 
				 	data := data
			
 
				 	context.allocator = allocator
			
 
				 
			
 
				 	opts := validate_options(options) or_return
			
 
				 
			
 
				-	/*
			
 
				-		If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
			
 
				-	*/
			
 
				+	// If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
			
 
				 	if .Input_May_Be_Modified not_in opts.flags {
			
 
				 		data = bytes.clone(data)
			
 
				 	}
			
@@ -252,10 +209,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 	element, parent: Element_ID
			
 
				 	open: Token
			
 
				 
			
 
				-	/*
			
 
				-		If a DOCTYPE is present, the root tag has to match.
			
 
				-		If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
			
 
				-	*/
			
 
				+	// If a DOCTYPE is present, the root tag has to match.
			
 
				+	// If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
			
 
				 	expected_doctype := options.expected_doctype
			
 
				 
			
 
				 	loop: for {
			
@@ -263,17 +218,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 		// NOTE(Jeroen): This is faster as a switch.
			
 
				 		switch t.ch {
			
 
				 		case '<':
			
 
				-			/*
			
 
				-				Consume peeked `<`
			
 
				-			*/
			
 
				+			// Consume peeked `<`
			
 
				 			advance_rune(t)
			
 
				 
			
 
				 			open = scan(t)
			
 
				 			// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
			
 
				 			if likely(open.kind, Token_Kind.Ident) == .Ident {
			
 
				-				/*
			
 
				-					e.g. <odin - Start of new element.
			
 
				-				*/
			
 
				+				// e.g. <odin - Start of new element.
			
 
				 				element = new_element(doc)
			
 
				 				if element == 0 { // First Element
			
 
				 					parent = element
			
@@ -286,11 +237,9 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 
			
 
				 				parse_attributes(doc, &doc.elements[element].attribs) or_return
			
 
				 
			
 
				-				/*
			
 
				-					If a DOCTYPE is present _or_ the caller
			
 
				-					asked for a specific DOCTYPE and the DOCTYPE
			
 
				-					and root tag don't match, we return .Invalid_Root_Tag.
			
 
				-				*/
			
 
				+				// If a DOCTYPE is present _or_ the caller
			
 
				+				// asked for a specific DOCTYPE and the DOCTYPE
			
 
				+				// and root tag don't match, we return .Invalid_Root_Tag.
			
 
				 				if element == 0 { // Root tag?
			
 
				 					if len(expected_doctype) > 0 && expected_doctype != open.text {
			
 
				 						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
			
@@ -298,23 +247,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 					}
			
 
				 				}
			
 
				 
			
 
				-				/*
			
 
				-					One of these should follow:
			
 
				-					- `>`,  which means we've just opened this tag and expect a later element to close it.
			
 
				-					- `/>`, which means this is an 'empty' or self-closing tag.
			
 
				-				*/
			
 
				+				// One of these should follow:
			
 
				+				// - `>`,  which means we've just opened this tag and expect a later element to close it.
			
 
				+				// - `/>`, which means this is an 'empty' or self-closing tag.
			
 
				 				end_token := scan(t)
			
 
				 				#partial switch end_token.kind {
			
 
				 				case .Gt:
			
 
				-					/*
			
 
				-						We're now the new parent.
			
 
				-					*/
			
 
				+					// We're now the new parent.
			
 
				 					parent = element
			
 
				 
			
 
				 				case .Slash:
			
 
				-					/*
			
 
				-						Empty tag. Close it.
			
 
				-					*/
			
 
				+					// Empty tag. Close it.
			
 
				 					expect(t, .Gt) or_return
			
 
				 					parent      = doc.elements[element].parent
			
 
				 					element     = parent
			
@@ -325,9 +268,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 				}
			
 
				 
			
 
				 			} else if open.kind == .Slash {
			
 
				-				/*
			
 
				-					Close tag.
			
 
				-				*/
			
 
				+				// Close tag.
			
 
				 				ident := expect(t, .Ident) or_return
			
 
				 				_      = expect(t, .Gt)    or_return
			
 
				 
			
@@ -339,9 +280,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 				element     = parent
			
 
				 
			
 
				 			} else if open.kind == .Exclaim {
			
 
				-				/*
			
 
				-					<!
			
 
				-				*/
			
 
				+				// <!
			
 
				 				next := scan(t)
			
 
				 				#partial switch next.kind {
			
 
				 				case .Ident:
			
@@ -370,10 +309,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 					}
			
 
				 
			
 
				 				case .Dash:
			
 
				-					/*
			
 
				-						Comment: <!-- -->.
			
 
				-						The grammar does not allow a comment to end in --->
			
 
				-					*/
			
 
				+					// Comment: <!-- -->.
			
 
				+					// The grammar does not allow a comment to end in --->
			
 
				 					expect(t, .Dash)
			
 
				 					comment := scan_comment(t) or_return
			
 
				 
			
@@ -395,23 +332,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 				}
			
 
				 
			
 
				 			} else if open.kind == .Question {
			
 
				-				/*
			
 
				-					<?xml
			
 
				-				*/
			
 
				+				// <?xml
			
 
				 				next := scan(t)
			
 
				 				#partial switch next.kind {
			
 
				 				case .Ident:
			
 
				 					if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
			
 
				 						parse_prologue(doc) or_return
			
 
				 					} else if len(doc.prologue) > 0 {
			
 
				-						/*
			
 
				-							We've already seen a prologue.
			
 
				-						*/
			
 
				+						// We've already seen a prologue.
			
 
				 						return doc, .Too_Many_Prologs
			
 
				 					} else {
			
 
				-						/*
			
 
				-							Could be `<?xml-stylesheet`, etc. Ignore it.
			
 
				-						*/
			
 
				+						// Could be `<?xml-stylesheet`, etc. Ignore it.
			
 
				 						skip_element(t) or_return
			
 
				 					}
			
 
				 				case:
			
@@ -425,15 +356,11 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
				 			}
			
 
				 
			
 
				 		case -1:
			
 
				-			/*
			
 
				-				End of file.
			
 
				-			*/
			
 
				+			// End of file.
			
 
				 			break loop
			
 
				 
			
 
				 		case:
			
 
				-			/*
			
 
				-				This should be a tag's body text.
			
 
				-			*/
			
 
				+			// This should be a tag's body text.
			
 
				 			body_text        := scan_string(t, t.offset) or_return
			
 
				 			needs_processing := .Unbox_CDATA          in opts.flags
			
 
				 			needs_processing |= .Decode_SGML_Entities in opts.flags
			
@@ -613,9 +540,7 @@ parse_prologue :: proc(doc: ^Document) -> (err: Error) {
 
				 				doc.encoding = .LATIN_1
			
 
				 
			
 
				 			case:
			
 
				-				/*
			
 
				-					Unrecognized encoding, assume UTF-8.
			
 
				-				*/
			
 
				+				// Unrecognized encoding, assume UTF-8.
			
 
				 				error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
			
 
				 			}
			
 
				 
			
@@ -658,11 +583,11 @@ skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
 
				 
			
 
				 parse_doctype :: proc(doc: ^Document) -> (err: Error) {
			
 
				 	/*
			
 
				-		<!DOCTYPE greeting SYSTEM "hello.dtd">
			
 
				+	<!DOCTYPE greeting SYSTEM "hello.dtd">
			
 
				 
			
 
				-		<!DOCTYPE greeting [
			
 
				-			<!ELEMENT greeting (#PCDATA)>
			
 
				-		]>
			
 
				+	<!DOCTYPE greeting [
			
 
				+		<!ELEMENT greeting (#PCDATA)>
			
 
				+	]>
			
 
				 	*/
			
 
				 	assert(doc != nil)
			
 
				 	context.allocator = doc.allocator
			
@@ -675,9 +600,7 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
 
				 	offset := t.offset
			
 
				 	skip_element(t) or_return
			
 
				 
			
 
				-	/*
			
 
				-		-1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
			
 
				-	*/
			
 
				+	// 	-1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
			
 
				 	doc.doctype.rest = string(t.src[offset : t.offset - 1])
			
 
				 	return .None
			
 
				 }
			
@@ -700,4 +623,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
 
				 	cur := doc.element_count
			
 
				 	doc.element_count += 1
			
 
				 	return cur
			
 
				-}
			
 
				+}