Browse Source

[xml] Robustness improvement.

Can now parse  https://www.w3.org/2003/entities/2007xml/unicode.xml no problem.
Jeroen van Rijn 3 years ago
parent
commit
ec63d0bbd2
2 changed files with 55 additions and 24 deletions
  1. 4 0
      core/encoding/xml/debug_print.odin
  2. 51 24
      core/encoding/xml/xml_reader.odin

+ 4 - 0
core/encoding/xml/debug_print.odin

@@ -36,6 +36,10 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error
 		}
 	}
 
+	for comment in doc.comments {
+		written += wprintf(writer, "[Pre-root comment]  %v\n", comment)
+	}
+
 	if doc.root != nil {
 	 	wprintln(writer, " --- ")
 	 	print_element(writer, doc.root)

+ 51 - 24
core/encoding/xml/xml_reader.odin

@@ -86,10 +86,16 @@ Document :: struct {
 		/*
 			We only scan the <!DOCTYPE IDENT part and skip the rest.
 		*/
-		ident: string,
-		rest:  string,
+		ident:   string,
+		rest:    string,
 	},
 
+	/*
+		If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
+		Otherwise they'll be in the element tree.
+	*/
+	comments: [dynamic]string,
+
 	/*
 		Internal
 	*/
@@ -218,6 +224,8 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 	err =               .Unexpected_Token
 	element, parent:    ^Element
 
+	tag_is_open := false
+
 	/*
 		If a DOCTYPE is present, the root tag has to match.
 		If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
@@ -225,10 +233,14 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 	expected_doctype := options.expected_doctype
 
 	loop: for {
-		tok := scan(t)
-		#partial switch tok.kind {
+		skip_whitespace(t)
+		switch t.ch {
+		case '<':
+			/*
+				Consume peeked `<`
+			*/
+			tok := scan(t)
 
-		case .Lt:
 			open := scan(t)
 			#partial switch open.kind {
 
@@ -247,8 +259,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 						*/
 						return doc, .Too_Many_Prologs
 					} else {
-						error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)	
-						return
+						/*
+							Could be `<?xml-stylesheet`, etc. Ignore it.
+						*/
+						skip_element(t) or_return
 					}
 				case:
 					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)
@@ -292,10 +306,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 						Comment: <!-- -->.
 						The grammar does not allow a comment to end in --->
 					*/
-					if doc.root == nil {
-						return doc, .Comment_Before_Root_Element
-					}
-
 					expect(t, .Dash)
 					offset := t.offset
 
@@ -329,12 +339,17 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 					}
 
 					if .Intern_Comments in opts.flags {
-						el := new(Element)
-
-						el.parent = element
-						el.kind   = .Comment
-						el.value  = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
-						append(&element.children, el)
+						comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
+
+						if doc.root == nil {
+							append(&doc.comments, comment)
+						} else {
+							el := new(Element)
+							el.parent = element
+							el.kind   = .Comment
+							el.value  = comment
+							append(&element.children, el)
+						}
 					}
 
 					expect(t, .Dash)
@@ -350,6 +365,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 					e.g. <odin - Start of new element.
 				*/
 				element = new(Element)
+				tag_is_open = true
 
 				if doc.root == nil {
 					/*
@@ -384,7 +400,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 					- `/>`, which means this is an 'empty' or self-closing tag.
 				*/
 				end_token := scan(t)
-
 				#partial switch end_token.kind {
 				case .Gt:
 					/*
@@ -394,9 +409,12 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 
 				case .Slash:
 					/*
-						Empty tag?
+						Empty tag. Close it.
 					*/
 					expect(t, .Gt) or_return
+					parent      = element.parent
+					element     = parent
+					tag_is_open = false
 
 				case:
 					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
@@ -411,25 +429,33 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 				_      = expect(t, .Gt)    or_return
 
 				if element.ident != ident.text {
-					error(t, t.offset, "Mismatched Closing Tag: %v\n", ident.text)
+					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
 					return doc, .Mismatched_Closing_Tag
 				}
-				parent  = element.parent
-				element = parent
+				parent      = element.parent
+				element     = parent
+				tag_is_open = false
 
 			case:
 				error(t, t.offset, "Invalid Token after <: %#v\n", open)
 				return
 			}
 
-		case .EOF:
+		case -1:
+			/*
+				End of file.
+			*/
+			if tag_is_open {
+				return doc, .Premature_EOF
+			}
 			break loop
 
 		case:
 			/*
 				This should be a tag's body text.
 			*/
-			element.value = scan_string(t, tok.pos.offset) or_return
+			body_text    := scan_string(t, t.offset) or_return
+			element.value = strings.intern_get(&doc.intern, body_text)
 		}
 	}
 
@@ -480,6 +506,7 @@ destroy :: proc(doc: ^Document) {
 	strings.intern_destroy(&doc.intern)
 
 	delete(doc.prolog)
+	delete(doc.comments)
 	free(doc)
 }