3 년 전 · 23baf56c87
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -46,8 +46,11 @@ Token_Kind :: enum {
 
				 	EOF,
			
 
				 }
			
 
				 
			
 
				-CDATA_START :: "<![CDATA["
			
 
				-CDATA_END   :: "]]>"
			
 
				+CDATA_START   :: "<![CDATA["
			
 
				+CDATA_END     :: "]]>"
			
 
				+
			
 
				+COMMENT_START :: "<!--"
			
 
				+COMMENT_END   :: "-->"
			
 
				 
			
 
				 Tokenizer :: struct {
			
 
				 	// Immutable data
			
@@ -214,10 +217,83 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
 
				 	return string(t.src[offset : t.offset])
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+	A comment ends when we see -->, preceded by a character that's not a dash.
			
 
				+	"For compatibility, the string "--" (double-hyphen) must not occur within comments."
			
 
				+
			
 
				+	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
			
 
				+
			
 
				+	Thanks to the length (4) of the comment start, we also have enough lookback,
			
 
				+	and the peek at the next byte asserts that there's at least one more character
			
 
				+	that's a `>`.
			
 
				+*/
			
 
				+scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
			
 
				+	offset := t.offset
			
 
				+
			
 
				+	for {
			
 
				+		advance_rune(t)
			
 
				+		ch := t.ch
			
 
				+
			
 
				+		if ch < 0 {
			
 
				+			error(t, offset, "[parse] Comment was not terminated\n")
			
 
				+			return "", .Unclosed_Comment
			
 
				+		}
			
 
				+
			
 
				+		if string(t.src[t.offset - 1:][:2]) == "--" {
			
 
				+			if peek_byte(t) == '>' {
			
 
				+				break
			
 
				+			} else {
			
 
				+				error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
			
 
				+				return "", .Invalid_Sequence_In_Comment
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	expect(t, .Dash)
			
 
				+	expect(t, .Gt)
			
 
				+
			
 
				+	return string(t.src[offset : t.offset - 1]), .None
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+	Skip CDATA
			
 
				+*/
			
 
				+skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
			
 
				+	if t.read_offset + len(CDATA_START) >= len(t.src) {
			
 
				+		/*
			
 
				+			Can't be the start of a CDATA tag.
			
 
				+		*/
			
 
				+		return .None
			
 
				+	}
			
 
				+
			
 
				+	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
			
 
				+		t.read_offset += len(CDATA_START)
			
 
				+		offset := t.offset
			
 
				+
			
 
				+		cdata_scan: for {
			
 
				+			advance_rune(t)
			
 
				+			if t.ch < 0 {
			
 
				+				error(t, offset, "[scan_string] CDATA was not terminated\n")
			
 
				+				return .Premature_EOF
			
 
				+			}
			
 
				+
			
 
				+			/*
			
 
				+				Scan until the end of a CDATA tag.
			
 
				+			*/
			
 
				+			if t.read_offset + len(CDATA_END) < len(t.src) {
			
 
				+				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
			
 
				+					t.read_offset += len(CDATA_END)
			
 
				+					break cdata_scan
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	return
			
 
				+}
			
 
				+
			
 
				 @(optimization_mode="speed")
			
 
				 scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
			
 
				 	err = .None
			
 
				-	in_cdata := false
			
 
				 
			
 
				 	loop: for {
			
 
				 		ch := t.ch
			
@@ -228,27 +304,23 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 
				 			return "", .Premature_EOF
			
 
				 
			
 
				 		case '<':
			
 
				-			/*
			
 
				-				Might be the start of a CDATA tag.
			
 
				-			*/
			
 
				-			if t.read_offset + len(CDATA_START) < len(t.src) {
			
 
				-				if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
			
 
				-					in_cdata = true
			
 
				-				}
			
 
				-			}
			
 
				-
			
 
				-		case ']':
			
 
				-			/*
			
 
				-				Might be the end of a CDATA tag.
			
 
				-			*/
			
 
				-			if t.read_offset + len(CDATA_END) < len(t.src) {
			
 
				-				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
			
 
				-					in_cdata = false
			
 
				+			if peek_byte(t) == '!' {
			
 
				+				if peek_byte(t, 1) == '[' {
			
 
				+					/*
			
 
				+						Might be the start of a CDATA tag.
			
 
				+					*/
			
 
				+					skip_cdata(t) or_return
			
 
				+				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
			
 
				+					/*
			
 
				+						Comment start. Eat comment.
			
 
				+					*/
			
 
				+					t.read_offset += 3
			
 
				+					_ = scan_comment(t) or_return
			
 
				 				}
			
 
				 			}
			
 
				 
			
 
				 		case '\n':
			
 
				-			if !(multiline || in_cdata) {
			
 
				+			if !multiline {
			
 
				 				error(t, offset, string(t.src[offset : t.offset]))
			
 
				 				error(t, offset, "[scan_string] Not terminated\n")
			
 
				 				err = .Invalid_Tag_Value
			
@@ -256,13 +328,12 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		if ch == close && !in_cdata {
			
 
				+		if t.ch == close {
			
 
				 			/*
			
 
				-				If it's not a CDATA tag, it's the end of this body.
			
 
				+				If it's not a CDATA or comment, it's the end of this body.
			
 
				 			*/
			
 
				 			break loop
			
 
				 		}
			
 
				-
			
 
				 		advance_rune(t)
			
 
				 	}
			
 
				 
			
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -307,39 +307,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 
				 						The grammar does not allow a comment to end in --->
			
 
				 					*/
			
 
				 					expect(t, .Dash)
			
 
				-					offset := t.offset
			
 
				-
			
 
				-					for {
			
 
				-						advance_rune(t)
			
 
				-						ch := t.ch
			
 
				-
			
 
				-						/*
			
 
				-							A comment ends when we see -->, preceded by a character that's not a dash.
			
 
				-							"For compatibility, the string "--" (double-hyphen) must not occur within comments."
			
 
				-
			
 
				-							See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
			
 
				-
			
 
				-							Thanks to the length (4) of the comment start, we also have enough lookback,
			
 
				-							and the peek at the next byte asserts that there's at least one more character
			
 
				-							that's a `>`.
			
 
				-						*/
			
 
				-						if ch < 0 {
			
 
				-							error(t, offset, "[parse] Comment was not terminated\n")
			
 
				-							return doc, .Unclosed_Comment
			
 
				-						}
			
 
				-
			
 
				-						if string(t.src[t.offset - 1:][:2]) == "--" {
			
 
				-							if peek_byte(t) == '>' {
			
 
				-								break
			
 
				-							} else {
			
 
				-								error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
			
 
				-								return doc, .Invalid_Sequence_In_Comment
			
 
				-							}
			
 
				-						}
			
 
				-					}
			
 
				+					comment := scan_comment(t) or_return
			
 
				 
			
 
				 					if .Intern_Comments in opts.flags {
			
 
				-						comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
			
 
				+						comment = strings.intern_get(&doc.intern, comment)
			
 
				 
			
 
				 						if doc.root == nil {
			
 
				 							append(&doc.comments, comment)
			
@@ -352,9 +323,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 
				 						}
			
 
				 					}
			
 
				 
			
 
				-					expect(t, .Dash)
			
 
				-					expect(t, .Gt)
			
 
				-
			
 
				 				case:
			
 
				 					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
			
 
				 					return