Browse source

Update XML reader to normalize whitespace, part 1.

Jeroen van Rijn 1 year ago
parent
commit
ebadff555d

+ 40 - 69
core/encoding/entity/entity.odin

@@ -56,38 +56,27 @@ CDATA_END     :: "]]>"
 COMMENT_START :: "<!--"
 COMMENT_END   :: "-->"
 
-/*
-	Default: CDATA and comments are passed through unchanged.
-*/
+// Default: CDATA and comments are passed through unchanged.
 XML_Decode_Option :: enum u8 {
-	/*
-		Do not decode & entities. It decodes by default.
-		If given, overrides `Decode_CDATA`.
-	*/
+	// Do not decode & entities. It decodes by default. If given, overrides `Decode_CDATA`.
 	No_Entity_Decode,
 
-	/*
-		CDATA is unboxed.
-	*/
+	// CDATA is unboxed.
 	Unbox_CDATA,
 
-	/*
-		Unboxed CDATA is decoded as well.
-		Ignored if `.Unbox_CDATA` is not given.
-	*/
+	// Unboxed CDATA is decoded as well. Ignored if `.Unbox_CDATA` is not given.
 	Decode_CDATA,
 
-	/*
-		Comments are stripped.
-	*/
+	// Comments are stripped.
 	Comment_Strip,
+
+	// Normalize whitespace
+	Normalize_Whitespace,
 }
 XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
 
-/*
-	Decode a string that may include SGML/XML/HTML entities.
-	The caller has to free the result.
-*/
+// Decode a string that may include SGML/XML/HTML entities.
+// The caller has to free the result.
 decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
 	context.allocator = allocator
 
@@ -100,14 +89,14 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 	t := Tokenizer{src=input}
 	in_data := false
 
+	prev: rune
+
 	loop: for {
 		advance(&t) or_return
 		if t.r < 0 { break loop }
 
-		/*
-			Below here we're never inside a CDATA tag.
-			At most we'll see the start of one, but that doesn't affect the logic.
-		*/
+		// Below here we're never inside a CDATA tag. At most we'll see the start of one,
+		// but that doesn't affect the logic.
 		switch t.r {
 		case '<':
 			/*
@@ -126,9 +115,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 			in_data = _handle_xml_special(&t, &builder, options) or_return
 
 		case ']':
-			/*
-				If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
-			*/
+			// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
 			if in_data {
 				if t.read_offset + len(CDATA_END) < len(t.src) {
 					if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
@@ -143,22 +130,16 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 
 		case:
 			if in_data && .Decode_CDATA not_in options {
-				/*
-					Unboxed, but undecoded.
-				*/
+				// Unboxed, but undecoded.
 				write_rune(&builder, t.r)
 				continue
 			}
 
 			if t.r == '&' {
 				if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
-					/*
-						We read to the end of the string without closing the entity.
-						Pass through as-is.
-					*/
+					// We read to the end of the string without closing the entity. Pass through as-is.
 					write_string(&builder, entity)
 				} else {
-
 					if .No_Entity_Decode not_in options {
 						if decoded, ok := xml_decode_entity(entity); ok {
 							write_rune(&builder, decoded)
@@ -166,19 +147,27 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 						}
 					}
 
-					/*
-						Literal passthrough because the decode failed or we want entities not decoded.
-					*/
+					// Literal passthrough because the decode failed or we want entities not decoded.
 					write_string(&builder, "&")
 					write_string(&builder, entity)
 					write_string(&builder, ";")
 				}
 			} else {
-				write_rune(&builder, t.r)
+				// https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-line-ends
+				switch t.r {
+				case '\n', 0x85, 0x2028:
+					write_rune(&builder, '\n')
+				case '\r': // Do nothing until next character
+				case:
+					if prev == '\r' { // Turn a single carriage return into a \n
+						write_rune(&builder, '\n')
+					}
+					write_rune(&builder, t.r)
+				}
+				prev = t.r
 			}
 		}
 	}
-
 	return strings.clone(strings.to_string(builder), allocator), err
 }
 
@@ -253,24 +242,18 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
 		return rune(val), true
 
 	case:
-		/*
-			Named entity.
-		*/
+		// Named entity.
 		return named_xml_entity_to_rune(entity)
 	}
 }
 
-/*
-	Private XML helper to extract `&<stuff>;` entity.
-*/
+// Private XML helper to extract `&<stuff>;` entity.
 @(private="file")
 _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
 	assert(t != nil && t.r == '&')
 
-	/*
-		All of these would be in the ASCII range.
-		Even if one is not, it doesn't matter. All characters we need to compare to extract are.
-	*/
+	// All of these would be in the ASCII range.
+	// Even if one is not, it doesn't matter. All characters we need to compare to extract are.
 
 	length := len(t.src)
 	found  := false
@@ -292,9 +275,7 @@ _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
 	return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
 }
 
-/*
-	Private XML helper for CDATA and comments.
-*/
+// Private XML helper for CDATA and comments.
 @(private="file")
 _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
 	assert(t != nil && t.r == '<')
@@ -304,20 +285,14 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 		t.read_offset += len(CDATA_START) - 1
 
 		if .Unbox_CDATA in options && .Decode_CDATA in options {
-			/*
-				We're unboxing _and_ decoding CDATA
-			*/
+			// We're unboxing _and_ decoding CDATA
 			return true, .None
 		}
 
-		/*
-			CDATA is passed through.
-		*/
+		// CDATA is passed through.
 		offset := t.offset
 
-		/*
-			Scan until end of CDATA.
-		*/
+		// Scan until end of CDATA.
 		for {
 			advance(t) or_return
 			if t.r < 0 { return true, .CDATA_Not_Terminated }
@@ -341,14 +316,10 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 
 	} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
 		t.read_offset += len(COMMENT_START)
-		/*
-			Comment is passed through by default.
-		*/
+		// Comment is passed through by default.
 		offset := t.offset
 
-		/*
-			Scan until end of Comment.
-		*/
+		// Scan until end of Comment.
 		for {
 			advance(t) or_return
 			if t.r < 0 { return true, .Comment_Not_Terminated }

+ 11 - 32
core/encoding/xml/tokenizer.odin

@@ -218,9 +218,7 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
 	for is_valid_identifier_rune(t.ch) {
 		advance_rune(t)
 		if t.ch == ':' {
-			/*
-				A namespaced attr can have at most two parts, `namespace:ident`.
-			*/
+			// A namespaced attr can have at most two parts, `namespace:ident`.
 			if namespaced {
 				break	
 			}
@@ -268,14 +266,10 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
 	return string(t.src[offset : t.offset - 1]), .None
 }
 
-/*
-	Skip CDATA
-*/
+// Skip CDATA
 skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 	if t.read_offset + len(CDATA_START) >= len(t.src) {
-		/*
-			Can't be the start of a CDATA tag.
-		*/
+		// Can't be the start of a CDATA tag.
 		return .None
 	}
 
@@ -290,9 +284,7 @@ skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 				return .Premature_EOF
 			}
 
-			/*
-				Scan until the end of a CDATA tag.
-			*/
+			// Scan until the end of a CDATA tag.
 			if t.read_offset + len(CDATA_END) < len(t.src) {
 				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
 					t.read_offset += len(CDATA_END)
@@ -319,14 +311,10 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 		case '<':
 			if peek_byte(t) == '!' {
 				if peek_byte(t, 1) == '[' {
-					/*
-						Might be the start of a CDATA tag.
-					*/
+					// Might be the start of a CDATA tag.
 					skip_cdata(t) or_return
 				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
-					/*
-						Comment start. Eat comment.
-					*/
+					// Comment start. Eat comment.
 					t.read_offset += 3
 					_ = scan_comment(t) or_return
 				}
@@ -342,17 +330,13 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 		}
 
 		if t.ch == close {
-			/*
-				If it's not a CDATA or comment, it's the end of this body.
-			*/
+			// If it's not a CDATA or comment, it's the end of this body.
 			break loop
 		}
 		advance_rune(t)
 	}
 
-	/*
-		Strip trailing whitespace.
-	*/
+	// Strip trailing whitespace.
 	lit := string(t.src[offset : t.offset])
 
 	end := len(lit)
@@ -369,11 +353,6 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 	if consume_close {
 		advance_rune(t)
 	}
-
-	/*
-		TODO: Handle decoding escape characters and unboxing CDATA.
-	*/
-
 	return lit, err
 }
 
@@ -384,7 +363,7 @@ peek :: proc(t: ^Tokenizer) -> (token: Token) {
 	return token
 }
 
-scan :: proc(t: ^Tokenizer) -> Token {
+scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
 	skip_whitespace(t)
 
 	offset := t.offset
@@ -418,7 +397,7 @@ scan :: proc(t: ^Tokenizer) -> Token {
 		case '"', '\'':
 			kind = .Invalid
 
-			lit, err = scan_string(t, t.offset, ch, true, false)
+			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
 			if err == .None {
 				kind = .String
 			}
@@ -435,4 +414,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
 		lit = string(t.src[offset : t.offset])
 	}
 	return Token{kind, lit, pos}
-}
+}

+ 14 - 10
core/encoding/xml/xml_reader.odin

@@ -203,9 +203,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
 	doc.elements = make([dynamic]Element, 1024, 1024, allocator)
 
-	// strings.intern_init(&doc.intern, allocator, allocator)
-
-	err =            .Unexpected_Token
+	err = .Unexpected_Token
 	element, parent: Element_ID
 	open: Token
 
@@ -259,8 +257,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 				case .Slash:
 					// Empty tag. Close it.
 					expect(t, .Gt) or_return
-					parent      = doc.elements[element].parent
-					element     = parent
+					parent  = doc.elements[element].parent
+					element = parent
 
 				case:
 					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
@@ -276,8 +274,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
 					return doc, .Mismatched_Closing_Tag
 				}
-				parent      = doc.elements[element].parent
-				element     = parent
+				parent  = doc.elements[element].parent
+				element = parent
 
 			} else if open.kind == .Exclaim {
 				// <!
@@ -463,8 +461,8 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
 	return validated, .None
 }
 
-expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
-	tok = scan(t)
+expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
+	tok = scan(t, multiline_string=multiline_string)
 	if tok.kind == kind { return tok, .None }
 
 	error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
@@ -480,7 +478,13 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
 	offset  = t.offset - len(key.text)
 
 	_       = expect(t, .Eq)     or_return
-	value  := expect(t, .String) or_return
+	value  := expect(t, .String, multiline_string=true) or_return
+
+	normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
+	if normalize_err == .None {
+		append(&doc.strings_to_free, normalized)
+		value.text = normalized
+	}
 
 	attr.key = key.text
 	attr.val = value.text

+ 5 - 5
tests/core/encoding/xml/test_core_xml.odin

@@ -36,7 +36,7 @@ xml_test_utf8_normal :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "恥ずべきフクロウ",
 		},
-		crc32     = 0xe9b62f03,
+		crc32     = 0xefa55f27,
 	})
 }
 
@@ -52,7 +52,7 @@ xml_test_utf8_unbox_cdata :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "恥ずべきフクロウ",
 		},
-		crc32     = 0x9c2643ed,
+		crc32     = 0x2dd27770,
 	})
 }
 
@@ -128,7 +128,7 @@ xml_test_entities_unbox :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "html",
 		},
-		crc32     = 0x3b6d4a90,
+		crc32     = 0x350ca83e,
 	})
 }
 
@@ -142,7 +142,7 @@ xml_test_entities_unbox_decode :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "html",
 		},
-		crc32     = 0x5be2ffdc,
+		crc32     = 0x7f58db7d,
 	})
 }
 
@@ -172,7 +172,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
 			expected_doctype = "",
 		},
 		err       = .None,
-		crc32     = 0x0b6100ab,
+		crc32     = 0x73070b55,
 	})
 }