
Fix parsing of CDATA tags (#5059)

Fixes #5054
Jeroen van Rijn 4 months ago
parent
commit
062a3c2fae
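
For context, a minimal usage sketch (not part of this commit) of the case being fixed: literal text and a CDATA section mixed inside one tag body, parsed with the `.Unbox_CDATA` and `.Decode_SGML_Entities` flags so the body is routed through the repaired `entity.decode_xml` path. The `xml.destroy` cleanup call is assumed from the rest of `core:encoding/xml` and does not appear in this diff.

package cdata_fix_sketch

import "core:encoding/xml"
import "core:fmt"

main :: proc() {
	// Same shape as the new cases in tests/core/assets/XML/entities.html:
	// literal text surrounding an embedded CDATA section in a tag body.
	input := `<html><body><div>H<![CDATA[ellope!]]>Hellope!</div></body></html>`

	// .Unbox_CDATA + .Decode_SGML_Entities makes the reader hand the tag
	// body to entity.decode_xml with CDATA unboxing and decoding enabled.
	doc, err := xml.parse_bytes(transmute([]u8)input, xml.Options{
		flags = { .Unbox_CDATA, .Decode_SGML_Entities },
	})
	defer xml.destroy(doc) // assumed cleanup proc, not shown in this diff

	fmt.println(err) // expect .None with this fix applied
}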

+ 26 - 28
core/encoding/entity/entity.odin

@@ -108,7 +108,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 				it couldn't have been part of an XML tag body to be decoded here.
 
 				Keep in mind that we could already *be* inside a CDATA tag.
-				If so, write `>` as a literal and continue.
+				If so, write `<` as a literal and continue.
 			*/
 			if in_data {
 				write_rune(&builder, '<')
@@ -119,11 +119,9 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 		case ']':
 			// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
 			if in_data {
-				if t.read_offset + len(CDATA_END) < len(t.src) {
-					if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
-						in_data = false
-						t.read_offset += len(CDATA_END) - 1
-					}
+				if strings.has_prefix(t.src[t.offset:], CDATA_END) {
+					in_data = false
+					t.read_offset += len(CDATA_END) - 1
 				}
 				continue
 			} else {
@@ -297,40 +295,40 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 	assert(t != nil && t.r == '<')
 	if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }
 
-	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
-		t.read_offset += len(CDATA_START) - 1
-
+	s := string(t.src[t.offset:])
+	if strings.has_prefix(s, CDATA_START) {
 		if .Unbox_CDATA in options && .Decode_CDATA in options {
 			// We're unboxing _and_ decoding CDATA
+			t.read_offset += len(CDATA_START) - 1
 			return true, .None
 		}
 
-		// CDATA is passed through.
-		offset := t.offset
-
-		// Scan until end of CDATA.
+		// CDATA is passed through. Scan until end of CDATA.
+		start_offset  := t.offset
+		t.read_offset += len(CDATA_START)
 		for {
-			advance(t) or_return
-			if t.r < 0 { return true, .CDATA_Not_Terminated }
-
-			if t.read_offset + len(CDATA_END) < len(t.src) {
-				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
-					t.read_offset += len(CDATA_END) - 1
+			advance(t)
+			if t.r < 0 {
+				// error(t, offset, "[scan_string] CDATA was not terminated\n")
+				return true, .CDATA_Not_Terminated
+			}
 
-					cdata := string(t.src[offset : t.read_offset])
-	
-					if .Unbox_CDATA in options {
-						cdata = cdata[len(CDATA_START):]
-						cdata = cdata[:len(cdata) - len(CDATA_END)]
-					}
+			// Scan until the end of a CDATA tag.
+			if s = string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+				t.read_offset += len(CDATA_END)
+				cdata := string(t.src[start_offset:t.read_offset])
 
-					write_string(builder, cdata)
-					return false, .None
+				if .Unbox_CDATA in options {
+					cdata = cdata[len(CDATA_START):]
+					cdata = cdata[:len(cdata) - len(CDATA_END)]
 				}
+				write_string(builder, cdata)
+				return false, .None
 			}
 		}
 
-	} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
+
+	} else if strings.has_prefix(s, COMMENT_START) {
 		t.read_offset += len(COMMENT_START)
 		// Comment is passed through by default.
 		offset := t.offset
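
A minimal sketch of calling the repaired `entity.decode_xml` directly, with CDATA unboxing and decoding enabled. With `strings.has_prefix`, the `]` case above now also recognises a `]]>` terminator sitting at the very end of the input, which the stricter bound in the old guard would miss. The decoded output shown is an assumption, not taken from this diff.

package decode_cdata_sketch

import "core:encoding/entity"
import "core:fmt"

main :: proc() {
	// A tag body mixing literal text, a CDATA section, and an entity.
	body := "H<![CDATA[ellope!]]> &amp; friends"

	// Unbox the CDATA wrapper and decode its contents together with the
	// surrounding SGML entities.
	decoded, err := entity.decode_xml(body, { .Unbox_CDATA, .Decode_CDATA })
	defer delete(decoded)

	fmt.println(decoded, err) // expected: "Hellope! & friends" .None (assumption)
}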

+ 18 - 20
core/encoding/xml/tokenizer.odin

@@ -16,6 +16,7 @@ package encoding_xml
 import "core:fmt"
 import "core:unicode"
 import "core:unicode/utf8"
+import "core:strings"
 
 Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
 
@@ -121,7 +122,7 @@ default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
 error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
 	pos := offset_to_pos(t, offset)
 	if t.err != nil {
-		t.err(pos, msg, ..args)
+		t.err(pos=pos, fmt=msg, args=args)
 	}
 	t.error_count += 1
 }
@@ -268,32 +269,27 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
 
 // Skip CDATA
 skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
-	if t.read_offset + len(CDATA_START) >= len(t.src) {
-		// Can't be the start of a CDATA tag.
+	if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
 		return .None
 	}
 
-	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
-		t.read_offset += len(CDATA_START)
-		offset := t.offset
+	t.read_offset += len(CDATA_START)
+	offset := t.offset
 
-		cdata_scan: for {
-			advance_rune(t)
-			if t.ch < 0 {
-				error(t, offset, "[scan_string] CDATA was not terminated\n")
-				return .Premature_EOF
-			}
+	cdata_scan: for {
+		advance_rune(t)
+		if t.ch < 0 {
+			error(t, offset, "[scan_string] CDATA was not terminated\n")
+			return .Premature_EOF
+		}
 
-			// Scan until the end of a CDATA tag.
-			if t.read_offset + len(CDATA_END) < len(t.src) {
-				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
-					t.read_offset += len(CDATA_END)
-					break cdata_scan
-				}
-			}
+		// Scan until the end of a CDATA tag.
+		if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+			t.read_offset += len(CDATA_END)
+			break cdata_scan
 		}
 	}
-	return
+	return .None
 }
 
 @(optimization_mode="favor_size")
@@ -393,6 +389,8 @@ scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
 		case '/': kind = .Slash
 		case '-': kind = .Dash
 		case ':': kind = .Colon
+		case '[': kind = .Open_Bracket
+		case ']': kind = .Close_Bracket
 
 		case '"', '\'':
 			kind = .Invalid
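
The recurring refactor in both files above replaces the manual `read_offset + len(...) < len(src)` bounds check plus slice comparison with `strings.has_prefix`, which indexes nothing and simply reports false when the remaining input is shorter than the pattern. A tiny standalone sketch of that behaviour (the `CDATA_END` constant is re-declared here for illustration):

package has_prefix_sketch

import "core:fmt"
import "core:strings"

main :: proc() {
	CDATA_END :: "]]>" // assumed to match the constant used in the diff

	// Slicing remainder[:len(CDATA_END)] here would be out of bounds;
	// has_prefix simply returns false for a too-short remainder.
	remainder := "]]"
	fmt.println(strings.has_prefix(remainder, CDATA_END)) // false

	// A remainder that is exactly the terminator, i.e. a CDATA section
	// ending at the very end of the input, still matches.
	fmt.println(strings.has_prefix("]]>", CDATA_END)) // true
}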

+ 55 - 44
core/encoding/xml/xml_reader.odin

@@ -56,7 +56,7 @@ Option_Flag :: enum {
 Option_Flags :: bit_set[Option_Flag; u16]
 
 Document :: struct {
-	elements:      [dynamic]Element,
+	elements:      [dynamic]Element `fmt:"v,element_count"`,
 	element_count: Element_ID,
 
 	prologue: Attributes,
@@ -70,15 +70,15 @@ Document :: struct {
 
 	// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
 	// Otherwise they'll be in the element tree.
-	comments: [dynamic]string,
+	comments: [dynamic]string        `fmt:"-"`,
 
 	// Internal
-	tokenizer: ^Tokenizer,
-	allocator: mem.Allocator,
+	tokenizer: ^Tokenizer            `fmt:"-"`,
+	allocator: mem.Allocator         `fmt:"-"`,
 
 	// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
-	input:           []u8,
-	strings_to_free: [dynamic]string,
+	input:           []u8            `fmt:"-"`,
+	strings_to_free: [dynamic]string `fmt:"-"`,
 }
 
 Element :: struct {
@@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
 	loop: for {
 		skip_whitespace(t)
-		// NOTE(Jeroen): This is faster as a switch.
 		switch t.ch {
 		case '<':
 			// Consume peeked `<`
@@ -306,9 +305,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 						}
 					}
 
+				case .Open_Bracket:
+					// This could be a CDATA tag that's part of a tag's body. Unread the `<![`.
+					t.offset -= 3
+					parse_body(doc, element, opts) or_return
+
 				case:
-					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
-					return
+					error(t, t.offset, "Unexpected Token after <!: %#v", next)
 				}
 
 			} else if open.kind == .Question {
@@ -341,38 +344,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
 		case:
 			// This should be a tag's body text.
-			body_text        := scan_string(t, t.offset) or_return
-			needs_processing := .Unbox_CDATA          in opts.flags
-			needs_processing |= .Decode_SGML_Entities in opts.flags
-
-			if !needs_processing {
-				append(&doc.elements[element].value, body_text)
-				continue
-			}
-
-			decode_opts := entity.XML_Decode_Options{}
-			if .Keep_Tag_Body_Comments not_in opts.flags {
-				decode_opts += { .Comment_Strip }
-			}
-
-			if .Decode_SGML_Entities not_in opts.flags {
-				decode_opts += { .No_Entity_Decode }
-			}
-
-			if .Unbox_CDATA in opts.flags {
-				decode_opts += { .Unbox_CDATA }
-				if .Decode_SGML_Entities in opts.flags {
-					decode_opts += { .Decode_CDATA }
-				}
-			}
-
-			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
-			if decode_err == .None {
-				append(&doc.elements[element].value, decoded)
-				append(&doc.strings_to_free, decoded)
-			} else {
-				append(&doc.elements[element].value, body_text)
-			}
+			parse_body(doc, element, opts) or_return
 		}
 	}
 
@@ -457,8 +429,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
 	t := doc.tokenizer
 
 	key    := expect(t, .Ident)  or_return
-	offset  = t.offset - len(key.text)
-
 	_       = expect(t, .Eq)     or_return
 	value  := expect(t, .String, multiline_string=true) or_return
 
@@ -591,6 +561,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
 	return .None
 }
 
+parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	body_text        := scan_string(t, t.offset) or_return
+	needs_processing := .Unbox_CDATA          in opts.flags
+	needs_processing |= .Decode_SGML_Entities in opts.flags
+
+	if !needs_processing {
+		append(&doc.elements[element].value, body_text)
+		return
+	}
+
+	decode_opts := entity.XML_Decode_Options{}
+	if .Keep_Tag_Body_Comments not_in opts.flags {
+		decode_opts += { .Comment_Strip }
+	}
+
+	if .Decode_SGML_Entities not_in opts.flags {
+		decode_opts += { .No_Entity_Decode }
+	}
+
+	if .Unbox_CDATA in opts.flags {
+		decode_opts += { .Unbox_CDATA }
+		if .Decode_SGML_Entities in opts.flags {
+			decode_opts += { .Decode_CDATA }
+		}
+	}
+
+	decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+	if decode_err == .None {
+		append(&doc.elements[element].value, decoded)
+		append(&doc.strings_to_free, decoded)
+	} else {
+		append(&doc.elements[element].value, body_text)
+	}
+
+	return
+}
+
 Element_ID :: u32
 
 new_element :: proc(doc: ^Document) -> (id: Element_ID) {
@@ -609,4 +620,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
 	cur := doc.element_count
 	doc.element_count += 1
 	return cur
-}
+}

+ 5 - 0
tests/core/assets/XML/entities.html

@@ -25,5 +25,10 @@
 		<div>
 			&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral; &bsemi;
 		</div>
+		<div> H<![CDATA[ellope!]]>Hellope!</div>
+		<div>	H<![CDATA[ellope!]]>Hellope!</div>
+		<div> H<![CDATA[ellope!]]>Hellope! </div>
+		<div>	H<![CDATA[ellope!]]>Hellope!	</div>
+		<div>H<![CDATA[ellope!]]>Hellope! </div>
 	</body>
 </html>

+ 6 - 6
tests/core/encoding/xml/test_core_xml.odin

@@ -114,7 +114,7 @@ xml_test_entities :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "html",
 		},
-		crc32     = 0x05373317,
+		crc32     = 0x48f41216,
 	})
 }
 
@@ -128,7 +128,7 @@ xml_test_entities_unbox :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "html",
 		},
-		crc32     = 0x350ca83e,
+		crc32     = 0xd0567818,
 	})
 }
 
@@ -142,7 +142,7 @@ xml_test_entities_unbox_decode :: proc(t: ^testing.T) {
 			},
 			expected_doctype = "html",
 		},
-		crc32     = 0x7f58db7d,
+		crc32     = 0x68d2571e,
 	})
 }
 
@@ -191,7 +191,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
 }
 
 @(private)
-run_test :: proc(t: ^testing.T, test: TEST) {
+run_test :: proc(t: ^testing.T, test: TEST, loc := #caller_location) {
 	path := strings.concatenate({TEST_SUITE_PATH, test.filename})
 	defer delete(path)
 
@@ -205,10 +205,10 @@ run_test :: proc(t: ^testing.T, test: TEST) {
 	crc32 := hash.crc32(tree_bytes)
 
 	failed := err != test.err
-	testing.expectf(t, err == test.err, "%v: Expected return value %v, got %v", test.filename, test.err, err)
+	testing.expectf(t, err == test.err, "%v: Expected return value %v, got %v", test.filename, test.err, err, loc=loc)
 
 	failed |= crc32 != test.crc32
-	testing.expectf(t, crc32 == test.crc32, "%v: Expected CRC 0x%08x, got 0x%08x, with options %v", test.filename, test.crc32, crc32, test.options)
+	testing.expectf(t, crc32 == test.crc32, "%v: Expected CRC 0x%08x, got 0x%08x, with options %v", test.filename, test.crc32, crc32, test.options, loc=loc)
 
 	if failed {
 		// Don't fully print big trees.