Selaa lähdekoodia

[xml] Implement optional unboxing of CDATA and decoding of tag values.

Jeroen van Rijn 3 vuotta sitten
vanhempi
commit
3d72e80ccf

+ 24 - 15
core/encoding/entity/entity.odin

@@ -60,16 +60,22 @@ COMMENT_END   :: "-->"
 	Default: CDATA and comments are passed through unchanged.
 */
 XML_Decode_Option :: enum u8 {
+	/*
+		Do not decode `&entity;` references. They are decoded by default.
+		If given, overrides `Decode_CDATA`.
+	*/
+	No_Entity_Decode,
+
 	/*
 		CDATA is unboxed.
 	*/
-	CDATA_Unbox,
+	Unbox_CDATA,
 
 	/*
 		Unboxed CDATA is decoded as well.
-		Ignored if `.CDATA_Unbox` is not given.
+		Ignored if `.Unbox_CDATA` is not given.
 	*/
-	CDATA_Decode,
+	Decode_CDATA,
 
 	/*
 		Comments are stripped.
@@ -129,7 +135,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 			}
 
 		case:
-			if in_data && .CDATA_Decode not_in options {
+			if in_data && .Decode_CDATA not_in options {
 				/*
 					Unboxed, but undecoded.
 				*/
@@ -145,17 +151,20 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 					*/
 					write_string(&builder, entity)
 				} else {
-					if decoded, ok := xml_decode_entity(entity); ok {
-						write_rune(&builder, decoded)
-					} else {
-						/*
-							Decode failed. Pass through original.
-						*/
-						write_string(&builder, "&")
-						write_string(&builder, entity)
-						write_string(&builder, ";")
+
+					if .No_Entity_Decode not_in options {
+						if decoded, ok := xml_decode_entity(entity); ok {
+							write_rune(&builder, decoded)
+							continue
+						}
 					}
 
+					/*
+						Literal passthrough because the decode failed or we want entities not decoded.
+					*/
+					write_string(&builder, "&")
+					write_string(&builder, entity)
+					write_string(&builder, ";")
 				}
 			} else {
 				write_rune(&builder, t.r)
@@ -290,7 +299,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
 		t.read_offset += len(CDATA_START) - 1
 
-		if .CDATA_Unbox in options && .CDATA_Decode in options {
+		if .Unbox_CDATA in options && .Decode_CDATA in options {
 			/*
 				We're unboxing _and_ decoding CDATA
 			*/
@@ -315,7 +324,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 
 					cdata := string(t.src[offset : t.read_offset])
 	
-					if .CDATA_Unbox in options {
+					if .Unbox_CDATA in options {
 						cdata = cdata[len(CDATA_START):]
 						cdata = cdata[:len(cdata) - len(CDATA_END)]
 					}

+ 10 - 57
core/encoding/entity/example/entity_example.odin

@@ -1,19 +1,11 @@
 package unicode_entity_example
 
 import "core:encoding/xml"
-import "core:encoding/entity"
 import "core:strings"
 import "core:mem"
 import "core:fmt"
 import "core:time"
 
-OPTIONS  :: xml.Options{
-	flags            = {
-		.Ignore_Unsupported, .Intern_Comments,
-	},
-	expected_doctype = "",
-}
-
 doc_print :: proc(doc: ^xml.Document) {
 	buf: strings.Builder
 	defer strings.destroy_builder(&buf)
@@ -29,6 +21,13 @@ _entities :: proc() {
 
 	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
 
+	OPTIONS  :: xml.Options{
+		flags            = {
+			.Ignore_Unsupported, .Intern_Comments,
+		},
+		expected_doctype = "",
+	}
+
 	parse_duration: time.Duration
 
 	{
@@ -50,57 +49,11 @@ _entities :: proc() {
 _main :: proc() {
 	using fmt
 
-	doc, err := xml.parse(#load("test.html"))
+	options := xml.Options{ flags = { .Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities }}
+	doc, _ := xml.parse(#load("test.html"), options)
+
 	defer xml.destroy(doc)
 	doc_print(doc)
-
-	if false {
-		val := doc.root.children[1].children[2].value
-
-		println()
-		replaced, ok := entity.decode_xml(val)
-		defer delete(replaced)
-
-		printf("Before:      '%v', Err: %v\n", val, err)
-		printf("Passthrough: '%v'\nOK: %v\n", replaced, ok)
-		println()
-	}
-
-	if false {
-		val := doc.root.children[1].children[2].value
-
-		println()
-		replaced, ok := entity.decode_xml(val, { .CDATA_Unbox })
-		defer delete(replaced)
-
-		printf("Before:      '%v', Err: %v\n", val, err)
-		printf("CDATA_Unbox: '%v'\nOK: %v\n", replaced, ok)
-		println()
-	}
-
-	if true {
-		val := doc.root.children[1].children[2].value
-
-		println()
-		replaced, ok := entity.decode_xml(val, { .CDATA_Unbox, .CDATA_Decode })
-		defer delete(replaced)
-
-		printf("Before: '%v', Err: %v\n", val, err)
-		printf("CDATA_Decode: '%v'\nOK: %v\n", replaced, ok)
-		println()
-	}
-
-	if true {
-		val := doc.root.children[1].children[1].value
-
-		println()
-		replaced, ok := entity.decode_xml(val, { .Comment_Strip })
-		defer delete(replaced)
-
-		printf("Before: '%v', Err: %v\n", val, err)
-		printf("Comment_Strip: '%v'\nOK: %v\n", replaced, ok)
-		println()
-	}
 }
 
 main :: proc() {

+ 2 - 0
core/encoding/entity/example/test.html

@@ -16,9 +16,11 @@
 		<div id="test_cdata_in_comment" foo="">
 			Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
 		</div>
+		<!-- EXPECTED: Foozle]! © 42&;1234& -->
 		<div id="test_cdata_unwrap_and_passthrough">
 			Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
 		</div>
+		<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
 		<div>
 			&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral;
 		</div>

+ 20 - 21
core/encoding/xml/xml_reader.odin

@@ -18,10 +18,6 @@ package xml
 		- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
 		- <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
 
-	TODO:
-	- Optional CDATA unboxing.
-	- Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies.
-
 	MAYBE:
 	- XML writer?
 	- Serialize/deserialize Odin types?
@@ -31,6 +27,7 @@ package xml
 */
 
 import "core:strings"
+import "core:encoding/entity"
 import "core:mem"
 import "core:os"
 
@@ -196,12 +193,6 @@ Error :: enum {
 
 	Duplicate_Attribute,
 	Conflicting_Options,
-
-	/*
-		Unhandled TODO:
-	*/
-	Unhandled_CDATA_Unboxing,
-	Unhandled_SGML_Entity_Decoding,
 }
 
 /*
@@ -422,8 +413,25 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 			/*
 				This should be a tag's body text.
 			*/
-			body_text    := scan_string(t, t.offset) or_return
-			element.value = strings.intern_get(&doc.intern, body_text)
+			body_text   := scan_string(t, t.offset) or_return
+
+			decode_opts := entity.XML_Decode_Options{ .Comment_Strip }
+
+			if .Decode_SGML_Entities not_in opts.flags {
+				decode_opts += { .No_Entity_Decode }
+			}
+			if .Unbox_CDATA in opts.flags {
+				decode_opts += { .Unbox_CDATA, .Decode_CDATA }
+			}
+
+			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+			defer delete(decoded)
+
+			if decode_err == .None {
+				element.value = strings.intern_get(&doc.intern, decoded)
+			} else {
+				element.value = strings.intern_get(&doc.intern, body_text)
+			}
 		}
 	}
 
@@ -488,15 +496,6 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
 	if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
 		return options, .Conflicting_Options
 	}
-
-	if .Unbox_CDATA in validated.flags {
-		return options, .Unhandled_CDATA_Unboxing
-	}
-
-	if .Decode_SGML_Entities in validated.flags {
-		return options, .Unhandled_SGML_Entity_Decoding
-	}
-
 	return validated, .None
 }