// entity.odin
  1. /*
  2. Copyright 2021 Jeroen van Rijn <[email protected]>.
  3. Made available under Odin's BSD-3 license.
  4. List of contributors:
  5. Jeroen van Rijn: Initial implementation.
  6. */
  7. /*
  8. A unicode entity encoder/decoder.
  9. This code has several procedures to map unicode runes to/from different textual encodings.
  10. - SGML/XML/HTML entity
  11. - &#<decimal>;
  12. - &#x<hexadecimal>;
  13. - &<entity name>; (If the lookup tables are compiled in).
  14. Reference: [[ https://www.w3.org/2003/entities/2007xml/unicode.xml ]]
  15. - URL encode / decode %hex entity
  16. Reference: [[ https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1 ]]
  17. */
  18. package encoding_unicode_entity
  19. import "core:unicode/utf8"
  20. import "core:unicode"
  21. import "core:strings"
// Upper bound (inclusive) of a valid Unicode code point, as an `int` for range checks.
MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)

// Local shorthands for the `core:strings` builder helpers used throughout this file.
write_rune   :: strings.write_rune
write_string :: strings.write_string

// Errors returned by the decoders and the internal tokenizer.
Error :: enum u8 {
	None = 0,
	Tokenizer_Is_Nil,
	Illegal_NUL_Character,
	Illegal_UTF_Encoding,
	Illegal_BOM,
	CDATA_Not_Terminated,
	Comment_Not_Terminated,
	Invalid_Entity_Encoding,
}

// Minimal rune tokenizer over an input string.
Tokenizer :: struct {
	r:           rune,   // current rune; -1 once the input is exhausted
	w:           int,    // byte width of the current rune
	src:         string, // the input being scanned
	offset:      int,    // byte offset of the current rune
	read_offset: int,    // byte offset of the next rune to read
}

CDATA_START   :: "<![CDATA["
CDATA_END     :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END   :: "-->"

// Default: CDATA and comments are passed through unchanged.
XML_Decode_Option :: enum u8 {
	// Do not decode & entities. It decodes by default. If given, overrides `Decode_CDATA`.
	No_Entity_Decode,
	// CDATA is unboxed.
	Unbox_CDATA,
	// Unboxed CDATA is decoded as well. Ignored if `.Unbox_CDATA` is not given.
	Decode_CDATA,
	// Comments are stripped.
	Comment_Strip,
	// Normalize whitespace
	Normalize_Whitespace,
}
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
  60. // Decode a string that may include SGML/XML/HTML entities.
  61. // The caller has to free the result.
  62. decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
  63. context.allocator = allocator
  64. l := len(input)
  65. if l == 0 { return "", .None }
  66. builder := strings.builder_make()
  67. defer strings.builder_destroy(&builder)
  68. t := Tokenizer{src=input}
  69. in_data := false
  70. prev: rune = ' '
  71. loop: for {
  72. advance(&t) or_return
  73. if t.r < 0 { break loop }
  74. // Below here we're never inside a CDATA tag. At most we'll see the start of one,
  75. // but that doesn't affect the logic.
  76. switch t.r {
  77. case '<':
  78. /*
  79. Might be the start of a CDATA tag or comment.
  80. We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
  81. it couldn't have been part of an XML tag body to be decoded here.
  82. Keep in mind that we could already *be* inside a CDATA tag.
  83. If so, write `>` as a literal and continue.
  84. */
  85. if in_data {
  86. write_rune(&builder, '<')
  87. continue
  88. }
  89. in_data = _handle_xml_special(&t, &builder, options) or_return
  90. case ']':
  91. // If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
  92. if in_data {
  93. if t.read_offset + len(CDATA_END) < len(t.src) {
  94. if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
  95. in_data = false
  96. t.read_offset += len(CDATA_END) - 1
  97. }
  98. }
  99. continue
  100. } else {
  101. write_rune(&builder, ']')
  102. }
  103. case:
  104. if in_data && .Decode_CDATA not_in options {
  105. // Unboxed, but undecoded.
  106. write_rune(&builder, t.r)
  107. continue
  108. }
  109. if t.r == '&' {
  110. if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
  111. // We read to the end of the string without closing the entity. Pass through as-is.
  112. write_string(&builder, entity)
  113. } else {
  114. if .No_Entity_Decode not_in options {
  115. if decoded, ok := xml_decode_entity(entity); ok {
  116. write_rune(&builder, decoded)
  117. continue
  118. }
  119. }
  120. // Literal passthrough because the decode failed or we want entities not decoded.
  121. write_string(&builder, "&")
  122. write_string(&builder, entity)
  123. write_string(&builder, ";")
  124. }
  125. } else {
  126. // Handle AV Normalization: https://www.w3.org/TR/2006/REC-xml11-20060816/#AVNormalize
  127. if .Normalize_Whitespace in options {
  128. switch t.r {
  129. case ' ', '\r', '\n', '\t':
  130. if prev != ' ' {
  131. write_rune(&builder, ' ')
  132. prev = ' '
  133. }
  134. case:
  135. write_rune(&builder, t.r)
  136. prev = t.r
  137. }
  138. } else {
  139. // https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-line-ends
  140. switch t.r {
  141. case '\n', 0x85, 0x2028:
  142. write_rune(&builder, '\n')
  143. case '\r': // Do nothing until next character
  144. case:
  145. if prev == '\r' { // Turn a single carriage return into a \n
  146. write_rune(&builder, '\n')
  147. }
  148. write_rune(&builder, t.r)
  149. }
  150. prev = t.r
  151. }
  152. }
  153. }
  154. }
  155. return strings.clone(strings.to_string(builder), allocator), err
  156. }
  157. advance :: proc(t: ^Tokenizer) -> (err: Error) {
  158. if t == nil { return .Tokenizer_Is_Nil }
  159. #no_bounds_check {
  160. if t.read_offset < len(t.src) {
  161. t.offset = t.read_offset
  162. t.r, t.w = rune(t.src[t.read_offset]), 1
  163. switch {
  164. case t.r == 0:
  165. return .Illegal_NUL_Character
  166. case t.r >= utf8.RUNE_SELF:
  167. t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])
  168. if t.r == utf8.RUNE_ERROR && t.w == 1 {
  169. return .Illegal_UTF_Encoding
  170. } else if t.r == utf8.RUNE_BOM && t.offset > 0 {
  171. return .Illegal_BOM
  172. }
  173. }
  174. t.read_offset += t.w
  175. return .None
  176. } else {
  177. t.offset = len(t.src)
  178. t.r = -1
  179. return
  180. }
  181. }
  182. }
  183. xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
  184. entity := entity
  185. if len(entity) == 0 { return -1, false }
  186. switch entity[0] {
  187. case '#':
  188. base := 10
  189. val := 0
  190. entity = entity[1:]
  191. if len(entity) == 0 { return -1, false }
  192. if entity[0] == 'x' || entity[0] == 'X' {
  193. base = 16
  194. entity = entity[1:]
  195. }
  196. for len(entity) > 0 {
  197. r := entity[0]
  198. switch r {
  199. case '0'..='9':
  200. val *= base
  201. val += int(r - '0')
  202. case 'a'..='f':
  203. if base == 10 { return -1, false }
  204. val *= base
  205. val += int(r - 'a' + 10)
  206. case 'A'..='F':
  207. if base == 10 { return -1, false }
  208. val *= base
  209. val += int(r - 'A' + 10)
  210. case:
  211. return -1, false
  212. }
  213. if val > MAX_RUNE_CODEPOINT { return -1, false }
  214. entity = entity[1:]
  215. }
  216. return rune(val), true
  217. case:
  218. // Named entity.
  219. return named_xml_entity_to_rune(entity)
  220. }
  221. }
  222. // Private XML helper to extract `&<stuff>;` entity.
  223. @(private="file")
  224. _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
  225. assert(t != nil && t.r == '&')
  226. // All of these would be in the ASCII range.
  227. // Even if one is not, it doesn't matter. All characters we need to compare to extract are.
  228. length := len(t.src)
  229. found := false
  230. #no_bounds_check {
  231. for t.read_offset < length {
  232. if t.src[t.read_offset] == ';' {
  233. t.read_offset += 1
  234. found = true
  235. break
  236. }
  237. t.read_offset += 1
  238. }
  239. }
  240. if found {
  241. return string(t.src[t.offset + 1 : t.read_offset - 1]), .None
  242. }
  243. return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
  244. }
  245. // Private XML helper for CDATA and comments.
  246. @(private="file")
  247. _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
  248. assert(t != nil && t.r == '<')
  249. if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }
  250. if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
  251. t.read_offset += len(CDATA_START) - 1
  252. if .Unbox_CDATA in options && .Decode_CDATA in options {
  253. // We're unboxing _and_ decoding CDATA
  254. return true, .None
  255. }
  256. // CDATA is passed through.
  257. offset := t.offset
  258. // Scan until end of CDATA.
  259. for {
  260. advance(t) or_return
  261. if t.r < 0 { return true, .CDATA_Not_Terminated }
  262. if t.read_offset + len(CDATA_END) < len(t.src) {
  263. if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
  264. t.read_offset += len(CDATA_END) - 1
  265. cdata := string(t.src[offset : t.read_offset])
  266. if .Unbox_CDATA in options {
  267. cdata = cdata[len(CDATA_START):]
  268. cdata = cdata[:len(cdata) - len(CDATA_END)]
  269. }
  270. write_string(builder, cdata)
  271. return false, .None
  272. }
  273. }
  274. }
  275. } else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
  276. t.read_offset += len(COMMENT_START)
  277. // Comment is passed through by default.
  278. offset := t.offset
  279. // Scan until end of Comment.
  280. for {
  281. advance(t) or_return
  282. if t.r < 0 { return true, .Comment_Not_Terminated }
  283. if t.read_offset + len(COMMENT_END) < len(t.src) {
  284. if string(t.src[t.offset:][:len(COMMENT_END)]) == COMMENT_END {
  285. t.read_offset += len(COMMENT_END) - 1
  286. if .Comment_Strip not_in options {
  287. comment := string(t.src[offset : t.read_offset])
  288. write_string(builder, comment)
  289. }
  290. return false, .None
  291. }
  292. }
  293. }
  294. }
  295. return false, .None
  296. }