package unicode_entity
/*
	A Unicode entity encoder/decoder

	Copyright 2021 Jeroen van Rijn <[email protected]>.
	Made available under Odin's BSD-3 license.

	This code has several procedures to map Unicode runes to/from different textual encodings:

	- SGML/XML/HTML entity
		- &#<decimal>;
		- &#x<hexadecimal>;
		- &<entity name>; (if the lookup tables are compiled in)
		Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml

	- URL encode / decode %hex entity
		Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/
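/*
	For illustration, the rune 'é' (U+00E9) can be written as:
	- "&#233;"   (decimal entity)
	- "&#xE9;"   (hexadecimal entity)
	- "&eacute;" (named entity, if the lookup tables are compiled in)
	- "%C3%A9"   (URL-encoded UTF-8 bytes)
*/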
import "core:unicode/utf8"
import "core:unicode"
import "core:strings"

MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)

write_rune   :: strings.write_rune
write_string :: strings.write_string

Error :: enum u8 {
	None = 0,
	Tokenizer_Is_Nil,
	Illegal_NUL_Character,
	Illegal_UTF_Encoding,
	Illegal_BOM,
	CDATA_Not_Terminated,
	Comment_Not_Terminated,
	Invalid_Entity_Encoding,
}

Tokenizer :: struct {
	r:           rune,
	w:           int,
	src:         string,
	offset:      int,
	read_offset: int,
}

CDATA_START :: "<![CDATA["
CDATA_END   :: "]]>"

COMMENT_START :: "<!--"
COMMENT_END   :: "-->"

/*
	Default: CDATA and comments are passed through unchanged.
*/
XML_Decode_Option :: enum u8 {
	/*
		Do not decode `&...;` entities. Entities are decoded by default.
		If given, overrides `Decode_CDATA`.
	*/
	No_Entity_Decode,

	/*
		CDATA is unboxed.
	*/
	Unbox_CDATA,

	/*
		Unboxed CDATA is decoded as well.
		Ignored if `.Unbox_CDATA` is not given.
	*/
	Decode_CDATA,

	/*
		Comments are stripped.
	*/
	Comment_Strip,
}
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
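/*
	For illustration, options combine as a bit_set: for example,
	`XML_Decode_Options{.Unbox_CDATA, .Decode_CDATA}` unboxes CDATA sections and decodes
	the entities inside them, while the zero value `XML_Decode_Options{}` keeps the
	defaults described above.
*/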
/*
	Decode a string that may include SGML/XML/HTML entities.
	The caller has to free the result.
*/
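/*
	A minimal usage sketch, assuming this package is imported under the name `entity`:

		decoded, err := entity.decode_xml("caf&#233; &#x263A;")
		defer delete(decoded)
		// `decoded` is now "café ☺" and `err` is `.None`.
*/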
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
	context.allocator = allocator

	l := len(input)
	if l == 0 { return "", .None }

	builder := strings.builder_make()
	defer strings.builder_destroy(&builder)

	t := Tokenizer{src=input}
	in_data := false

	loop: for {
		advance(&t) or_return
		if t.r < 0 { break loop }

		/*
			Below here we're never inside a passed-through CDATA section.
			At most we'll see the start of one, but that doesn't affect the logic.
		*/
		switch t.r {
		case '<':
			/*
				Might be the start of a CDATA tag or comment.
				We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
				it couldn't have been part of an XML tag body to be decoded here.

				Keep in mind that we could already *be* inside a CDATA tag.
				If so, write `<` as a literal and continue.
			*/
			if in_data {
				write_rune(&builder, '<')
				continue
			}
			in_data = _handle_xml_special(&t, &builder, options) or_return

		case ']':
			/*
				If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
			*/
			if in_data {
				if t.read_offset + len(CDATA_END) < len(t.src) {
					if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
						in_data = false
						t.read_offset += len(CDATA_END) - 1
					}
				}
				continue
			} else {
				write_rune(&builder, ']')
			}

		case:
			if in_data && .Decode_CDATA not_in options {
				/*
					Unboxed, but undecoded.
				*/
				write_rune(&builder, t.r)
				continue
			}

			if t.r == '&' {
				if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
					/*
						We read to the end of the string without closing the entity.
						Pass through as-is.
					*/
					write_string(&builder, entity)
				} else {
					if .No_Entity_Decode not_in options {
						if decoded, ok := xml_decode_entity(entity); ok {
							write_rune(&builder, decoded)
							continue
						}
					}
					/*
						Literal passthrough because the decode failed or we want entities left undecoded.
					*/
					write_string(&builder, "&")
					write_string(&builder, entity)
					write_string(&builder, ";")
				}
			} else {
				write_rune(&builder, t.r)
			}
		}
	}
	return strings.clone(strings.to_string(builder), allocator), err
}
advance :: proc(t: ^Tokenizer) -> (err: Error) {
	if t == nil { return .Tokenizer_Is_Nil }

	#no_bounds_check {
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			t.r, t.w = rune(t.src[t.read_offset]), 1
			switch {
			case t.r == 0:
				return .Illegal_NUL_Character
			case t.r >= utf8.RUNE_SELF:
				t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])
				if t.r == utf8.RUNE_ERROR && t.w == 1 {
					return .Illegal_UTF_Encoding
				} else if t.r == utf8.RUNE_BOM && t.offset > 0 {
					return .Illegal_BOM
				}
			}
			t.read_offset += t.w
			return .None
		} else {
			t.offset = len(t.src)
			t.r = -1
			return
		}
	}
}
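/*
	Decode a single entity body (the text between `&` and `;`) into a rune.
	For illustration: "#233" and "#xE9" both decode to 'é' (U+00E9); any other
	body is looked up as a named entity.
*/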
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
	entity := entity
	if len(entity) == 0 { return -1, false }

	switch entity[0] {
	case '#':
		base := 10
		val  := 0

		entity = entity[1:]
		if len(entity) == 0 { return -1, false }

		if entity[0] == 'x' || entity[0] == 'X' {
			base = 16
			entity = entity[1:]
		}

		for len(entity) > 0 {
			r := entity[0]
			switch r {
			case '0'..='9':
				val *= base
				val += int(r - '0')
			case 'a'..='f':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'a' + 10)
			case 'A'..='F':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'A' + 10)
			case:
				return -1, false
			}
			if val > MAX_RUNE_CODEPOINT { return -1, false }
			entity = entity[1:]
		}
		return rune(val), true

	case:
		/*
			Named entity.
		*/
		return named_xml_entity_to_rune(entity)
	}
}
/*
	Private XML helper to extract an `&<stuff>;` entity.
*/
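/*
	For illustration: with the tokenizer positioned on the `&` of "&#xE9;", this
	returns "#xE9" and leaves `read_offset` just past the `;`.
*/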
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
	assert(t != nil && t.r == '&')

	/*
		Entity characters should all be in the ASCII range.
		Even if one is not, it doesn't matter: every character we compare against
		while extracting is.
	*/
	length := len(t.src)
	found  := false

	#no_bounds_check {
		for t.read_offset < length {
			if t.src[t.read_offset] == ';' {
				t.read_offset += 1
				found = true
				break
			}
			t.read_offset += 1
		}
	}

	if found {
		return string(t.src[t.offset + 1 : t.read_offset - 1]), .None
	}
	return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
}
/*
	Private XML helper for CDATA and comments.
*/
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
	assert(t != nil && t.r == '<')

	if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }

	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
		t.read_offset += len(CDATA_START) - 1

		if .Unbox_CDATA in options && .Decode_CDATA in options {
			/*
				We're unboxing _and_ decoding CDATA.
			*/
			return true, .None
		}

		/*
			CDATA is passed through.
		*/
		offset := t.offset

		/*
			Scan until end of CDATA.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .CDATA_Not_Terminated }

			if t.read_offset + len(CDATA_END) < len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
					t.read_offset += len(CDATA_END) - 1

					cdata := string(t.src[offset : t.read_offset])
					if .Unbox_CDATA in options {
						cdata = cdata[len(CDATA_START):]
						cdata = cdata[:len(cdata) - len(CDATA_END)]
					}
					write_string(builder, cdata)
					return false, .None
				}
			}
		}
	} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
		t.read_offset += len(COMMENT_START)

		/*
			Comment is passed through by default.
		*/
		offset := t.offset

		/*
			Scan until end of comment.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .Comment_Not_Terminated }

			if t.read_offset + len(COMMENT_END) < len(t.src) {
				if string(t.src[t.offset:][:len(COMMENT_END)]) == COMMENT_END {
					t.read_offset += len(COMMENT_END) - 1

					if .Comment_Strip not_in options {
						comment := string(t.src[offset : t.read_offset])
						write_string(builder, comment)
					}
					return false, .None
				}
			}
		}
	}
	return false, .None
}