generate_entity_table.odin 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. package xml_example
  2. import "core:encoding/xml"
  3. import "core:os"
  4. import "core:path"
  5. import "core:mem"
  6. import "core:strings"
  7. import "core:strconv"
  8. import "core:slice"
  9. import "core:fmt"
  10. /*
  11. Silent error handler for the parser.
  12. */
  13. Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
  14. OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
  15. Entity :: struct {
  16. name: string,
  17. codepoint: rune,
  18. description: string,
  19. }
  20. generate_encoding_entity_table :: proc() {
  21. using fmt
  22. filename := path.join(ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml")
  23. defer delete(filename)
  24. generated_filename := path.join(ODIN_ROOT, "core", "encoding", "entity", "generated.odin")
  25. defer delete(generated_filename)
  26. doc, err := xml.parse(filename, OPTIONS, Error_Handler)
  27. defer xml.destroy(doc)
  28. if err != .None {
  29. printf("Load/Parse error: %v\n", err)
  30. if err == .File_Error {
  31. printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
  32. }
  33. os.exit(1)
  34. }
  35. printf("\"%v\" loaded and parsed.\n", filename)
  36. generated_buf: strings.Builder
  37. defer strings.builder_destroy(&generated_buf)
  38. w := strings.to_writer(&generated_buf)
  39. charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
  40. if !charlist_ok {
  41. eprintln("Could not locate top-level `<charlist>` tag.")
  42. os.exit(1)
  43. }
  44. printf("Found `<charlist>` with %v children.\n", len(charlist.children))
  45. entity_map: map[string]Entity
  46. names: [dynamic]string
  47. min_name_length := max(int)
  48. max_name_length := min(int)
  49. shortest_name: string
  50. longest_name: string
  51. count := 0
  52. for char in charlist.children {
  53. if char.ident != "character" {
  54. eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
  55. os.exit(1)
  56. }
  57. if codepoint_string, ok := xml.find_attribute_val_by_key(char, "dec"); !ok {
  58. eprintln("`<character id=\"...\">` attribute not found.")
  59. os.exit(1)
  60. } else {
  61. codepoint := strconv.atoi(codepoint_string)
  62. desc, desc_ok := xml.find_child_by_ident(char, "description")
  63. description := desc.value if desc_ok else ""
  64. /*
  65. For us to be interested in this codepoint, it has to have at least one entity.
  66. */
  67. nth := 0
  68. for {
  69. character_entity := xml.find_child_by_ident(char, "entity", nth) or_break
  70. nth += 1
  71. name := xml.find_attribute_val_by_key(character_entity, "id") or_continue
  72. if len(name) == 0 {
  73. /*
  74. Invalid name. Skip.
  75. */
  76. continue
  77. }
  78. if name == "\"\"" {
  79. printf("%#v\n", char)
  80. printf("%#v\n", character_entity)
  81. }
  82. if len(name) > max_name_length { longest_name = name }
  83. if len(name) < min_name_length { shortest_name = name }
  84. min_name_length = min(min_name_length, len(name))
  85. max_name_length = max(max_name_length, len(name))
  86. e := Entity{
  87. name = name,
  88. codepoint = rune(codepoint),
  89. description = description,
  90. }
  91. if name in entity_map {
  92. continue
  93. }
  94. entity_map[name] = e
  95. append(&names, name)
  96. count += 1
  97. }
  98. }
  99. }
  100. /*
  101. Sort by name.
  102. */
  103. slice.sort(names[:])
  104. printf("Found %v unique `&name;` -> rune mappings.\n", count)
  105. printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
  106. printf("Longest name: %v (%v)\n", longest_name, max_name_length)
  107. // println(rune_to_string(1234))
  108. /*
  109. Generate table.
  110. */
  111. wprintln(w, "package unicode_entity")
  112. wprintln(w, "")
  113. wprintln(w, GENERATED)
  114. wprintln(w, "")
  115. wprintf (w, TABLE_FILE_PROLOG)
  116. wprintln(w, "")
  117. wprintf (w, "// `&%v;`\n", shortest_name)
  118. wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
  119. wprintf (w, "// `&%v;`\n", longest_name)
  120. wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
  121. wprintln(w, "")
  122. wprintln(w,
  123. `
  124. /*
  125. Input:
  126. entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.
  127. Output:
  128. "decoded" - The decoded rune if found by name, or -1 otherwise.
  129. "ok" - true if found, false if not.
  130. IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
  131. */
  132. named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
  133. /*
  134. Early out if the name is too short or too long.
  135. min as a precaution in case the generated table has a bogus value.
  136. */
  137. if len(name) < min(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
  138. return -1, false
  139. }
  140. switch rune(name[0]) {
  141. `)
  142. prefix := '?'
  143. should_close := false
  144. for v in names {
  145. if rune(v[0]) != prefix {
  146. if should_close {
  147. wprintln(w, "\t\t}\n")
  148. }
  149. prefix = rune(v[0])
  150. wprintf (w, "\tcase '%v':\n", prefix)
  151. wprintln(w, "\t\tswitch name {")
  152. }
  153. e := entity_map[v]
  154. wprintf(w, "\t\t\tcase \"%v\": \n", e.name)
  155. wprintf(w, "\t\t\t\t// %v\n", e.description)
  156. wprintf(w, "\t\t\t\treturn %v, true\n", rune_to_string(e.codepoint))
  157. should_close = true
  158. }
  159. wprintln(w, "\t\t}")
  160. wprintln(w, "\t}")
  161. wprintln(w, "\treturn -1, false")
  162. wprintln(w, "}\n")
  163. wprintln(w, GENERATED)
  164. println()
  165. println(strings.to_string(generated_buf))
  166. println()
  167. written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))
  168. if written {
  169. fmt.printf("Successfully written generated \"%v\".", generated_filename)
  170. } else {
  171. fmt.printf("Failed to write generated \"%v\".", generated_filename)
  172. }
  173. delete(entity_map)
  174. delete(names)
  175. for &name in names {
  176. free(&name)
  177. }
  178. }
/*
	Banner comment that `generate_encoding_entity_table` writes near the top and again at the
	very end of the generated file. It is emitted verbatim, so it must remain a valid comment.
*/
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/`
/*
	Header comment (provenance, update instructions and license notice) that
	`generate_encoding_entity_table` writes into the generated file after the `GENERATED` banner.
	Keep this text free of `%` characters so it stays safe whether it is written verbatim
	or handed to a printf-style procedure.
*/
TABLE_FILE_PROLOG :: `/*
This file is generated from "https://www.w3.org/2003/entities/2007xml/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
- Run "core/unicode/tools/generate_entity_table.odin"
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
Copyright © 2021 World Wide Web Consortium, (Massachusetts Institute of Technology,
European Research Consortium for Informatics and Mathematics, Keio University, Beihang).
All Rights Reserved.
This work is distributed under the W3C® Software License [1] in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
[1] http://www.w3.org/Consortium/Legal/copyright-software
See also: LICENSE_table.md
*/
`
  197. rune_to_string :: proc(r: rune) -> (res: string) {
  198. res = fmt.tprintf("%08x", int(r))
  199. for len(res) > 2 && res[:2] == "00" {
  200. res = res[2:]
  201. }
  202. return fmt.tprintf("rune(0x%v)", res)
  203. }
  204. is_dotted_name :: proc(name: string) -> (dotted: bool) {
  205. for r in name {
  206. if r == '.' { return true}
  207. }
  208. return false
  209. }
  210. main :: proc() {
  211. using fmt
  212. track: mem.Tracking_Allocator
  213. mem.tracking_allocator_init(&track, context.allocator)
  214. context.allocator = mem.tracking_allocator(&track)
  215. generate_encoding_entity_table()
  216. if len(track.allocation_map) > 0 {
  217. println()
  218. for _, v in track.allocation_map {
  219. printf("%v Leaked %v bytes.\n", v.location, v.size)
  220. }
  221. }
  222. println("Done and cleaned up!")
  223. }