generate_entity_table.odin 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. package xml_example
  2. import "core:encoding/xml"
  3. import "core:os"
  4. import "core:path"
  5. import "core:mem"
  6. import "core:strings"
  7. import "core:strconv"
  8. import "core:slice"
  9. import "core:fmt"
  10. /*
  11. Silent error handler for the parser.
  12. */
  13. Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
  14. OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
  15. Entity :: struct {
  16. name: string,
  17. codepoint: rune,
  18. description: string,
  19. }
  20. generate_encoding_entity_table :: proc() {
  21. using fmt
  22. filename := path.join(ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml")
  23. defer delete(filename)
  24. generated_filename := path.join(ODIN_ROOT, "core", "encoding", "entity", "generated.odin")
  25. defer delete(generated_filename)
  26. doc, err := xml.parse(filename, OPTIONS, Error_Handler)
  27. defer xml.destroy(doc)
  28. if err != .None {
  29. printf("Load/Parse error: %v\n", err)
  30. if err == .File_Error {
  31. printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
  32. }
  33. os.exit(1)
  34. }
  35. printf("\"%v\" loaded and parsed.\n", filename)
  36. generated_buf: strings.Builder
  37. defer strings.builder_destroy(&generated_buf)
  38. w := strings.to_writer(&generated_buf)
  39. charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
  40. if !charlist_ok {
  41. eprintln("Could not locate top-level `<charlist>` tag.")
  42. os.exit(1)
  43. }
  44. printf("Found `<charlist>` with %v children.\n", len(charlist.children))
  45. entity_map: map[string]Entity
  46. names: [dynamic]string
  47. min_name_length := max(int)
  48. max_name_length := min(int)
  49. shortest_name: string
  50. longest_name: string
  51. count := 0
  52. for char in charlist.children {
  53. if char.ident != "character" {
  54. eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
  55. os.exit(1)
  56. }
  57. if codepoint_string, ok := xml.find_attribute_val_by_key(char, "dec"); !ok {
  58. eprintln("`<character id=\"...\">` attribute not found.")
  59. os.exit(1)
  60. } else {
  61. codepoint := strconv.atoi(codepoint_string)
  62. desc, desc_ok := xml.find_child_by_ident(char, "description")
  63. description := desc.value if desc_ok else ""
  64. /*
  65. For us to be interested in this codepoint, it has to have at least one entity.
  66. */
  67. nth := 0
  68. for {
  69. character_entity, entity_ok := xml.find_child_by_ident(char, "entity", nth)
  70. if !entity_ok { break }
  71. nth += 1
  72. if name, name_ok := xml.find_attribute_val_by_key(character_entity, "id"); name_ok {
  73. if len(name) == 0 {
  74. /*
  75. Invalid name. Skip.
  76. */
  77. continue
  78. }
  79. if name == "\"\"" {
  80. printf("%#v\n", char)
  81. printf("%#v\n", character_entity)
  82. }
  83. if len(name) > max_name_length { longest_name = name }
  84. if len(name) < min_name_length { shortest_name = name }
  85. min_name_length = min(min_name_length, len(name))
  86. max_name_length = max(max_name_length, len(name))
  87. e := Entity{
  88. name = name,
  89. codepoint = rune(codepoint),
  90. description = description,
  91. }
  92. if _, seen := entity_map[name]; seen {
  93. continue
  94. }
  95. entity_map[name] = e
  96. append(&names, name)
  97. count += 1
  98. }
  99. }
  100. }
  101. }
  102. /*
  103. Sort by name.
  104. */
  105. slice.sort(names[:])
  106. printf("Found %v unique `&name;` -> rune mappings.\n", count)
  107. printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
  108. printf("Longest name: %v (%v)\n", longest_name, max_name_length)
  109. // println(rune_to_string(1234))
  110. /*
  111. Generate table.
  112. */
  113. wprintln(w, "package unicode_entity")
  114. wprintln(w, "")
  115. wprintln(w, GENERATED)
  116. wprintln(w, "")
  117. wprintf (w, TABLE_FILE_PROLOG)
  118. wprintln(w, "")
  119. wprintf (w, "// `&%v;`\n", shortest_name)
  120. wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
  121. wprintf (w, "// `&%v;`\n", longest_name)
  122. wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
  123. wprintln(w, "")
  124. wprintln(w,
  125. `
  126. /*
  127. Input:
  128. entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.
  129. Output:
  130. "decoded" - The decoded rune if found by name, or -1 otherwise.
  131. "ok" - true if found, false if not.
  132. IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
  133. */
  134. named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
  135. /*
  136. Early out if the name is too short or too long.
  137. min as a precaution in case the generated table has a bogus value.
  138. */
  139. if len(name) < min(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
  140. return -1, false
  141. }
  142. switch rune(name[0]) {
  143. `)
  144. prefix := '?'
  145. should_close := false
  146. for v in names {
  147. if rune(v[0]) != prefix {
  148. if should_close {
  149. wprintln(w, "\t\t}\n")
  150. }
  151. prefix = rune(v[0])
  152. wprintf (w, "\tcase '%v':\n", prefix)
  153. wprintln(w, "\t\tswitch name {")
  154. }
  155. e := entity_map[v]
  156. wprintf(w, "\t\t\tcase \"%v\": \n", e.name)
  157. wprintf(w, "\t\t\t\t// %v\n", e.description)
  158. wprintf(w, "\t\t\t\treturn %v, true\n", rune_to_string(e.codepoint))
  159. should_close = true
  160. }
  161. wprintln(w, "\t\t}")
  162. wprintln(w, "\t}")
  163. wprintln(w, "\treturn -1, false")
  164. wprintln(w, "}\n")
  165. wprintln(w, GENERATED)
  166. println()
  167. println(strings.to_string(generated_buf))
  168. println()
  169. written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))
  170. if written {
  171. fmt.printf("Successfully written generated \"%v\".", generated_filename)
  172. } else {
  173. fmt.printf("Failed to write generated \"%v\".", generated_filename)
  174. }
  175. delete(entity_map)
  176. delete(names)
  177. for name in &names {
  178. free(&name)
  179. }
  180. }
/*
	Banner comment written verbatim into the generated file by `generate_encoding_entity_table`,
	once after the package line and once at the very end.
*/
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/`
/*
	Provenance, update instructions and licence notice written verbatim into the generated file
	by `generate_encoding_entity_table`.
*/
TABLE_FILE_PROLOG :: `/*
This file is generated from "https://www.w3.org/2003/entities/2007xml/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
- Run "core/unicode/tools/generate_entity_table.odin"
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
Copyright © 2021 World Wide Web Consortium, (Massachusetts Institute of Technology,
European Research Consortium for Informatics and Mathematics, Keio University, Beihang).
All Rights Reserved.
This work is distributed under the W3C® Software License [1] in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
[1] http://www.w3.org/Consortium/Legal/copyright-software
See also: LICENSE_table.md
*/
`
  199. rune_to_string :: proc(r: rune) -> (res: string) {
  200. res = fmt.tprintf("%08x", int(r))
  201. for len(res) > 2 && res[:2] == "00" {
  202. res = res[2:]
  203. }
  204. return fmt.tprintf("rune(0x%v)", res)
  205. }
  206. is_dotted_name :: proc(name: string) -> (dotted: bool) {
  207. for r in name {
  208. if r == '.' { return true}
  209. }
  210. return false
  211. }
  212. main :: proc() {
  213. using fmt
  214. track: mem.Tracking_Allocator
  215. mem.tracking_allocator_init(&track, context.allocator)
  216. context.allocator = mem.tracking_allocator(&track)
  217. generate_encoding_entity_table()
  218. if len(track.allocation_map) > 0 {
  219. println()
  220. for _, v in track.allocation_map {
  221. printf("%v Leaked %v bytes.\n", v.location, v.size)
  222. }
  223. }
  224. println("Done and cleaned up!")
  225. }