generate_entity_table.odin 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. package xml_example
  2. import "core:encoding/xml"
  3. import "core:os"
  4. import path "core:path/filepath"
  5. import "core:mem"
  6. import "core:strings"
  7. import "core:strconv"
  8. import "core:slice"
  9. import "core:fmt"
  10. /*
  11. Silent error handler for the parser.
  12. */
  13. Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
  14. OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
  15. Entity :: struct {
  16. name: string,
  17. codepoint: rune,
  18. description: string,
  19. }
  20. generate_encoding_entity_table :: proc() {
  21. filename := path.join({ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml"})
  22. defer delete(filename)
  23. generated_filename := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"})
  24. defer delete(generated_filename)
  25. doc, err := xml.load_from_file(filename, OPTIONS, Error_Handler)
  26. defer xml.destroy(doc)
  27. if err != .None {
  28. fmt.printf("Load/Parse error: %v\n", err)
  29. if err == .File_Error {
  30. fmt.printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
  31. }
  32. os.exit(1)
  33. }
  34. fmt.printf("\"%v\" loaded and parsed.\n", filename)
  35. generated_buf: strings.Builder
  36. defer strings.builder_destroy(&generated_buf)
  37. w := strings.to_writer(&generated_buf)
  38. charlist_id, charlist_ok := xml.find_child_by_ident(doc, 0, "charlist")
  39. if !charlist_ok {
  40. fmt.eprintln("Could not locate top-level `<charlist>` tag.")
  41. os.exit(1)
  42. }
  43. charlist := doc.elements[charlist_id]
  44. fmt.printf("Found `<charlist>` with %v children.\n", len(charlist.value))
  45. entity_map: map[string]Entity
  46. names: [dynamic]string
  47. min_name_length := max(int)
  48. max_name_length := min(int)
  49. shortest_name: string
  50. longest_name: string
  51. count := 0
  52. for char_id in charlist.value {
  53. id := char_id.(xml.Element_ID)
  54. char := doc.elements[id]
  55. if char.ident != "character" {
  56. fmt.eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
  57. os.exit(1)
  58. }
  59. if codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec"); !ok {
  60. fmt.eprintln("`<character id=\"...\">` attribute not found.")
  61. os.exit(1)
  62. } else {
  63. codepoint := strconv.atoi(codepoint_string)
  64. desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
  65. description := ""
  66. if len(doc.elements[desc].value) == 1 {
  67. description = doc.elements[desc].value[0].(string)
  68. }
  69. /*
  70. For us to be interested in this codepoint, it has to have at least one entity.
  71. */
  72. nth := 0
  73. for {
  74. character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
  75. nth += 1
  76. name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
  77. if len(name) == 0 {
  78. /*
  79. Invalid name. Skip.
  80. */
  81. continue
  82. }
  83. if name == "\"\"" {
  84. fmt.printf("%#v\n", char)
  85. fmt.printf("%#v\n", character_entity)
  86. }
  87. if len(name) > max_name_length { longest_name = name }
  88. if len(name) < min_name_length { shortest_name = name }
  89. min_name_length = min(min_name_length, len(name))
  90. max_name_length = max(max_name_length, len(name))
  91. e := Entity{
  92. name = name,
  93. codepoint = rune(codepoint),
  94. description = description,
  95. }
  96. if name in entity_map {
  97. continue
  98. }
  99. entity_map[name] = e
  100. append(&names, name)
  101. count += 1
  102. }
  103. }
  104. }
  105. /*
  106. Sort by name.
  107. */
  108. slice.sort(names[:])
  109. fmt.printf("Found %v unique `&name;` -> rune mappings.\n", count)
  110. fmt.printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
  111. fmt.printf("Longest name: %v (%v)\n", longest_name, max_name_length)
  112. /*
  113. Generate table.
  114. */
  115. fmt.wprintln(w, "package encoding_unicode_entity")
  116. fmt.wprintln(w, "")
  117. fmt.wprintln(w, GENERATED)
  118. fmt.wprintln(w, "")
  119. fmt.wprintf (w, TABLE_FILE_PROLOG)
  120. fmt.wprintln(w, "")
  121. fmt.wprintf (w, "// `&%v;`\n", shortest_name)
  122. fmt.wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
  123. fmt.wprintf (w, "// `&%v;`\n", longest_name)
  124. fmt.wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
  125. fmt.wprintln(w, "")
  126. fmt.wprintln(w,
  127. `
  128. /*
  129. Input:
  130. entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.
  131. Output:
  132. "decoded" - The decoded rune if found by name, or -1 otherwise.
  133. "ok" - true if found, false if not.
  134. IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
  135. */
  136. named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
  137. /*
  138. Early out if the name is too short or too long.
  139. min as a precaution in case the generated table has a bogus value.
  140. */
  141. if len(name) < min(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
  142. return -1, false
  143. }
  144. switch rune(name[0]) {
  145. `)
  146. prefix := '?'
  147. should_close := false
  148. for v in names {
  149. if rune(v[0]) != prefix {
  150. if should_close {
  151. fmt.wprintln(w, "\t\t}\n")
  152. }
  153. prefix = rune(v[0])
  154. fmt.wprintf (w, "\tcase '%v':\n", prefix)
  155. fmt.wprintln(w, "\t\tswitch name {")
  156. }
  157. e := entity_map[v]
  158. fmt.wprintf(w, "\t\tcase \"%v\":", e.name)
  159. for i := len(e.name); i < max_name_length; i += 1 {
  160. fmt.wprintf(w, " ")
  161. }
  162. fmt.wprintf(w, " // %v\n", e.description)
  163. fmt.wprintf(w, "\t\t\treturn %v, true\n", rune_to_string(e.codepoint))
  164. should_close = true
  165. }
  166. fmt.wprintln(w, "\t\t}")
  167. fmt.wprintln(w, "\t}")
  168. fmt.wprintln(w, "\treturn -1, false")
  169. fmt.wprintln(w, "}\n")
  170. fmt.wprintln(w, GENERATED)
  171. fmt.println()
  172. fmt.println(strings.to_string(generated_buf))
  173. fmt.println()
  174. written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))
  175. if written {
  176. fmt.printf("Successfully written generated \"%v\".\n", generated_filename)
  177. } else {
  178. fmt.printf("Failed to write generated \"%v\".\n", generated_filename)
  179. }
  180. delete(entity_map)
  181. delete(names)
  182. for &name in names {
  183. free(&name)
  184. }
  185. }
  186. GENERATED :: `/*
  187. ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
  188. */`
  189. TABLE_FILE_PROLOG :: `/*
  190. This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
  191. UPDATE:
  192. - Ensure the XML file was downloaded using "tests\core\download_assets.py".
  193. - Run "core/unicode/tools/generate_entity_table.odin"
  194. Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
  195. Copyright David Carlisle 1999-2023
  196. Use and distribution of this code are permitted under the terms of the
  197. W3C Software Notice and License.
  198. http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
  199. This file is a collection of information about how to map
  200. Unicode entities to LaTeX, and various SGML/XML entity
  201. sets (ISO and MathML/HTML). A Unicode character may be mapped
  202. to several entities.
  203. Originally designed by Sebastian Rahtz in conjunction with
  204. Barbara Beeton for the STIX project
  205. See also: LICENSE_table.md
  206. */
  207. `
  208. rune_to_string :: proc(r: rune) -> (res: string) {
  209. res = fmt.tprintf("%08x", int(r))
  210. for len(res) > 2 && res[:2] == "00" {
  211. res = res[2:]
  212. }
  213. return fmt.tprintf("rune(0x%v)", res)
  214. }
  215. is_dotted_name :: proc(name: string) -> (dotted: bool) {
  216. for r in name {
  217. if r == '.' { return true}
  218. }
  219. return false
  220. }
  221. main :: proc() {
  222. track: mem.Tracking_Allocator
  223. mem.tracking_allocator_init(&track, context.allocator)
  224. context.allocator = mem.tracking_allocator(&track)
  225. generate_encoding_entity_table()
  226. if len(track.allocation_map) > 0 {
  227. fmt.println()
  228. for _, v in track.allocation_map {
  229. fmt.printf("%v Leaked %v bytes.\n", v.location, v.size)
  230. }
  231. }
  232. fmt.println("Done and cleaned up!")
  233. }