grapheme.odin 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. package utf8
  2. import "core:unicode"
  3. ZERO_WIDTH_JOINER :: unicode.ZERO_WIDTH_JOINER
  4. is_control :: unicode.is_control
  5. is_hangul_syllable_leading :: unicode.is_hangul_syllable_leading
  6. is_hangul_syllable_vowel :: unicode.is_hangul_syllable_vowel
  7. is_hangul_syllable_trailing :: unicode.is_hangul_syllable_trailing
  8. is_hangul_syllable_lv :: unicode.is_hangul_syllable_lv
  9. is_hangul_syllable_lvt :: unicode.is_hangul_syllable_lvt
  10. is_indic_conjunct_break_extend :: unicode.is_indic_conjunct_break_extend
  11. is_indic_conjunct_break_linker :: unicode.is_indic_conjunct_break_linker
  12. is_indic_conjunct_break_consonant :: unicode.is_indic_conjunct_break_consonant
  13. is_gcb_extend_class :: unicode.is_gcb_extend_class
  14. is_spacing_mark :: unicode.is_spacing_mark
  15. is_gcb_prepend_class :: unicode.is_gcb_prepend_class
  16. is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
  17. is_regional_indicator :: unicode.is_regional_indicator
  18. normalized_east_asian_width :: unicode.normalized_east_asian_width
  19. Grapheme :: struct {
  20. byte_index: int,
  21. rune_index: int,
  22. width: int,
  23. }
  24. /*
  25. Count the individual graphemes in a UTF-8 string.
  26. Inputs:
  27. - str: The input string.
  28. Returns:
  29. - graphemes: The number of graphemes in the string.
  30. - runes: The number of runes in the string.
  31. - width: The width of the string in number of monospace cells.
  32. */
  33. @(require_results)
  34. grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
  35. _, graphemes, runes, width = decode_grapheme_clusters(str, false)
  36. return
  37. }
  38. /*
  39. Decode the individual graphemes in a UTF-8 string.
  40. *Allocates Using Provided Allocator*
  41. Inputs:
  42. - str: The input string.
  43. - track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
  44. - allocator: (default: context.allocator)
  45. Returns:
  46. - graphemes: Extra data about each grapheme.
  47. - grapheme_count: The number of graphemes in the string.
  48. - rune_count: The number of runes in the string.
  49. - width: The width of the string in number of monospace cells.
  50. */
  51. @(require_results)
  52. decode_grapheme_clusters :: proc(
  53. str: string,
  54. track_graphemes := true,
  55. allocator := context.allocator,
  56. ) -> (
  57. graphemes: [dynamic]Grapheme,
  58. grapheme_count: int,
  59. rune_count: int,
  60. width: int,
  61. ) {
  62. // The following procedure implements text segmentation by breaking on
  63. // Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
  64. // the Unicode® Standard Annex #29, entitled:
  65. //
  66. // UNICODE TEXT SEGMENTATION
  67. //
  68. // Version: Unicode 15.1.0
  69. // Date: 2023-08-16
  70. // Revision: 43
  71. //
  72. // This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
  73. // extended, non-legacy ruleset.
  74. //
  75. // Please see the references below for more information.
  76. //
  77. //
  78. // NOTE(Feoramund): This procedure has not been highly optimized.
  79. // A couple opportunities were taken to bypass repeated checking when a
  80. // rune is outside of certain codepoint ranges, but little else has been
  81. // done. Standard switches, conditionals, and binary search are used to
  82. // see if a rune fits into a certain category.
  83. //
  84. // I did find that only one prior rune of state was necessary to build an
  85. // algorithm that successfully passes all 4,835 test cases provided with
  86. // this implementation from the Unicode organization's website.
  87. //
  88. // My initial implementation tracked explicit breaks and counted them once
  89. // the string iteration had terminated. I've found this current
  90. // implementation to be far simpler and need no allocations (unless the
  91. // caller wants position data).
  92. //
  93. // Most rules work backwards instead of forwards which has helped keep this
  94. // simple, despite its length and verbosity.
  95. //
  96. //
  97. // The implementation has been left verbose and in the order described by
  98. // the specification, to enable better readability and future upkeep.
  99. //
  100. // Some possible optimizations might include:
  101. //
  102. // - saving the type of `last_rune` instead of the exact rune.
  103. // - reordering rules.
  104. // - combining tables.
  105. //
  106. //
  107. // [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
  108. // [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
  109. // [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
  110. // [4]: https://www.unicode.org/reports/tr29/#Conformance
  111. // Additionally, this procedure now takes into account Standard Annex #11,
  112. // in order to estimate how visually wide the string will appear on a
  113. // monospaced display. This can only ever be a rough guess, as this tends
  114. // to be an implementation detail relating to which fonts are being used,
  115. // how codepoints are interpreted and drawn, if codepoint sequences are
  116. // interpreted correctly, and et cetera.
  117. //
  118. // For example, a program may not properly interpret an emoji modifier
  119. // sequence and print the component glyphs instead of one whole glyph.
  120. //
  121. // See here for more information: https://www.unicode.org/reports/tr11/
  122. //
  123. // NOTE: There is no explicit mention of what to do with zero-width spaces
  124. // as far as grapheme cluster segmentation goes, therefore this
  125. // implementation may count and return graphemes with a `width` of zero.
  126. //
  127. // Treat them as any other space.
  128. Grapheme_Cluster_Sequence :: enum {
  129. None,
  130. Indic,
  131. Emoji,
  132. Regional,
  133. }
  134. context.allocator = allocator
  135. last_rune: rune
  136. last_rune_breaks_forward: bool
  137. last_width: int
  138. last_grapheme_count: int
  139. bypass_next_rune: bool
  140. regional_indicator_counter: int
  141. current_sequence: Grapheme_Cluster_Sequence
  142. continue_sequence: bool
  143. for this_rune, byte_index in str {
  144. defer {
  145. // "Break at the start and end of text, unless the text is empty."
  146. //
  147. // GB1: sot ÷ Any
  148. // GB2: Any ÷ eot
  149. if rune_count == 0 && grapheme_count == 0 {
  150. grapheme_count += 1
  151. }
  152. if grapheme_count > last_grapheme_count {
  153. width += normalized_east_asian_width(this_rune)
  154. if track_graphemes {
  155. append(&graphemes, Grapheme{
  156. byte_index,
  157. rune_count,
  158. width - last_width,
  159. })
  160. }
  161. last_grapheme_count = grapheme_count
  162. last_width = width
  163. }
  164. last_rune = this_rune
  165. rune_count += 1
  166. if !continue_sequence {
  167. current_sequence = .None
  168. regional_indicator_counter = 0
  169. }
  170. continue_sequence = false
  171. }
  172. // "Do not break between a CR and LF. Otherwise, break before and after controls."
  173. //
  174. // GB3: CR × LF
  175. // GB4: (Control | CR | LF) ÷
  176. // GB5: ÷ (Control | CR | LF)
  177. if this_rune == '\n' && last_rune == '\r' {
  178. last_rune_breaks_forward = false
  179. bypass_next_rune = false
  180. continue
  181. }
  182. if is_control(this_rune) {
  183. grapheme_count += 1
  184. last_rune_breaks_forward = true
  185. bypass_next_rune = true
  186. continue
  187. }
  188. // (This check is for rules that work forwards, instead of backwards.)
  189. if bypass_next_rune {
  190. if last_rune_breaks_forward {
  191. grapheme_count += 1
  192. last_rune_breaks_forward = false
  193. }
  194. bypass_next_rune = false
  195. continue
  196. }
  197. // (Optimization 1: Prevent low runes from proceeding further.)
  198. //
  199. // * 0xA9 and 0xAE are in the Extended_Pictographic range,
  200. // which is checked later in GB11.
  201. if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
  202. grapheme_count += 1
  203. continue
  204. }
  205. // (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
  206. if 0x1100 <= this_rune && this_rune <= 0xD7FB {
  207. // "Do not break Hangul syllable sequences."
  208. //
  209. // GB6: L × (L | V | LV | LVT)
  210. // GB7: (LV | V) × (V | T)
  211. // GB8: (LVT | T) × T
  212. if is_hangul_syllable_leading(this_rune) ||
  213. is_hangul_syllable_lv(this_rune) ||
  214. is_hangul_syllable_lvt(this_rune) {
  215. if !is_hangul_syllable_leading(last_rune) {
  216. grapheme_count += 1
  217. }
  218. continue
  219. }
  220. if is_hangul_syllable_vowel(this_rune) {
  221. if is_hangul_syllable_leading(last_rune) ||
  222. is_hangul_syllable_vowel(last_rune) ||
  223. is_hangul_syllable_lv(last_rune) {
  224. continue
  225. }
  226. grapheme_count += 1
  227. continue
  228. }
  229. if is_hangul_syllable_trailing(this_rune) {
  230. if is_hangul_syllable_trailing(last_rune) ||
  231. is_hangul_syllable_lvt(last_rune) ||
  232. is_hangul_syllable_lv(last_rune) ||
  233. is_hangul_syllable_vowel(last_rune) {
  234. continue
  235. }
  236. grapheme_count += 1
  237. continue
  238. }
  239. }
  240. // "Do not break before extending characters or ZWJ."
  241. //
  242. // GB9: × (Extend | ZWJ)
  243. if this_rune == ZERO_WIDTH_JOINER {
  244. continue_sequence = true
  245. continue
  246. }
  247. if is_gcb_extend_class(this_rune) {
  248. // (Support for GB9c.)
  249. if current_sequence == .Indic {
  250. if is_indic_conjunct_break_extend(this_rune) && (
  251. is_indic_conjunct_break_linker(last_rune) ||
  252. is_indic_conjunct_break_consonant(last_rune) ) {
  253. continue_sequence = true
  254. continue
  255. }
  256. if is_indic_conjunct_break_linker(this_rune) && (
  257. is_indic_conjunct_break_linker(last_rune) ||
  258. is_indic_conjunct_break_extend(last_rune) ||
  259. is_indic_conjunct_break_consonant(last_rune) ) {
  260. continue_sequence = true
  261. continue
  262. }
  263. continue
  264. }
  265. // (Support for GB11.)
  266. if current_sequence == .Emoji && (
  267. is_gcb_extend_class(last_rune) ||
  268. is_emoji_extended_pictographic(last_rune) ) {
  269. continue_sequence = true
  270. }
  271. continue
  272. }
  273. // _The GB9a and GB9b rules only apply to extended grapheme clusters:_
  274. // "Do not break before SpacingMarks, or after Prepend characters."
  275. //
  276. // GB9a: × SpacingMark
  277. // GB9b: Prepend ×
  278. if is_spacing_mark(this_rune) {
  279. continue
  280. }
  281. if is_gcb_prepend_class(this_rune) {
  282. grapheme_count += 1
  283. bypass_next_rune = true
  284. continue
  285. }
  286. // _The GB9c rule only applies to extended grapheme clusters:_
  287. // "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
  288. //
  289. // GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
  290. if is_indic_conjunct_break_consonant(this_rune) {
  291. if current_sequence == .Indic {
  292. if last_rune == ZERO_WIDTH_JOINER ||
  293. is_indic_conjunct_break_linker(last_rune) {
  294. continue_sequence = true
  295. } else {
  296. grapheme_count += 1
  297. }
  298. } else {
  299. grapheme_count += 1
  300. current_sequence = .Indic
  301. continue_sequence = true
  302. }
  303. continue
  304. }
  305. if is_indic_conjunct_break_extend(this_rune) {
  306. if current_sequence == .Indic {
  307. if is_indic_conjunct_break_consonant(last_rune) ||
  308. is_indic_conjunct_break_linker(last_rune) {
  309. continue_sequence = true
  310. } else {
  311. grapheme_count += 1
  312. }
  313. }
  314. continue
  315. }
  316. if is_indic_conjunct_break_linker(this_rune) {
  317. if current_sequence == .Indic {
  318. if is_indic_conjunct_break_extend(last_rune) ||
  319. is_indic_conjunct_break_linker(last_rune) {
  320. continue_sequence = true
  321. } else {
  322. grapheme_count += 1
  323. }
  324. }
  325. continue
  326. }
  327. //
  328. // (Curiously, there is no GB10.)
  329. //
  330. // "Do not break within emoji modifier sequences or emoji zwj sequences."
  331. //
  332. // GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
  333. if is_emoji_extended_pictographic(this_rune) {
  334. if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
  335. grapheme_count += 1
  336. }
  337. current_sequence = .Emoji
  338. continue_sequence = true
  339. continue
  340. }
  341. // "Do not break within emoji flag sequences.
  342. // That is, do not break between regional indicator (RI) symbols
  343. // if there is an odd number of RI characters before the break point."
  344. //
  345. // GB12: sot (RI RI)* RI × RI
  346. // GB13: [^RI] (RI RI)* RI × RI
  347. if is_regional_indicator(this_rune) {
  348. if regional_indicator_counter & 1 == 0 {
  349. grapheme_count += 1
  350. }
  351. current_sequence = .Regional
  352. continue_sequence = true
  353. regional_indicator_counter += 1
  354. continue
  355. }
  356. // "Otherwise, break everywhere."
  357. //
  358. // GB999: Any ÷ Any
  359. grapheme_count += 1
  360. }
  361. return
  362. }