123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411 |
- package utf8
- import "core:unicode"
- ZERO_WIDTH_JOINER :: unicode.ZERO_WIDTH_JOINER
- is_control :: unicode.is_control
- is_hangul_syllable_leading :: unicode.is_hangul_syllable_leading
- is_hangul_syllable_vowel :: unicode.is_hangul_syllable_vowel
- is_hangul_syllable_trailing :: unicode.is_hangul_syllable_trailing
- is_hangul_syllable_lv :: unicode.is_hangul_syllable_lv
- is_hangul_syllable_lvt :: unicode.is_hangul_syllable_lvt
- is_indic_conjunct_break_extend :: unicode.is_indic_conjunct_break_extend
- is_indic_conjunct_break_linker :: unicode.is_indic_conjunct_break_linker
- is_indic_conjunct_break_consonant :: unicode.is_indic_conjunct_break_consonant
- is_gcb_extend_class :: unicode.is_gcb_extend_class
- is_spacing_mark :: unicode.is_spacing_mark
- is_gcb_prepend_class :: unicode.is_gcb_prepend_class
- is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
- is_regional_indicator :: unicode.is_regional_indicator
- normalized_east_asian_width :: unicode.normalized_east_asian_width
- Grapheme :: struct {
- byte_index: int,
- rune_index: int,
- width: int,
- }
- /*
- Count the individual graphemes in a UTF-8 string.
- Inputs:
- - str: The input string.
- Returns:
- - graphemes: The number of graphemes in the string.
- - runes: The number of runes in the string.
- - width: The width of the string in number of monospace cells.
- */
- @(require_results)
- grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
- _, graphemes, runes, width = decode_grapheme_clusters(str, false)
- return
- }
- /*
- Decode the individual graphemes in a UTF-8 string.
- *Allocates Using Provided Allocator*
- Inputs:
- - str: The input string.
- - track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
- - allocator: (default: context.allocator)
- Returns:
- - graphemes: Extra data about each grapheme.
- - grapheme_count: The number of graphemes in the string.
- - rune_count: The number of runes in the string.
- - width: The width of the string in number of monospace cells.
- */
- @(require_results)
- decode_grapheme_clusters :: proc(
- str: string,
- track_graphemes := true,
- allocator := context.allocator,
- ) -> (
- graphemes: [dynamic]Grapheme,
- grapheme_count: int,
- rune_count: int,
- width: int,
- ) {
- // The following procedure implements text segmentation by breaking on
- // Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
- // the Unicode® Standard Annex #29, entitled:
- //
- // UNICODE TEXT SEGMENTATION
- //
- // Version: Unicode 15.1.0
- // Date: 2023-08-16
- // Revision: 43
- //
- // This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
- // extended, non-legacy ruleset.
- //
- // Please see the references below for more information.
- //
- //
- // NOTE(Feoramund): This procedure has not been highly optimized.
- // A couple opportunities were taken to bypass repeated checking when a
- // rune is outside of certain codepoint ranges, but little else has been
- // done. Standard switches, conditionals, and binary search are used to
- // see if a rune fits into a certain category.
- //
- // I did find that only one prior rune of state was necessary to build an
- // algorithm that successfully passes all 4,835 test cases provided with
- // this implementation from the Unicode organization's website.
- //
- // My initial implementation tracked explicit breaks and counted them once
- // the string iteration had terminated. I've found this current
- // implementation to be far simpler and need no allocations (unless the
- // caller wants position data).
- //
- // Most rules work backwards instead of forwards which has helped keep this
- // simple, despite its length and verbosity.
- //
- //
- // The implementation has been left verbose and in the order described by
- // the specification, to enable better readability and future upkeep.
- //
- // Some possible optimizations might include:
- //
- // - saving the type of `last_rune` instead of the exact rune.
- // - reordering rules.
- // - combining tables.
- //
- //
- // [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
- // [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
- // [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- // [4]: https://www.unicode.org/reports/tr29/#Conformance
- // Additionally, this procedure now takes into account Standard Annex #11,
- // in order to estimate how visually wide the string will appear on a
- // monospaced display. This can only ever be a rough guess, as this tends
- // to be an implementation detail relating to which fonts are being used,
- // how codepoints are interpreted and drawn, if codepoint sequences are
- // interpreted correctly, and et cetera.
- //
- // For example, a program may not properly interpret an emoji modifier
- // sequence and print the component glyphs instead of one whole glyph.
- //
- // See here for more information: https://www.unicode.org/reports/tr11/
- //
- // NOTE: There is no explicit mention of what to do with zero-width spaces
- // as far as grapheme cluster segmentation goes, therefore this
- // implementation may count and return graphemes with a `width` of zero.
- //
- // Treat them as any other space.
- Grapheme_Cluster_Sequence :: enum {
- None,
- Indic,
- Emoji,
- Regional,
- }
- context.allocator = allocator
- last_rune: rune
- last_rune_breaks_forward: bool
- last_width: int
- last_grapheme_count: int
- bypass_next_rune: bool
- regional_indicator_counter: int
- current_sequence: Grapheme_Cluster_Sequence
- continue_sequence: bool
- for this_rune, byte_index in str {
- defer {
- // "Break at the start and end of text, unless the text is empty."
- //
- // GB1: sot ÷ Any
- // GB2: Any ÷ eot
- if rune_count == 0 && grapheme_count == 0 {
- grapheme_count += 1
- }
- if grapheme_count > last_grapheme_count {
- width += normalized_east_asian_width(this_rune)
- if track_graphemes {
- append(&graphemes, Grapheme{
- byte_index,
- rune_count,
- width - last_width,
- })
- }
- last_grapheme_count = grapheme_count
- last_width = width
- }
- last_rune = this_rune
- rune_count += 1
- if !continue_sequence {
- current_sequence = .None
- regional_indicator_counter = 0
- }
- continue_sequence = false
- }
- // "Do not break between a CR and LF. Otherwise, break before and after controls."
- //
- // GB3: CR × LF
- // GB4: (Control | CR | LF) ÷
- // GB5: ÷ (Control | CR | LF)
- if this_rune == '\n' && last_rune == '\r' {
- last_rune_breaks_forward = false
- bypass_next_rune = false
- continue
- }
- if is_control(this_rune) {
- grapheme_count += 1
- last_rune_breaks_forward = true
- bypass_next_rune = true
- continue
- }
- // (This check is for rules that work forwards, instead of backwards.)
- if bypass_next_rune {
- if last_rune_breaks_forward {
- grapheme_count += 1
- last_rune_breaks_forward = false
- }
- bypass_next_rune = false
- continue
- }
- // (Optimization 1: Prevent low runes from proceeding further.)
- //
- // * 0xA9 and 0xAE are in the Extended_Pictographic range,
- // which is checked later in GB11.
- if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
- grapheme_count += 1
- continue
- }
- // (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
- if 0x1100 <= this_rune && this_rune <= 0xD7FB {
- // "Do not break Hangul syllable sequences."
- //
- // GB6: L × (L | V | LV | LVT)
- // GB7: (LV | V) × (V | T)
- // GB8: (LVT | T) × T
- if is_hangul_syllable_leading(this_rune) ||
- is_hangul_syllable_lv(this_rune) ||
- is_hangul_syllable_lvt(this_rune) {
- if !is_hangul_syllable_leading(last_rune) {
- grapheme_count += 1
- }
- continue
- }
- if is_hangul_syllable_vowel(this_rune) {
- if is_hangul_syllable_leading(last_rune) ||
- is_hangul_syllable_vowel(last_rune) ||
- is_hangul_syllable_lv(last_rune) {
- continue
- }
- grapheme_count += 1
- continue
- }
- if is_hangul_syllable_trailing(this_rune) {
- if is_hangul_syllable_trailing(last_rune) ||
- is_hangul_syllable_lvt(last_rune) ||
- is_hangul_syllable_lv(last_rune) ||
- is_hangul_syllable_vowel(last_rune) {
- continue
- }
- grapheme_count += 1
- continue
- }
- }
- // "Do not break before extending characters or ZWJ."
- //
- // GB9: × (Extend | ZWJ)
- if this_rune == ZERO_WIDTH_JOINER {
- continue_sequence = true
- continue
- }
- if is_gcb_extend_class(this_rune) {
- // (Support for GB9c.)
- if current_sequence == .Indic {
- if is_indic_conjunct_break_extend(this_rune) && (
- is_indic_conjunct_break_linker(last_rune) ||
- is_indic_conjunct_break_consonant(last_rune) ) {
- continue_sequence = true
- continue
- }
- if is_indic_conjunct_break_linker(this_rune) && (
- is_indic_conjunct_break_linker(last_rune) ||
- is_indic_conjunct_break_extend(last_rune) ||
- is_indic_conjunct_break_consonant(last_rune) ) {
- continue_sequence = true
- continue
- }
- continue
- }
- // (Support for GB11.)
- if current_sequence == .Emoji && (
- is_gcb_extend_class(last_rune) ||
- is_emoji_extended_pictographic(last_rune) ) {
- continue_sequence = true
- }
- continue
- }
- // _The GB9a and GB9b rules only apply to extended grapheme clusters:_
- // "Do not break before SpacingMarks, or after Prepend characters."
- //
- // GB9a: × SpacingMark
- // GB9b: Prepend ×
- if is_spacing_mark(this_rune) {
- continue
- }
- if is_gcb_prepend_class(this_rune) {
- grapheme_count += 1
- bypass_next_rune = true
- continue
- }
- // _The GB9c rule only applies to extended grapheme clusters:_
- // "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
- //
- // GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
- if is_indic_conjunct_break_consonant(this_rune) {
- if current_sequence == .Indic {
- if last_rune == ZERO_WIDTH_JOINER ||
- is_indic_conjunct_break_linker(last_rune) {
- continue_sequence = true
- } else {
- grapheme_count += 1
- }
- } else {
- grapheme_count += 1
- current_sequence = .Indic
- continue_sequence = true
- }
- continue
- }
- if is_indic_conjunct_break_extend(this_rune) {
- if current_sequence == .Indic {
- if is_indic_conjunct_break_consonant(last_rune) ||
- is_indic_conjunct_break_linker(last_rune) {
- continue_sequence = true
- } else {
- grapheme_count += 1
- }
- }
- continue
- }
- if is_indic_conjunct_break_linker(this_rune) {
- if current_sequence == .Indic {
- if is_indic_conjunct_break_extend(last_rune) ||
- is_indic_conjunct_break_linker(last_rune) {
- continue_sequence = true
- } else {
- grapheme_count += 1
- }
- }
- continue
- }
- //
- // (Curiously, there is no GB10.)
- //
- // "Do not break within emoji modifier sequences or emoji zwj sequences."
- //
- // GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
- if is_emoji_extended_pictographic(this_rune) {
- if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
- grapheme_count += 1
- }
- current_sequence = .Emoji
- continue_sequence = true
- continue
- }
- // "Do not break within emoji flag sequences.
- // That is, do not break between regional indicator (RI) symbols
- // if there is an odd number of RI characters before the break point."
- //
- // GB12: sot (RI RI)* RI × RI
- // GB13: [^RI] (RI RI)* RI × RI
- if is_regional_indicator(this_rune) {
- if regional_indicator_counter & 1 == 0 {
- grapheme_count += 1
- }
- current_sequence = .Regional
- continue_sequence = true
- regional_indicator_counter += 1
- continue
- }
- // "Otherwise, break everywhere."
- //
- // GB999: Any ÷ Any
- grapheme_count += 1
- }
- return
- }
|