1 year ago · e620645a03
--- a/core/unicode/letter.odin
+++ b/core/unicode/letter.odin
@@ -5,8 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd'     // Represented an invalid code point
 
				 MAX_ASCII        :: '\u007f'     // Maximum ASCII value
			
 
				 MAX_LATIN1       :: '\u00ff'     // Maximum Latin-1 value
			
 
				 
			
 
				+// Code points below are reported as occupying 0 cells by normalized_east_asian_width.
				+ZERO_WIDTH_SPACE      :: '\u200B' // U+200B
			
 
				 ZERO_WIDTH_NON_JOINER :: '\u200C' // U+200C
			
 
				 ZERO_WIDTH_JOINER     :: '\u200D' // U+200D
			
 
				+WORD_JOINER           :: '\u2060' // U+2060
			
 
				 
			
 
				 @(require_results)
			
 
				 binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
			
@@ -450,6 +452,41 @@ is_gcb_extend_class :: proc(r: rune) -> bool {
 
				 	return is_grapheme_extend(r) || is_emoji_modifier(r)
			
 
				 }
			
 
				 
			
 
				+// Estimates how many monospace cells a single rune occupies.
				+//
				+// Results:
				+//
				+// - 0 for control runes and the zero-width runes handled below,
				+// - 2 for runes the range table marks as East_Asian_Width=F or W,
				+// - 1 for everything else.
				+//
				+@(require_results)
				+normalized_east_asian_width :: proc(r: rune) -> int {
				+	// U+FEFF is also the BOM; in the middle of text it reads as a zero-width no-break space.
				+	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'
				+
				+	if is_control(r) {
				+		return 0
				+	}
				+	if r <= 0x10FF {
				+		// The first table range (0x0000..0x10FF) is all width 1, so skip the lookup.
				+		return 1
				+	}
				+
				+	switch r {
				+	case ZERO_WIDTH_SPACE, ZERO_WIDTH_NON_JOINER, ZERO_WIDTH_JOINER,
				+	     WORD_JOINER, ZERO_WIDTH_NO_BREAK_SPACE:
				+		return 0
				+	}
				+
				+	// The table is a flat list of (first, last, width) triples, hence the stride of 3.
				+	ranges := normalized_east_asian_width_ranges[:]
				+	code   := i32(r)
				+	idx    := binary_search(code, ranges, len(ranges)/3, 3)
				+	if idx < 0 || code < ranges[idx] || ranges[idx+1] < code {
				+		// Not covered by any range: fall back to a single cell.
				+		return 1
				+	}
				+	return int(ranges[idx+2])
				+}
			
 
				+
			
 
				 //
			
 
				 // End of Unicode 15.1.0 block.
			
 
				 //
			
--- a/core/unicode/tables.odin
+++ b/core/unicode/tables.odin
@@ -3716,6 +3716,177 @@ indic_conjunct_break_extend_ranges := [?]i32 {
 
				 	0x1E944, 0x1E94A,
			
 
				 }
			
 
				 
			
 
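				+// Sorted code point ranges stored as flat triples:
				+// first code point, last code point (inclusive), width in cells.
				+//
				+// Fullwidth (F) and Wide (W) are counted as 2.
				+// Everything else is 1, including code points that fall in a gap between ranges.
				+//
				+// Derived from: https://unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt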
			
 
				+@(rodata)
			
 
				+normalized_east_asian_width_ranges := [?]i32 {
			
 
				+	0x0000, 0x10FF, 1,
			
 
				+	0x1100, 0x115F, 2,
			
 
				+	0x1160, 0x2319, 1,
			
 
				+	0x231A, 0x231B, 2,
			
 
				+	0x231C, 0x2328, 1,
			
 
				+	0x2329, 0x232A, 2,
			
 
				+	0x232B, 0x23E8, 1,
			
 
				+	0x23E9, 0x23EC, 2,
			
 
				+	0x23ED, 0x23EF, 1,
			
 
				+	0x23F0, 0x23F0, 2,
			
 
				+	0x23F1, 0x23F2, 1,
			
 
				+	0x23F3, 0x23F3, 2,
			
 
				+	0x23F4, 0x25FC, 1,
			
 
				+	0x25FD, 0x25FE, 2,
			
 
				+	0x25FF, 0x2613, 1,
			
 
				+	0x2614, 0x2615, 2,
			
 
				+	0x2616, 0x2647, 1,
			
 
				+	0x2648, 0x2653, 2,
			
 
				+	0x2654, 0x267E, 1,
			
 
				+	0x267F, 0x267F, 2,
			
 
				+	0x2680, 0x2692, 1,
			
 
				+	0x2693, 0x2693, 2,
			
 
				+	0x2694, 0x26A0, 1,
			
 
				+	0x26A1, 0x26A1, 2,
			
 
				+	0x26A2, 0x26A9, 1,
			
 
				+	0x26AA, 0x26AB, 2,
			
 
				+	0x26AC, 0x26BC, 1,
			
 
				+	0x26BD, 0x26BE, 2,
			
 
				+	0x26BF, 0x26C3, 1,
			
 
				+	0x26C4, 0x26C5, 2,
			
 
				+	0x26C6, 0x26CD, 1,
			
 
				+	0x26CE, 0x26CE, 2,
			
 
				+	0x26CF, 0x26D3, 1,
			
 
				+	0x26D4, 0x26D4, 2,
			
 
				+	0x26D5, 0x26E9, 1,
			
 
				+	0x26EA, 0x26EA, 2,
			
 
				+	0x26EB, 0x26F1, 1,
			
 
				+	0x26F2, 0x26F3, 2,
			
 
				+	0x26F4, 0x26F4, 1,
			
 
				+	0x26F5, 0x26F5, 2,
			
 
				+	0x26F6, 0x26F9, 1,
			
 
				+	0x26FA, 0x26FA, 2,
			
 
				+	0x26FB, 0x26FC, 1,
			
 
				+	0x26FD, 0x26FD, 2,
			
 
				+	0x26FE, 0x2704, 1,
			
 
				+	0x2705, 0x2705, 2,
			
 
				+	0x2706, 0x2709, 1,
			
 
				+	0x270A, 0x270B, 2,
			
 
				+	0x270C, 0x2727, 1,
			
 
				+	0x2728, 0x2728, 2,
			
 
				+	0x2729, 0x274B, 1,
			
 
				+	0x274C, 0x274C, 2,
			
 
				+	0x274D, 0x274D, 1,
			
 
				+	0x274E, 0x274E, 2,
			
 
				+	0x274F, 0x2752, 1,
			
 
				+	0x2753, 0x2755, 2,
			
 
				+	0x2756, 0x2756, 1,
			
 
				+	0x2757, 0x2757, 2,
			
 
				+	0x2758, 0x2794, 1,
			
 
				+	0x2795, 0x2797, 2,
			
 
				+	0x2798, 0x27AF, 1,
			
 
				+	0x27B0, 0x27B0, 2,
			
 
				+	0x27B1, 0x27BE, 1,
			
 
				+	0x27BF, 0x27BF, 2,
			
 
				+	0x27C0, 0x2B1A, 1,
			
 
				+	0x2B1B, 0x2B1C, 2,
			
 
				+	0x2B1D, 0x2B4F, 1,
			
 
				+	0x2B50, 0x2B50, 2,
			
 
				+	0x2B51, 0x2B54, 1,
			
 
				+	0x2B55, 0x2B55, 2,
			
 
				+	0x2B56, 0x2E5D, 1,
			
 
				+	0x2E80, 0x303E, 2,
			
 
				+	0x303F, 0x303F, 1,
			
 
				+	0x3041, 0x3247, 2,
			
 
				+	0x3248, 0x324F, 1,
			
 
				+	0x3250, 0x4DBF, 2,
			
 
				+	0x4DC0, 0x4DFF, 1,
			
 
				+	0x4E00, 0xA4C6, 2,
			
 
				+	0xA4D0, 0xA95F, 1,
			
 
				+	0xA960, 0xA97C, 2,
			
 
				+	0xA980, 0xABF9, 1,
			
 
				+	0xAC00, 0xD7A3, 2,
			
 
				+	0xD7B0, 0xF8FF, 1,
			
 
				+	0xF900, 0xFAFF, 2,
			
 
				+	0xFB00, 0xFE0F, 1,
			
 
				+	0xFE10, 0xFE19, 2,
			
 
				+	0xFE20, 0xFE2F, 1,
			
 
				+	0xFE30, 0xFE6B, 2,
			
 
				+	0xFE70, 0xFEFF, 1,
			
 
				+	0xFF01, 0xFF60, 2,
			
 
				+	0xFF61, 0xFFDC, 1,
			
 
				+	0xFFE0, 0xFFE6, 2,
			
 
				+	0xFFE8, 0x16F9F, 1,
			
 
				+	0x16FE0, 0x1B2FB, 2,
			
 
				+	0x1BC00, 0x1F003, 1,
			
 
				+	0x1F004, 0x1F004, 2,
			
 
				+	0x1F005, 0x1F0CE, 1,
			
 
				+	0x1F0CF, 0x1F0CF, 2,
			
 
				+	0x1F0D1, 0x1F18D, 1,
			
 
				+	0x1F18E, 0x1F18E, 2,
			
 
				+	0x1F18F, 0x1F190, 1,
			
 
				+	0x1F191, 0x1F19A, 2,
			
 
				+	0x1F19B, 0x1F1FF, 1,
			
 
				+	0x1F200, 0x1F320, 2,
			
 
				+	0x1F321, 0x1F32C, 1,
			
 
				+	0x1F32D, 0x1F335, 2,
			
 
				+	0x1F336, 0x1F336, 1,
			
 
				+	0x1F337, 0x1F37C, 2,
			
 
				+	0x1F37D, 0x1F37D, 1,
			
 
				+	0x1F37E, 0x1F393, 2,
			
 
				+	0x1F394, 0x1F39F, 1,
			
 
				+	0x1F3A0, 0x1F3CA, 2,
			
 
				+	0x1F3CB, 0x1F3CE, 1,
			
 
				+	0x1F3CF, 0x1F3D3, 2,
			
 
				+	0x1F3D4, 0x1F3DF, 1,
			
 
				+	0x1F3E0, 0x1F3F0, 2,
			
 
				+	0x1F3F1, 0x1F3F3, 1,
			
 
				+	0x1F3F4, 0x1F3F4, 2,
			
 
				+	0x1F3F5, 0x1F3F7, 1,
			
 
				+	0x1F3F8, 0x1F43E, 2,
			
 
				+	0x1F43F, 0x1F43F, 1,
			
 
				+	0x1F440, 0x1F440, 2,
			
 
				+	0x1F441, 0x1F441, 1,
			
 
				+	0x1F442, 0x1F4FC, 2,
			
 
				+	0x1F4FD, 0x1F4FE, 1,
			
 
				+	0x1F4FF, 0x1F53D, 2,
			
 
				+	0x1F53E, 0x1F54A, 1,
			
 
				+	0x1F54B, 0x1F54E, 2,
			
 
				+	0x1F54F, 0x1F54F, 1,
			
 
				+	0x1F550, 0x1F567, 2,
			
 
				+	0x1F568, 0x1F579, 1,
			
 
				+	0x1F57A, 0x1F57A, 2,
			
 
				+	0x1F57B, 0x1F594, 1,
			
 
				+	0x1F595, 0x1F596, 2,
			
 
				+	0x1F597, 0x1F5A3, 1,
			
 
				+	0x1F5A4, 0x1F5A4, 2,
			
 
				+	0x1F5A5, 0x1F5FA, 1,
			
 
				+	0x1F5FB, 0x1F64F, 2,
			
 
				+	0x1F650, 0x1F67F, 1,
			
 
				+	0x1F680, 0x1F6C5, 2,
			
 
				+	0x1F6C6, 0x1F6CB, 1,
			
 
				+	0x1F6CC, 0x1F6CC, 2,
			
 
				+	0x1F6CD, 0x1F6CF, 1,
			
 
				+	0x1F6D0, 0x1F6D2, 2,
			
 
				+	0x1F6D3, 0x1F6D4, 1,
			
 
				+	0x1F6D5, 0x1F6DF, 2,
			
 
				+	0x1F6E0, 0x1F6EA, 1,
			
 
				+	0x1F6EB, 0x1F6EC, 2,
			
 
				+	0x1F6F0, 0x1F6F3, 1,
			
 
				+	0x1F6F4, 0x1F6FC, 2,
			
 
				+	0x1F700, 0x1F7D9, 1,
			
 
				+	0x1F7E0, 0x1F7F0, 2,
			
 
				+	0x1F800, 0x1F90B, 1,
			
 
				+	0x1F90C, 0x1F93A, 2,
			
 
				+	0x1F93B, 0x1F93B, 1,
			
 
				+	0x1F93C, 0x1F945, 2,
			
 
				+	0x1F946, 0x1F946, 1,
			
 
				+	0x1F947, 0x1F9FF, 2,
			
 
				+	0x1FA00, 0x1FA6D, 1,
			
 
				+	0x1FA70, 0x1FAF8, 2,
			
 
				+	0x1FB00, 0x1FBF9, 1,
			
 
				+	0x20000, 0x3FFFD, 2,
			
 
				+	0xE0001, 0x10FFFD, 1,
			
 
				+}
			
 
				+
			
 
				 //
			
 
				 // End of Unicode 15.1.0 block.
			
 
				 //
			
--- a/core/unicode/utf8/grapheme.odin
+++ b/core/unicode/utf8/grapheme.odin
@@ -17,11 +17,13 @@ is_spacing_mark                   :: unicode.is_spacing_mark
 
				 is_gcb_prepend_class              :: unicode.is_gcb_prepend_class
			
 
				 is_emoji_extended_pictographic    :: unicode.is_emoji_extended_pictographic
			
 
				 is_regional_indicator             :: unicode.is_regional_indicator
			
 
				+normalized_east_asian_width       :: unicode.normalized_east_asian_width
			
 
				 
			
 
				 
			
 
				+// Per-grapheme record appended by decode_grapheme_clusters while track_graphemes is true.
				 Grapheme :: struct {
			
 
				 	byte_index: int, // NOTE(review): presumably the byte offset in the string where the grapheme starts - confirm
			
 
				 	rune_index: int, // Running rune count at the moment the grapheme begins.
			
 
				+	width: int, // Cells this grapheme added to the total: normalized_east_asian_width of its first rune.
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -33,10 +35,11 @@ Inputs:
 
				 Returns:
			
 
				 - graphemes: The number of graphemes in the string.
			
 
				 - runes: The number of runes in the string.
			
 
				+- width: The width of the string in number of monospace cells.
			
 
				 */
			
 
				 @(require_results)
			
 
				-grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
			
 
				-	_, graphemes, runes = decode_grapheme_clusters(str, false)
			
 
				+grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
			
 
				+	// Only the three totals are kept; the returned grapheme list is discarded.
				+	// NOTE(review): the `false` argument is presumably track_graphemes, so no list is built - confirm.
				+	_, graphemes, runes, width = decode_grapheme_clusters(str, false)
			
 
				 	return
			
 
				 }
			
 
				 
			
@@ -54,6 +57,7 @@ Returns:
 
				 - graphemes: Extra data about each grapheme.
			
 
				 - grapheme_count: The number of graphemes in the string.
			
 
				 - rune_count: The number of runes in the string.
			
 
				+- width: The width of the string in number of monospace cells.
			
 
				 */
			
 
				 @(require_results)
			
 
				 decode_grapheme_clusters :: proc(
			
@@ -64,6 +68,7 @@ decode_grapheme_clusters :: proc(
 
				 	graphemes:      [dynamic]Grapheme,
			
 
				 	grapheme_count: int,
			
 
				 	rune_count:     int,
			
 
				+	width:          int,
			
 
				 ) {
			
 
				 	// The following procedure implements text segmentation by breaking on
			
 
				 	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
			
@@ -115,6 +120,24 @@ decode_grapheme_clusters :: proc(
 
				 	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
			
 
				 	// [4]: https://www.unicode.org/reports/tr29/#Conformance
			
 
				 
			
 
				+	// Additionally, this procedure now takes into account Standard Annex #11,
			
 
				+	// in order to estimate how visually wide the string will appear on a
			
 
				+	// monospaced display. This can only ever be a rough guess, as this tends
			
 
				+	// to be an implementation detail relating to which fonts are being used,
			
 
				+	// how codepoints are interpreted and drawn, if codepoint sequences are
			
 
				+	// interpreted correctly, et cetera.
			
 
				+	//
			
 
				+	// For example, a program may not properly interpret an emoji modifier
			
 
				+	// sequence and print the component glyphs instead of one whole glyph.
			
 
				+	//
			
 
				+	// See here for more information: https://www.unicode.org/reports/tr11/
			
 
				+	//
			
 
				+	// NOTE: There is no explicit mention of what to do with zero-width spaces
			
 
				+	// as far as grapheme cluster segmentation goes, therefore this
			
 
				+	// implementation may count and return graphemes with a `width` of zero.
			
 
				+	//
			
 
				+	// Treat them as any other space.
			
 
				+
			
 
				 	Grapheme_Cluster_Sequence :: enum {
			
 
				 		None,
			
 
				 		Indic,
			
@@ -127,6 +150,7 @@ decode_grapheme_clusters :: proc(
 
				 	last_rune: rune
			
 
				 	last_rune_breaks_forward: bool
			
 
				 
			
 
				+	last_width: int
			
 
				 	last_grapheme_count: int
			
 
				 
			
 
				 	bypass_next_rune: bool
			
@@ -145,10 +169,19 @@ decode_grapheme_clusters :: proc(
 
				 			if rune_count == 0 && grapheme_count == 0 {
			
 
				 				grapheme_count += 1
			
 
				 			}
			
 
				-			if track_graphemes && grapheme_count > last_grapheme_count {
			
 
				-				append(&graphemes, Grapheme{ byte_index, rune_count })
			
 
				+
			
 
				+			if grapheme_count > last_grapheme_count {
			
 
				+				width += normalized_east_asian_width(this_rune)
			
 
				+				if track_graphemes {
			
 
				+					append(&graphemes, Grapheme{
			
 
				+						byte_index,
			
 
				+						rune_count,
			
 
				+						width - last_width,
			
 
				+					})
			
 
				+				}
			
 
				+				last_grapheme_count = grapheme_count
			
 
				+				last_width = width
			
 
				 			}
			
 
				-			last_grapheme_count = grapheme_count
			
 
				 
			
 
				 			last_rune = this_rune
			
 
				 			rune_count += 1
			
--- a/tests/core/unicode/test_core_unicode.odin
+++ b/tests/core/unicode/test_core_unicode.odin
@@ -13,7 +13,7 @@ run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_lo
 
				 	failed := 0
			
 
				 	for c, i in test_cases {
			
 
				 		log.debugf("(#% 4i) %q ...", i, c.str)
			
 
				-		result, _ := utf8.grapheme_count(c.str)
			
 
				+		result, _, _ := utf8.grapheme_count(c.str)
			
 
				 		if !testing.expectf(t, result == c.expected_clusters,
			
 
				 			"(#% 4i) graphemes: %i != %i, %q %s", i, result, c.expected_clusters, c.str, c.str,
			
 
				 			loc = loc)
			
@@ -43,7 +43,7 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
 
				 
			
 
				 	str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1
			
 
				 
			
 
				-	graphemes, _, _ := utf8.decode_grapheme_clusters(str)
			
 
				+	graphemes, _, _, _ := utf8.decode_grapheme_clusters(str)
			
 
				 	defer delete(graphemes)
			
 
				 
			
 
				 	defer if testing.failed(t) {
			
@@ -71,3 +71,65 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
 
				 	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
			
 
				 	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
			
 
				 }
			
 
				+
			
 
				+// Checks grapheme counts and estimated monospace widths for a handful of sample strings.
				+@test
				+test_width :: proc(t: ^testing.T) {
				+	// A zero-width joiner is expected to add neither a grapheme nor any width.
				+	{
				+		cluster_total, _, cells := utf8.grapheme_count("He\u200dllo")
				+		testing.expect_value(t, cluster_total, 5)
				+		testing.expect_value(t, cells, 5)
				+	}
				+
				+	// A zero-width space is expected to count as its own grapheme
				+	// while contributing nothing to the width.
				+	{
				+		cluster_total, _, cells := utf8.grapheme_count("He\u200bllo")
				+		testing.expect_value(t, cluster_total, 6)
				+		testing.expect_value(t, cells, 5)
				+	}
				+
				+	// Emoji, modifier, joiner and sign are expected to form one grapheme that is two cells wide.
				+	{
				+		cluster_total, _, cells := utf8.grapheme_count("\U0001F926\U0001F3FC\u200D\u2642")
				+		testing.expect_value(t, cluster_total, 1)
				+		testing.expect_value(t, cells, 2)
				+	}
				+
				+	// Every base character carries an overlay mark; each pair is expected
				+	// to be a single grapheme of one cell.
				+	{
				+		cluster_total, _, cells := utf8.grapheme_count("H̷e̶l̵l̸o̴p̵e̷ ̸w̶o̸r̵l̶d̵!̴")
				+		testing.expect_value(t, cluster_total, 14)
				+		testing.expect_value(t, cells, 14)
				+	}
				+
				+	// Mixed narrow and wide characters, also checking the width recorded per grapheme.
				+	{
				+		clusters, cluster_total, _, cells := utf8.decode_grapheme_clusters("aカ.ヒフ")
				+		defer delete(clusters)
				+		testing.expect_value(t, cluster_total, 5)
				+		testing.expect_value(t, cells, 8)
				+		// Only index into the list when the count is as expected.
				+		if cluster_total == 5 {
				+			testing.expect_value(t, clusters[0].width, 1)
				+			testing.expect_value(t, clusters[1].width, 2)
				+			testing.expect_value(t, clusters[2].width, 1)
				+			testing.expect_value(t, clusters[3].width, 2)
				+			testing.expect_value(t, clusters[4].width, 2)
				+		}
				+	}
				+
				+	// Six characters, each expected to be two cells wide.
				+	{
				+		cluster_total, _, cells := utf8.grapheme_count("いろはにほへ")
				+		testing.expect_value(t, cluster_total, 6)
				+		testing.expect_value(t, cells, 12)
				+	}
				+
				+	// Ideographs and punctuation alike are expected to be two cells wide.
				+	{
				+		cluster_total, _, cells := utf8.grapheme_count("舍利弗，是諸法空相，不生不滅，不垢不淨，不增不減。")
				+		testing.expect_value(t, cluster_total, 25)
				+		testing.expect_value(t, cells, 50)
				+	}
				+}