Jelajahi Sumber

Include unicode check in case conversions. (#347)

ToLower and ToUpper now include Unicode case checks, so that a string which is already in the requested case is returned as-is instead of being copied into a new string.
Brucey 7 bulan lalu
induk
melakukan
da1d1a09c0
2 mengubah file dengan 189 tambahan dan 14 penghapusan
  1. 62 14
      blitz.mod/blitz_string.c
  2. 127 0
      blitz.mod/tests/test.bmx

+ 62 - 14
blitz.mod/blitz_string.c

@@ -849,13 +849,37 @@ BBString *bbStringToLower( BBString *str ){
 	int n = 0;
 	int n = 0;
 	
 	
 	while (n < str->length) {
 	while (n < str->length) {
-		int c=str->buf[n];
-		// ascii upper or other unicode char
-		if (c >= 192 || (c>='A' && c<='Z')) {
-			break;
-		}
-		++n;
-	}
+        int c = str->buf[n];
+        if (c < 192) {
+            // ASCII character
+            if (c >= 'A' && c <= 'Z') {
+                // Found an uppercase ASCII character
+                break;
+            }
+        } else {
+            // Unicode character
+            // Check if the character is an uppercase Unicode character
+            int lo = 0, hi = (3828 / 4) - 1; // sizeof(bbToLowerData) = 3828
+            int is_upper = 0;
+            while (lo <= hi) {
+                int mid = (lo + hi) / 2;
+                int upper = bbToLowerData[mid * 2];
+                if (c < upper) {
+                    hi = mid - 1;
+                } else if (c > upper) {
+                    lo = mid + 1;
+                } else {
+                    // Found an uppercase Unicode character
+                    is_upper = 1;
+                    break;
+                }
+            }
+            if (is_upper) {
+                break;
+            }
+        }
+        ++n;
+    }
 	
 	
 	if (n == str->length) {
 	if (n == str->length) {
 		return str;
 		return str;
@@ -896,13 +920,37 @@ BBString *bbStringToUpper( BBString *str ){
 	int n = 0;
 	int n = 0;
 	
 	
 	while (n < str->length) {
 	while (n < str->length) {
-		int c=str->buf[n];
-		// ascii lower or other unicode char
-		if (c >= 181 || (c>='a' && c<='z')) {
-			break;
-		}
-		++n;
-	}
+        int c = str->buf[n];
+        if (c < 181) {
+            // ASCII character
+            if (c >= 'a' && c <= 'z') {
+                // Found a lowercase ASCII character
+                break;
+            }
+        } else {
+            // Unicode character
+            // Check if the character is a lowercase Unicode character
+            int lo = 0, hi = (3860 / 4) - 1; // sizeof(bbToUpperData) = 3860
+            int is_lower = 0;
+            while (lo <= hi) {
+                int mid = (lo + hi) / 2;
+                int lower = bbToUpperData[mid * 2];
+                if (c < lower) {
+                    hi = mid - 1;
+                } else if (c > lower) {
+                    lo = mid + 1;
+                } else {
+                    // Found a lowercase Unicode character
+                    is_lower = 1;
+                    break;
+                }
+            }
+            if (is_lower) {
+                break;
+            }
+        }
+        ++n;
+    }
 	
 	
 	if (n == str->length) {
 	if (n == str->length) {
 		return str;
 		return str;

+ 127 - 0
blitz.mod/tests/test.bmx

@@ -10,6 +10,15 @@ Type TStringTest Extends TTest
 	Field bigUnicode:UInt[] = [$10300, $10301, $10302, $10303, $10304, $10305, 0]
 	Field bigUnicode:UInt[] = [$10300, $10301, $10302, $10303, $10304, $10305, 0]
 	Field unicode:Int[] = [1055, 1088, 1080, 1074, 1077, 1090]
 	Field unicode:Int[] = [1055, 1088, 1080, 1074, 1077, 1090]
 	Field utf8:Byte[] = [208, 159, 209, 128, 208, 184, 208, 178, 208, 181, 209, 130, 0]
 	Field utf8:Byte[] = [208, 159, 209, 128, 208, 184, 208, 178, 208, 181, 209, 130, 0]
+
+	Const HELLO_UPPER:String = "HELLO"
+	Const HELLO_LOWER:String = "hello"
+	Const UMLAUT_UPPER:String = "123ÄÖÜABC"
+	Const UMLAUT_LOWER:String = "123äöüabc"
+	Const ARABIC_UPPER:String = "123كلمة"
+	Const ARABIC_LOWER:String = "123كلمة"
+	Const CYRILLIC_UPPER:String = "123БУДИНОК"
+	Const CYRILLIC_LOWER:String = "123будинок"
 	
 	
 	Method setup() { before }
 	Method setup() { before }
 	End Method
 	End Method
@@ -33,6 +42,124 @@ Type TStringTest Extends TTest
 
 
 	End Method
 	End Method
 
 
+	Method testASCIIToLower() { test }
+		Local s:String = HELLO_UPPER
+		assertEquals(HELLO_LOWER, s.ToLower())
+
+		Local obj:Object = HELLO_LOWER
+		Local obj1:Object = HELLO_LOWER.ToLower()
+
+		assertTrue(obj = obj1, "Already lowercase ASCII strings should return the same object")
+
+	End Method
+
+	Method testASCIIToUpper() { test }
+		Local s:String = HELLO_LOWER
+		assertEquals(HELLO_UPPER, s.ToUpper())
+
+		Local obj:Object = HELLO_UPPER
+		Local obj1:Object = HELLO_UPPER.ToUpper()
+
+		assertTrue(obj = obj1, "Already uppercase ASCII strings should return the same object")
+
+	End Method
+
+	Method testUnicodeToLower() { test }
+
+		Local s:String = UMLAUT_UPPER
+		assertEquals(UMLAUT_LOWER, s.ToLower())
+
+		Local obj:Object = UMLAUT_LOWER
+		Local obj1:Object = UMLAUT_LOWER.ToLower()
+
+		assertTrue(obj = obj1, "Already lowercase Unicode strings should return the same object")
+
+	End Method
+
+	Method testUnicodeToUpper() { test }
+
+		Local s:String = UMLAUT_LOWER
+		assertEquals(UMLAUT_UPPER, s.ToUpper())
+
+		Local obj:Object = UMLAUT_UPPER
+		Local obj1:Object = UMLAUT_UPPER.ToUpper()
+
+		assertTrue(obj = obj1, "Already uppercase Unicode strings should return the same object")
+
+	End Method
+
+	Method testArabicToLower() { test }
+
+		Local s:String = ARABIC_UPPER
+		assertEquals(ARABIC_LOWER, s.ToLower(), "Arabic lower case")
+
+		Local obj:Object = ARABIC_LOWER
+		Local obj1:Object = ARABIC_LOWER.ToLower()
+
+		assertTrue(obj = obj1, "Already lowercase Arabic strings should return the same object")
+
+	End Method
+
+	Method testArabicToUpper() { test }
+
+		Local s:String = ARABIC_LOWER
+		assertEquals(ARABIC_UPPER, s.ToUpper(), "Arabic upper case")
+
+		Local obj:Object = ARABIC_UPPER
+		Local obj1:Object = ARABIC_UPPER.ToUpper()
+
+		assertTrue(obj = obj1, "Already uppercase Arabic strings should return the same object")
+
+	End Method
+
+	Method testArabicUpperToLower() { test }
+
+		Local s:String = ARABIC_UPPER
+		assertEquals(ARABIC_UPPER, s.ToLower(), "Arabic lower case and upper case should be the same")
+
+		Local obj:Object = ARABIC_UPPER
+		Local obj1:Object = ARABIC_UPPER.ToLower()
+
+		assertTrue(obj = obj1, "Uppercase Arabic strings should return the same object when lowered")
+
+	End Method
+
+	Method testArabicLowerToUpper() { test }
+
+		Local s:String = ARABIC_LOWER
+		assertEquals(ARABIC_LOWER, s.ToUpper(), "Arabic upper case and lower case should be the same")
+
+		Local obj:Object = ARABIC_LOWER
+		Local obj1:Object = ARABIC_LOWER.ToUpper()
+
+		assertTrue(obj = obj1, "Lowercase Arabic strings should return the same object when uppered")
+
+	End Method
+
+	Method testCyrillicToLower() { test }
+
+		Local s:String = CYRILLIC_UPPER
+		assertEquals(CYRILLIC_LOWER, s.ToLower(), "Cyrillic lower case")
+
+		Local obj:Object = CYRILLIC_LOWER
+		Local obj1:Object = CYRILLIC_LOWER.ToLower()
+
+		assertTrue(obj = obj1, "Already lowercase Cyrillic strings should return the same object")
+
+	End Method
+
+	Method testCyrrilicToUpper() { test }
+
+		Local s:String = CYRILLIC_LOWER
+		assertEquals(CYRILLIC_UPPER, s.ToUpper(), "Cyrillic upper case")
+
+		Local obj:Object = CYRILLIC_UPPER
+		Local obj1:Object = CYRILLIC_UPPER.ToUpper()
+
+		assertTrue(obj = obj1, "Already uppercase Cyrillic strings should return the same object")
+
+	End Method
+
 End Type
 End Type
 
 
 Struct STestStruct
 Struct STestStruct