浏览代码

Improved bbStringFromUTF8Bytes. (#369)

Added unit tests.
Brucey 4 月之前
父节点
当前提交
21217596aa
共有 2 个文件被更改,包括 209 次插入和 42 次删除
  1. 98 42
      blitz.mod/blitz_string.c
  2. 111 0
      blitz.mod/tests/test.bmx

+ 98 - 42
blitz.mod/blitz_string.c

@@ -312,49 +312,105 @@ BBString *bbStringFromUTF8String( const unsigned char *p ){
 	return p ? bbStringFromUTF8Bytes( p,strlen((char*)p) ) : &bbEmptyString;
 }
 
-BBString *bbStringFromUTF8Bytes( const unsigned char *p,int n ){
-	int c;
-	unsigned short *d,*q;
-	BBString *str;
+#define REPLACEMENT_CHAR 0xFFFD
+
+BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
+    if (!p || n <= 0) return &bbEmptyString;
+
+    // Allocate worst-case: one output code unit per input byte.
+    unsigned short *buffer = (unsigned short*)malloc(n * sizeof(unsigned short));
+    if (!buffer) return &bbEmptyString; // Allocation failed
+
+    unsigned short *dest = buffer;
+    const unsigned char *end = p + n;
+
+    while (p < end) {
+        unsigned int codepoint;
+        unsigned char byte = *p++;
+
+        if (byte < 0x80) {
+            // 1-byte (ASCII)
+            *dest++ = byte;
+        } else if (byte < 0xC0) {
+            // Unexpected continuation byte; insert replacement.
+            *dest++ = REPLACEMENT_CHAR;
+        } else if (byte < 0xE0) {
+            // 2-byte sequence: 110xxxxx 10xxxxxx
+            if (p >= end) {
+                *dest++ = REPLACEMENT_CHAR;
+                break;
+            }
+            unsigned char byte2 = *p++;
+            if ((byte2 & 0xC0) != 0x80) {
+                *dest++ = REPLACEMENT_CHAR;
+                continue;
+            }
+            codepoint = ((byte & 0x1F) << 6) | (byte2 & 0x3F);
+            if (codepoint < 0x80) { // Overlong encoding
+                *dest++ = REPLACEMENT_CHAR;
+            } else {
+                *dest++ = (unsigned short)codepoint;
+            }
+        } else if (byte < 0xF0) {
+            // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+            if (p + 1 >= end) {
+                *dest++ = REPLACEMENT_CHAR;
+                break;
+            }
+            unsigned char byte2 = *p++;
+            unsigned char byte3 = *p++;
+            if ((byte2 & 0xC0) != 0x80 || (byte3 & 0xC0) != 0x80) {
+                *dest++ = REPLACEMENT_CHAR;
+                continue;
+            }
+            codepoint = ((byte & 0x0F) << 12) |
+                        ((byte2 & 0x3F) << 6) |
+                        (byte3 & 0x3F);
+            // Reject overlong sequences and surrogate halves.
+            if (codepoint < 0x800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
+                *dest++ = REPLACEMENT_CHAR;
+            } else {
+                *dest++ = (unsigned short)codepoint;
+            }
+        } else if (byte < 0xF8) {
+            // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            if (p + 2 >= end) {
+                *dest++ = REPLACEMENT_CHAR;
+                break;
+            }
+            unsigned char byte2 = *p++;
+            unsigned char byte3 = *p++;
+            unsigned char byte4 = *p++;
+            if ((byte2 & 0xC0) != 0x80 ||
+                (byte3 & 0xC0) != 0x80 ||
+                (byte4 & 0xC0) != 0x80) {
+                *dest++ = REPLACEMENT_CHAR;
+                continue;
+            }
+            codepoint = ((byte & 0x07) << 18) |
+                        ((byte2 & 0x3F) << 12) |
+                        ((byte3 & 0x3F) << 6) |
+                        (byte4 & 0x3F);
+            // Ensure codepoint is within valid range.
+            if (codepoint < 0x10000 || codepoint > 0x10FFFF) {
+                *dest++ = REPLACEMENT_CHAR;
+            } else {
+                // Convert to surrogate pair.
+                codepoint -= 0x10000;
+                unsigned short highSurrogate = 0xD800 | ((codepoint >> 10) & 0x3FF);
+                unsigned short lowSurrogate  = 0xDC00 | (codepoint & 0x3FF);
+                *dest++ = highSurrogate;
+                *dest++ = lowSurrogate;
+            }
+        } else {
+            // Bytes above 0xF7 are invalid in modern UTF-8.
+            *dest++ = REPLACEMENT_CHAR;
+        }
+    }
 
-	if( !p || n <= 0 ) return &bbEmptyString;
-	
-	d=(unsigned short*)malloc( n*2 );
-	q=d;
-	
-	while( n-- && (c=*p++ & 0xff)){
-		if( c<0x80 ){
-			*q++=c;
-		}else{
-			if (!n--) break;
-			int d=*p++ & 0x3f;
-			if( c<0xe0 ){
-				*q++=((c&31)<<6) | d;
-			}else{
-				if (!n--) break;
-				int e=*p++ & 0x3f;
-				if( c<0xf0 ){
-					*q++=((c&15)<<12) | (d<<6) | e;
-				}else{
-					if (!n--) break;
-					int f=*p++ & 0x3f;
-					int v=((c&7)<<18) | (d<<12) | (e<<6) | f;
-					if( v & 0xffff0000 ) {
-						v -= 0x10000;
-						d = ((v >> 10) & 0x7ff) + 0xd800;
-						e = (v & 0x3ff) + 0xdc00;
-						*q++=d;
-						*q++=e;
-					}else{
-						*q++=v;
-					}
-				}
-			}
-		}
-	}
-	str=bbStringFromShorts( d,q-d );
-	free( d );
-	return str;
+    BBString *str = bbStringFromShorts(buffer, dest - buffer);
+    free(buffer);
+    return str;
 }
 
 BBString *bbStringToString( BBString *t ){

+ 111 - 0
blitz.mod/tests/test.bmx

@@ -330,3 +330,114 @@ Type TStringToIntExTest Extends TTest
 	End Method
 
 End Type
+
+Type TStringFromUTF8BytesTest Extends TTest
+
+    ' Test valid ASCII conversion.
+    Method testASCII() { test }
+        Local data:Byte[] = [72, 101, 108, 108, 111] ' "Hello"
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals("Hello", text)
+    End Method
+
+    ' Test conversion of a 2-byte UTF-8 sequence (e.g. ©: U+00A9).
+    Method testTwoByteSequence() { test }
+        ' © U+00A9: UTF-8: $C2, $A9.
+        Local data:Byte[] = [$C2, $A9]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals(Chr($00A9), text)
+    End Method
+
+    ' Test conversion of a 3-byte UTF-8 sequence (e.g. €: U+20AC).
+    Method testThreeByteSequence() { test }
+        ' € U+20AC: UTF-8: $E2, $82, $AC.
+        Local data:Byte[] = [$E2, $82, $AC]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals(Chr($20AC), text)
+    End Method
+
+    ' Test conversion of a 4-byte UTF-8 sequence (e.g. U+1F600: grinning face emoji).
+    Method testFourByteSequence() { test }
+        ' Grinning Face U+1F600: UTF-8: $F0, $9F, $98, $80.
+        Local data:Byte[] = [$F0, $9F, $98, $80]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        ' Expected string in UTF-16: surrogate pair (high: $D83D, low: $DE00).
+        Local expected:String = Chr($D83D) + Chr($DE00)
+        assertEquals(expected, text)
+    End Method
+
+    ' Test an incomplete sequence (missing continuation bytes).
+    Method testIncompleteSequence() { test }
+        ' Incomplete 3-byte sequence: [$E2, $82] missing the final byte.
+        Local data:Byte[] = [$E2, $82]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        ' Expect a replacement character.
+        assertEquals(Chr($FFFD), text)
+    End Method
+
+    ' Test an invalid continuation byte following a valid starter.
+    Method testInvalidContinuation() { test }
+        ' [$C2, $20]: $20 is not a valid continuation byte.
+        Local data:Byte[] = [$C2, $20]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals(Chr($FFFD), text)
+    End Method
+
+    ' Test a stray continuation byte.
+    Method testStrayContinuation() { test }
+        ' A single continuation byte $80 is invalid.
+        Local data:Byte[] = [$80]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals(Chr($FFFD), text)
+    End Method
+
+    ' Test a mix of valid and invalid sequences.
+    Method testMixedValidInvalid() { test }
+        ' "A" ($41), stray continuation ($80), then "B" ($42).
+        Local data:Byte[] = [65, $80, 66]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        Local expected:String = Chr(65) + Chr($FFFD) + Chr(66)
+        assertEquals(expected, text)
+    End Method
+
+    ' Test overlong encoding.
+    Method testOverlongEncoding() { test }
+        ' Overlong encoding for NUL: [$C0, $80] should be rejected.
+        Local data:Byte[] = [$C0, $80]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals(Chr($FFFD), text)
+    End Method
+
+    ' Test a UTF-8 sequence encoding a surrogate half (e.g. U+D800).
+    Method testSurrogateHalf() { test }
+        ' U+D800 encoded in UTF-8: [$ED, $A0, $80].
+        Local data:Byte[] = [$ED, $A0, $80]
+        Local text:String = String.FromUTF8Bytes(data, data.Length)
+        assertEquals(Chr($FFFD), text)
+    End Method
+
+	' Test conversion of Russian "hello" ("привет").
+	Method testRussianHello() { test }
+		' "привет": [$D0, $BF, $D1, $80, $D0, $B8, $D0, $B2, $D0, $B5, $D1, $82]
+		Local data:Byte[] = [$D0, $BF, $D1, $80, $D0, $B8, $D0, $B2, $D0, $B5, $D1, $82]
+		Local text:String = String.FromUTF8Bytes(data, data.Length)
+		assertEquals("привет", text)
+	End Method
+
+	' Test conversion of Japanese "hello" ("こんにちは").
+	Method testJapaneseHello() { test }
+		' "こんにちは": [$E3, $81, $93, $E3, $82, $93, $E3, $81, $AB, $E3, $81, $A1, $E3, $81, $AF]
+		Local data:Byte[] = [$E3, $81, $93, $E3, $82, $93, $E3, $81, $AB, $E3, $81, $A1, $E3, $81, $AF]
+		Local text:String = String.FromUTF8Bytes(data, data.Length)
+		assertEquals("こんにちは", text)
+	End Method
+
+	' Test conversion of Chinese "hello" ("你好").
+	Method testChineseHello() { test }
+		' "你好": [$E4, $BD, $A0, $E5, $A5, $BD]
+		Local data:Byte[] = [$E4, $BD, $A0, $E5, $A5, $BD]
+		Local text:String = String.FromUTF8Bytes(data, data.Length)
+		assertEquals("你好", text)
+	End Method
+
+End Type