6 kuukautta sitten · 21217596aa
--- a/blitz.mod/blitz_string.c
+++ b/blitz.mod/blitz_string.c
@@ -312,49 +312,105 @@ BBString *bbStringFromUTF8String( const unsigned char *p ){
 
				 	return p ? bbStringFromUTF8Bytes( p,strlen((char*)p) ) : &bbEmptyString;

			
 
				 }

			
 
				 

			
 
				-BBString *bbStringFromUTF8Bytes( const unsigned char *p,int n ){

			
 
				-	int c;

			
 
				-	unsigned short *d,*q;

			
 
				-	BBString *str;

			
 
				+#define REPLACEMENT_CHAR 0xFFFD

			
 
				+

			
 
				+BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {

			
 
				+    if (!p || n <= 0) return &bbEmptyString;

			
 
				+

			
 
				+    // Allocate worst-case: at most one UTF-16 code unit per input byte (a 4-byte sequence yields only 2 units).

			
 
				+    unsigned short *buffer = (unsigned short*)malloc(n * sizeof(unsigned short));

			
 
				+    if (!buffer) return &bbEmptyString; // Allocation failed

			
 
				+

			
 
				+    unsigned short *dest = buffer;

			
 
				+    const unsigned char *end = p + n;

			
 
				+

			
 
				+    while (p < end) {

			
 
				+        unsigned int codepoint;

			
 
				+        unsigned char byte = *p++;

			
 
				+

			
 
				+        if (byte < 0x80) {

			
 
				+            // 1-byte (ASCII)

			
 
				+            *dest++ = byte;

			
 
				+        } else if (byte < 0xC0) {

			
 
				+            // Unexpected continuation byte; insert replacement.

			
 
				+            *dest++ = REPLACEMENT_CHAR;

			
 
				+        } else if (byte < 0xE0) {

			
 
				+            // 2-byte sequence: 110xxxxx 10xxxxxx

			
 
				+            if (p >= end) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+                break;

			
 
				+            }

			
 
				+            unsigned char byte2 = *p++;

			
 
				+            if ((byte2 & 0xC0) != 0x80) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+                continue;

			
 
				+            }

			
 
				+            codepoint = ((byte & 0x1F) << 6) | (byte2 & 0x3F);

			
 
				+            if (codepoint < 0x80) { // Overlong encoding

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+            } else {

			
 
				+                *dest++ = (unsigned short)codepoint;

			
 
				+            }

			
 
				+        } else if (byte < 0xF0) {

			
 
				+            // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx

			
 
				+            if (p + 1 >= end) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+                break;

			
 
				+            }

			
 
				+            unsigned char byte2 = *p++;

			
 
				+            unsigned char byte3 = *p++;

			
 
				+            if ((byte2 & 0xC0) != 0x80 || (byte3 & 0xC0) != 0x80) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+                continue;

			
 
				+            }

			
 
				+            codepoint = ((byte & 0x0F) << 12) |

			
 
				+                        ((byte2 & 0x3F) << 6) |

			
 
				+                        (byte3 & 0x3F);

			
 
				+            // Reject overlong sequences and surrogate halves.

			
 
				+            if (codepoint < 0x800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+            } else {

			
 
				+                *dest++ = (unsigned short)codepoint;

			
 
				+            }

			
 
				+        } else if (byte < 0xF8) {

			
 
				+            // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

			
 
				+            if (p + 2 >= end) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+                break;

			
 
				+            }

			
 
				+            unsigned char byte2 = *p++;

			
 
				+            unsigned char byte3 = *p++;

			
 
				+            unsigned char byte4 = *p++;

			
 
				+            if ((byte2 & 0xC0) != 0x80 ||

			
 
				+                (byte3 & 0xC0) != 0x80 ||

			
 
				+                (byte4 & 0xC0) != 0x80) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+                continue;

			
 
				+            }

			
 
				+            codepoint = ((byte & 0x07) << 18) |

			
 
				+                        ((byte2 & 0x3F) << 12) |

			
 
				+                        ((byte3 & 0x3F) << 6) |

			
 
				+                        (byte4 & 0x3F);

			
 
				+            // Reject overlong encodings (< U+10000) and values beyond U+10FFFF.

			
 
				+            if (codepoint < 0x10000 || codepoint > 0x10FFFF) {

			
 
				+                *dest++ = REPLACEMENT_CHAR;

			
 
				+            } else {

			
 
				+                // Convert to surrogate pair.

			
 
				+                codepoint -= 0x10000;

			
 
				+                unsigned short highSurrogate = 0xD800 | ((codepoint >> 10) & 0x3FF);

			
 
				+                unsigned short lowSurrogate  = 0xDC00 | (codepoint & 0x3FF);

			
 
				+                *dest++ = highSurrogate;

			
 
				+                *dest++ = lowSurrogate;

			
 
				+            }

			
 
				+        } else {

			
 
				+            // Bytes 0xF8..0xFF are never valid lead bytes (UTF-8 sequences are at most 4 bytes long).

			
 
				+            *dest++ = REPLACEMENT_CHAR;

			
 
				+        }

			
 
				+    }

			
 
				 

			
 
				-	if( !p || n <= 0 ) return &bbEmptyString;

			
 
				-	

			
 
				-	d=(unsigned short*)malloc( n*2 );

			
 
				-	q=d;

			
 
				-	

			
 
				-	while( n-- && (c=*p++ & 0xff)){

			
 
				-		if( c<0x80 ){

			
 
				-			*q++=c;

			
 
				-		}else{

			
 
				-			if (!n--) break;

			
 
				-			int d=*p++ & 0x3f;

			
 
				-			if( c<0xe0 ){

			
 
				-				*q++=((c&31)<<6) | d;

			
 
				-			}else{

			
 
				-				if (!n--) break;

			
 
				-				int e=*p++ & 0x3f;

			
 
				-				if( c<0xf0 ){

			
 
				-					*q++=((c&15)<<12) | (d<<6) | e;

			
 
				-				}else{

			
 
				-					if (!n--) break;

			
 
				-					int f=*p++ & 0x3f;

			
 
				-					int v=((c&7)<<18) | (d<<12) | (e<<6) | f;

			
 
				-					if( v & 0xffff0000 ) {

			
 
				-						v -= 0x10000;

			
 
				-						d = ((v >> 10) & 0x7ff) + 0xd800;

			
 
				-						e = (v & 0x3ff) + 0xdc00;

			
 
				-						*q++=d;

			
 
				-						*q++=e;

			
 
				-					}else{

			
 
				-						*q++=v;

			
 
				-					}

			
 
				-				}

			
 
				-			}

			
 
				-		}

			
 
				-	}

			
 
				-	str=bbStringFromShorts( d,q-d );

			
 
				-	free( d );

			
 
				-	return str;

			
 
				+    BBString *str = bbStringFromShorts(buffer, dest - buffer);

			
 
				+    free(buffer);

			
 
				+    return str;

			
 
				 }

			
 
				 

			
 
				 BBString *bbStringToString( BBString *t ){

			
--- a/blitz.mod/tests/test.bmx
+++ b/blitz.mod/tests/test.bmx
@@ -330,3 +330,114 @@ Type TStringToIntExTest Extends TTest
 
				 	End Method
			
 
				 
			
 
				 End Type
			
 
				+
			
 
				+Type TStringFromUTF8BytesTest Extends TTest
			
 
				+
			
 
				+    ' Test valid ASCII conversion.
			
 
				+    Method testASCII() { test }
			
 
				+        Local data:Byte[] = [72, 101, 108, 108, 111] ' "Hello"
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals("Hello", text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test conversion of a 2-byte UTF-8 sequence (e.g. ©: U+00A9).
			
 
				+    Method testTwoByteSequence() { test }
			
 
				+        ' © U+00A9: UTF-8: $C2, $A9.
			
 
				+        Local data:Byte[] = [$C2, $A9]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals(Chr($00A9), text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test conversion of a 3-byte UTF-8 sequence (e.g. €: U+20AC).
			
 
				+    Method testThreeByteSequence() { test }
			
 
				+        ' € U+20AC: UTF-8: $E2, $82, $AC.
			
 
				+        Local data:Byte[] = [$E2, $82, $AC]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals(Chr($20AC), text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test conversion of a 4-byte UTF-8 sequence (e.g. U+1F600: grinning face emoji).
			
 
				+    Method testFourByteSequence() { test }
			
 
				+        ' Grinning Face U+1F600: UTF-8: $F0, $9F, $98, $80.
			
 
				+        Local data:Byte[] = [$F0, $9F, $98, $80]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        ' Expected string in UTF-16: surrogate pair (high: $D83D, low: $DE00).
			
 
				+        Local expected:String = Chr($D83D) + Chr($DE00)
			
 
				+        assertEquals(expected, text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test an incomplete sequence (missing continuation bytes).
			
 
				+    Method testIncompleteSequence() { test }
			
 
				+        ' Incomplete 3-byte sequence: [$E2, $82] missing the final byte.
			
 
				+        Local data:Byte[] = [$E2, $82]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        ' Expect a single replacement character for the truncated sequence.
			
 
				+        assertEquals(Chr($FFFD), text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test an invalid continuation byte following a valid starter.
			
 
				+    Method testInvalidContinuation() { test }
			
 
				+        ' [$C2, $20]: $20 is not a valid continuation byte; the decoder consumes it and emits one U+FFFD for the pair.
			
 
				+        Local data:Byte[] = [$C2, $20]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals(Chr($FFFD), text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test a stray continuation byte.
			
 
				+    Method testStrayContinuation() { test }
			
 
				+        ' A single continuation byte $80 is invalid.
			
 
				+        Local data:Byte[] = [$80]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals(Chr($FFFD), text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test a mix of valid and invalid sequences.
			
 
				+    Method testMixedValidInvalid() { test }
			
 
				+        ' "A" ($41), stray continuation ($80), then "B" ($42).
			
 
				+        Local data:Byte[] = [65, $80, 66]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        Local expected:String = Chr(65) + Chr($FFFD) + Chr(66)
			
 
				+        assertEquals(expected, text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test overlong encoding.
			
 
				+    Method testOverlongEncoding() { test }
			
 
				+        ' Overlong encoding for NUL: [$C0, $80] should be rejected.
			
 
				+        Local data:Byte[] = [$C0, $80]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals(Chr($FFFD), text)
			
 
				+    End Method
			
 
				+
			
 
				+    ' Test a UTF-8 sequence encoding a surrogate half (e.g. U+D800).
			
 
				+    Method testSurrogateHalf() { test }
			
 
				+        ' U+D800 encoded in UTF-8: [$ED, $A0, $80].
			
 
				+        Local data:Byte[] = [$ED, $A0, $80]
			
 
				+        Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+        assertEquals(Chr($FFFD), text)
			
 
				+    End Method
			
 
				+
			
 
				+	' Test conversion of Russian "hello" ("привет").
			
 
				+	Method testRussianHello() { test }
			
 
				+		' "привет": [$D0, $BF, $D1, $80, $D0, $B8, $D0, $B2, $D0, $B5, $D1, $82]
			
 
				+		Local data:Byte[] = [$D0, $BF, $D1, $80, $D0, $B8, $D0, $B2, $D0, $B5, $D1, $82]
			
 
				+		Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+		assertEquals("привет", text)
			
 
				+	End Method
			
 
				+
			
 
				+	' Test conversion of Japanese "hello" ("こんにちは").
			
 
				+	Method testJapaneseHello() { test }
			
 
				+		' "こんにちは": [$E3, $81, $93, $E3, $82, $93, $E3, $81, $AB, $E3, $81, $A1, $E3, $81, $AF]
			
 
				+		Local data:Byte[] = [$E3, $81, $93, $E3, $82, $93, $E3, $81, $AB, $E3, $81, $A1, $E3, $81, $AF]
			
 
				+		Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+		assertEquals("こんにちは", text)
			
 
				+	End Method
			
 
				+
			
 
				+	' Test conversion of Chinese "hello" ("你好").
			
 
				+	Method testChineseHello() { test }
			
 
				+		' "你好": [$E4, $BD, $A0, $E5, $A5, $BD]
			
 
				+		Local data:Byte[] = [$E4, $BD, $A0, $E5, $A5, $BD]
			
 
				+		Local text:String = String.FromUTF8Bytes(data, data.Length)
			
 
				+		assertEquals("你好", text)
			
 
				+	End Method
			
 
				+
			
 
				+End Type