浏览代码

Added basic support for UCS-2 surrogate pairs.

woollybah 6 年之前
父节点
当前提交
59e1fdd71f
共有 2 个文件被更改,包括 63 次插入和 13 次删除
  1. 22 6
      blitz.mod/blitz_string.c
  2. 41 7
      textstream.mod/textstream.bmx

+ 22 - 6
blitz.mod/blitz_string.c

@@ -264,8 +264,15 @@ BBString *bbStringFromUTF8String( const char *p ){
 				}else{
 				}else{
 					int f=*p++ & 0x3f;
 					int f=*p++ & 0x3f;
 					int v=((c&7)<<18) | (d<<12) | (e<<6) | f;
 					int v=((c&7)<<18) | (d<<12) | (e<<6) | f;
-					if( v & 0xffff0000 ) bbExThrowCString( "Unicode character out of UCS-2 range" );
-					*q++=v;
+					if( v & 0xffff0000 ) {
+						v -= 0x10000;
+						d = ((v >> 10) & 0x7ffff) + 0xd800;
+						e = (v & 0x3ff) + 0xdc00;
+						*q++=d;
+						*q++=e;
+					}else{
+						*q++=v;
+					}
 				}
 				}
 			}
 			}
 		}
 		}
@@ -760,7 +767,7 @@ BBChar *bbStringToWString( BBString *str ){
 
 
 char *bbStringToUTF8String( BBString *str ){
 char *bbStringToUTF8String( BBString *str ){
 	int i,len=str->length;
 	int i,len=str->length;
-	char *buf=(char*)bbMemAlloc( len*3+1 );
+	char *buf=(char*)bbMemAlloc( len*4+1 );
 	char *q=buf;
 	char *q=buf;
 	unsigned short *p=str->buf;
 	unsigned short *p=str->buf;
 	for( i=0;i<len;++i ){
 	for( i=0;i<len;++i ){
@@ -771,9 +778,18 @@ char *bbStringToUTF8String( BBString *str ){
 			*q++=0xc0|(c>>6);
 			*q++=0xc0|(c>>6);
 			*q++=0x80|(c&0x3f);
 			*q++=0x80|(c&0x3f);
 		}else{
 		}else{
-			*q++=0xe0|(c>>12);
-			*q++=0x80|((c>>6)&0x3f);
-			*q++=0x80|(c&0x3f);
+			if (c < 0xd800 || c > 0xdbff) { 
+				*q++=0xe0|(c>>12);
+				*q++=0x80|((c>>6)&0x3f);
+				*q++=0x80|(c&0x3f);
+			}else{
+				if (i == len - 1) bbExThrowCString( "Invalid UCS-2 character" );
+				c = ((c - 0xd800) << 10) + (*p++ - 0xdc00) + 0x10000;
+				*q++ = 0xf0|(c>>18);
+				*q++ = 0x80|((c>>12)&0x3f);
+				*q++ = 0x80|((c>>6)&0x3f);
+				*q++ = 0x80|((c&0x3f));
+			}
 		}
 		}
 	}
 	}
 	*q=0;
 	*q=0;

+ 41 - 7
textstream.mod/textstream.bmx

@@ -16,12 +16,14 @@ many text processing applications are unable to handle UTF8 and UTF16 files.
 End Rem
 End Rem
 Module BRL.TextStream
 Module BRL.TextStream
 
 
-ModuleInfo "Version: 1.04"
+ModuleInfo "Version: 1.05"
 ModuleInfo "Author: Mark Sibly"
 ModuleInfo "Author: Mark Sibly"
 ModuleInfo "License: zlib/libpng"
 ModuleInfo "License: zlib/libpng"
 ModuleInfo "Copyright: Blitz Research Ltd"
 ModuleInfo "Copyright: Blitz Research Ltd"
 ModuleInfo "Modserver: BRL"
 ModuleInfo "Modserver: BRL"
 
 
+ModuleInfo "History: 1.05"
+ModuleInfo "History: UCS-2 surrogate pairs."
 ModuleInfo "History: 1.04"
 ModuleInfo "History: 1.04"
 ModuleInfo "History: Module is now SuperStrict"
 ModuleInfo "History: Module is now SuperStrict"
 ModuleInfo "History: 1.03 Release"
 ModuleInfo "History: 1.03 Release"
@@ -189,16 +191,34 @@ Type TTextStream Extends TStreamWrapper
 	End Method
 	End Method
 	
 	
 	Method ReadChar:Int()
 	Method ReadChar:Int()
-		Local c:Int=_ReadByte()
+		Local c:Int
+		If _carried Then
+			c = _carried
+			_carried = 0
+			Return c
+		End If
+		
+		c = _ReadByte()
 		Select _encoding
 		Select _encoding
 		Case ETextStreamFormat.LATIN1
 		Case ETextStreamFormat.LATIN1
 			Return c
 			Return c
 		Case ETextStreamFormat.UTF8
 		Case ETextStreamFormat.UTF8
 			If c<128 Return c
 			If c<128 Return c
-			Local d:Int=_ReadByte()
-			If c<224 Return (c-192)*64+(d-128)
-			Local e:Int=_ReadByte()
-			If c<240 Return (c-224)*4096+(d-128)*64+(e-128)
+			Local d:Int=_ReadByte() & $3f
+			If c<224 Return ((c & 31) Shl 6) | d
+			Local e:Int=_ReadByte() & $3f
+			If c<240 Return ((c & 15) Shl 12) | (d Shl 6) | e
+			Local f:Int = _ReadByte() & $3f
+			Local v:Int = ((c & 7) Shl 18) | (d Shl 12) | (e Shl 6) | f
+			If v & $ffff0000 Then
+				v :- $10000
+				d = ((v Shr 10) & $7ffff) + $d800
+				e = (v & $3ff) + $dc00
+				_carried = e
+				Return d
+			Else
+				Return v
+			End If
 		Case ETextStreamFormat.UTF16BE
 		Case ETextStreamFormat.UTF16BE
 			Local d:Int=_ReadByte()
 			Local d:Int=_ReadByte()
 			Return c Shl 8 | d
 			Return c Shl 8 | d
@@ -209,6 +229,16 @@ Type TTextStream Extends TStreamWrapper
 	End Method
 	End Method
 	
 	
 	Method WriteChar( char:Int )
 	Method WriteChar( char:Int )
+		If _carried Then
+			Local c:Int = ((_carried - $d800) Shl 10) + (char - $dc00) + $10000
+			_WriteByte (c Shr 18) | $f0
+			_WriteByte ((c Shr 12) & $3f) | $80
+			_WriteByte ((c Shr 6) & $3f) | $80
+			_WriteByte (c & $3f) | $80
+			_carried = 0
+			Return
+		End If
+	
 		Assert char>=0 And char<=$ffff
 		Assert char>=0 And char<=$ffff
 		Select _encoding
 		Select _encoding
 		Case ETextStreamFormat.LATIN1
 		Case ETextStreamFormat.LATIN1
@@ -219,10 +249,13 @@ Type TTextStream Extends TStreamWrapper
 			Else If char<2048
 			Else If char<2048
 				_WriteByte char/64 | 192
 				_WriteByte char/64 | 192
 				_WriteByte char Mod 64 | 128
 				_WriteByte char Mod 64 | 128
-			Else
+			Else If char < $d800 Or char > $dbff
 				_WriteByte char/4096 | 224
 				_WriteByte char/4096 | 224
 				_WriteByte char/64 Mod 64 | 128
 				_WriteByte char/64 Mod 64 | 128
 				_WriteByte char Mod 64 | 128
 				_WriteByte char Mod 64 | 128
+			Else
+				_carried = char
+				Return
 			EndIf
 			EndIf
 		Case ETextStreamFormat.UTF16BE
 		Case ETextStreamFormat.UTF16BE
 			_WriteByte char Shr 8
 			_WriteByte char Shr 8
@@ -282,6 +315,7 @@ Type TTextStream Extends TStreamWrapper
 	
 	
 	Field _encoding:ETextStreamFormat
 	Field _encoding:ETextStreamFormat
 	Field _bufcount:Int
 	Field _bufcount:Int
+	Field _carried:Int
 	
 	
 End Type
 End Type