%!s(int64=20) %!d(string=hai) anos · 90f44f360a
--- a/mcs/class/corlib/System.Text/ChangeLog
+++ b/mcs/class/corlib/System.Text/ChangeLog
@@ -1,3 +1,11 @@
 
				+2006-02-02  Atsushi Enomoto  <[email protected]>
			
 
				+
			
 
				+	* UTF8Encoding.cs :
			
 
				+	  actually leftover bits needs more careful handleding. So it's
			
 
				+	  better to be rather close to the original GetBytes().
			
 
				+	  Also changed leftOver handling so that it will be able to support
			
 
				+	  fallback.
			
 
				+
			
 
				 2006-02-02  Atsushi Enomoto  <[email protected]>
			
 
				 
			
 
				 	* UTF8Encoding.cs :
			
--- a/mcs/class/corlib/System.Text/UTF8Encoding.cs
+++ b/mcs/class/corlib/System.Text/UTF8Encoding.cs
@@ -74,7 +74,7 @@ public class UTF8Encoding : Encoding
 
				 
			
 
				 	// Internal version of "GetByteCount" which can handle a rolling
			
 
				 	// state between multiple calls to this method.
			
 
				-	private static int InternalGetByteCount (char[] chars, int index, int count, ref uint leftOver, bool flush)
			
 
				+	private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
			
 
				 	{
			
 
				 		// Validate the parameters.
			
 
				 		if (chars == null) {
			
@@ -88,9 +88,9 @@ public class UTF8Encoding : Encoding
 
				 		}
			
 
				 
			
 
				 		if (index == chars.Length) {
			
 
				-			if (flush && leftOver != 0) {
			
 
				+			if (flush && leftOver != '\0') {
			
 
				 				// Flush the left-over surrogate pair start.
			
 
				-				leftOver = 0;
			
 
				+				leftOver = '\0';
			
 
				 				return 3;
			
 
				 			}
			
 
				 			return 0;
			
@@ -104,14 +104,14 @@ public class UTF8Encoding : Encoding
 
				 	}
			
 
				 
			
 
				 
			
 
				-	private unsafe static int InternalGetByteCount (char* chars, int count, ref uint leftOver, bool flush)
			
 
				+	private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
			
 
				 	{
			
 
				 		int index = 0;
			
 
				 
			
 
				 		// Determine the lengths of all characters.
			
 
				 		char ch;
			
 
				 		int length = 0;
			
 
				-		uint pair = leftOver;
			
 
				+		char pair = leftOver;
			
 
				 		while (count > 0) {
			
 
				 			ch = chars[index];
			
 
				 			if (pair == 0) {
			
@@ -121,14 +121,23 @@ public class UTF8Encoding : Encoding
 
				 					length += 2;
			
 
				 				} else if (ch >= '\uD800' && ch <= '\uDBFF') {
			
 
				 					// This is the start of a surrogate pair.
			
 
				-					pair = (uint)ch;
			
 
				+					pair = ch;
			
 
				 				} else {
			
 
				 					length += 3;
			
 
				 				}
			
 
				 			} else if (ch >= '\uDC00' && ch <= '\uDFFF') {
			
 
				-				// We have a surrogate pair.
			
 
				-				length += 4;
			
 
				-				pair = 0;
			
 
				+				if (pair != 0) {
			
 
				+					// We have a surrogate pair.
			
 
				+					length += 4;
			
 
				+					pair = '\0';
			
 
				+				} else {
			
 
				+					// We have a surrogate tail without 
			
 
				+					// leading surrogate. In NET_2_0 it
			
 
				+					// uses fallback. In NET_1_1 we output
			
 
				+					// wrong surrogate.
			
 
				+					length += 3;
			
 
				+					pair = '\0';
			
 
				+				}
			
 
				 			} else {
			
 
				 				// We have a surrogate start followed by a
			
 
				 				// regular character.  Technically, this is
			
@@ -136,18 +145,20 @@ public class UTF8Encoding : Encoding
 
				 				// We write out the surrogate start and then
			
 
				 				// re-visit the current character again.
			
 
				 				length += 3;
			
 
				-				pair = 0;
			
 
				+				pair = '\0';
			
 
				 				continue;
			
 
				 			}
			
 
				 			++index;
			
 
				 			--count;
			
 
				 		}
			
 
				-		if (flush && pair != 0) {
			
 
				-			// Flush the left-over surrogate pair start.
			
 
				-			length += 3;
			
 
				+		if (flush) {
			
 
				+			if (pair != '\0')
			
 
				+				// Flush the left-over surrogate pair start.
			
 
				+				length += 3;
			
 
				+			leftOver = '\0';
			
 
				 		}
			
 
				-
			
 
				-		leftOver = pair;
			
 
				+		else
			
 
				+			leftOver = pair;
			
 
				 
			
 
				 		// Return the final length to the caller.
			
 
				 		return length;
			
@@ -156,7 +167,7 @@ public class UTF8Encoding : Encoding
 
				 	// Get the number of bytes needed to encode a character buffer.
			
 
				 	public override int GetByteCount (char[] chars, int index, int count)
			
 
				 	{
			
 
				-		uint dummy = 0;
			
 
				+		char dummy = '\0';
			
 
				 		return InternalGetByteCount (chars, index, count, ref dummy, true);
			
 
				 	}
			
 
				 
			
@@ -170,7 +181,7 @@ public class UTF8Encoding : Encoding
 
				 
			
 
				 		unsafe {
			
 
				 			fixed (char* cptr = s) {
			
 
				-				uint dummy = 0;
			
 
				+				char dummy = '\0';
			
 
				 				return InternalGetByteCount (cptr, s.Length, ref dummy, true);
			
 
				 			}
			
 
				 		}
			
@@ -184,7 +195,7 @@ public class UTF8Encoding : Encoding
 
				 	// state between multiple calls to this method.
			
 
				 	private static int InternalGetBytes (char[] chars, int charIndex,
			
 
				 					     int charCount, byte[] bytes,
			
 
				-					     int byteIndex, ref uint leftOver,
			
 
				+					     int byteIndex, ref char leftOver,
			
 
				 					     bool flush)
			
 
				 	{
			
 
				 		// Validate the parameters.
			
@@ -210,7 +221,7 @@ public class UTF8Encoding : Encoding
 
				 				bytes [byteIndex++] = 0xEF;
			
 
				 				bytes [byteIndex++] = 0xBB;
			
 
				 				bytes [byteIndex++] = 0xBF;
			
 
				-				leftOver = 0;
			
 
				+				leftOver = '\0';
			
 
				 				return 3;
			
 
				 			}
			
 
				 			return 0;
			
@@ -230,7 +241,7 @@ public class UTF8Encoding : Encoding
 
				 
			
 
				 	private unsafe static int InternalGetBytes (char* chars, int charCount,
			
 
				 					     byte* bytes, int byteCount,
			
 
				-					     ref uint leftOver, bool flush)
			
 
				+					     ref char leftOver, bool flush)
			
 
				 	{
			
 
				 		int charIndex = 0;
			
 
				 		int byteIndex = 0;
			
@@ -239,77 +250,113 @@ public class UTF8Encoding : Encoding
 
				 		// Convert the characters into bytes.
			
 
				 		char ch;
			
 
				 		int length = byteCount;
			
 
				-		uint pair = leftOver;
			
 
				+		char pair = leftOver;
			
 
				 		int posn = byteIndex;
			
 
				+		int code = 0;
			
 
				 
			
 
				 		while (charCount > 0) {
			
 
				 			// Fetch the next UTF-16 character pair value.
			
 
				-			ch = chars [charIndex++];
			
 
				-			if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
			
 
				-				// This may be the start of a surrogate pair.
			
 
				-				pair = (uint) chars [charIndex];
			
 
				-				if (pair >= 0xDC00 && pair <= 0xDFFF) {
			
 
				-					pair = pair - 0xDC00 +
			
 
				-						(((uint) ch - 0xD800) << 10) +
			
 
				-						0x10000;
			
 
				+			ch = chars [charIndex];
			
 
				+			if (pair == '\0') {
			
 
				+				if (ch < '\uD800' || ch >= '\uE000')
			
 
				+					code = ch;
			
 
				+				else if (ch < '\uDC00') {
			
 
				+					// surrogate start
			
 
				+					pair = ch;
			
 
				 					++charIndex;
			
 
				 					--charCount;
			
 
				-				} else {
			
 
				-					pair = (uint) ch;
			
 
				+					continue;
			
 
				+				} else { // ch <= '\uDFFF'
			
 
				+					// We have a surrogate tail without leading 
			
 
				+					// surrogate. In NET_2_0 it uses fallback.
			
 
				+					// In NET_1_1 we output wrong surrogate.
			
 
				+					if ((posn + 3) > length) {
			
 
				+						throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				+					}
			
 
				+					bytes [posn++] = (byte) (0xE0 | (ch >> 12));
			
 
				+					bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
			
 
				+					bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
			
 
				+					++charIndex;
			
 
				+					--charCount;
			
 
				+					continue;
			
 
				 				}
			
 
				 			} else {
			
 
				-				pair = (uint) ch;
			
 
				+				if ('\uDC00' <= ch && ch <= '\uDFFF')
			
 
				+					code =  0x10000 + (int) ch - 0xDC00 +
			
 
				+						(((int) pair - 0xD800) << 10);
			
 
				+				else {
			
 
				+					// We have a surrogate start followed by a
			
 
				+					// regular character.  Technically, this is
			
 
				+					// invalid, but we have to do something.
			
 
				+					// We write out the surrogate start and then
			
 
				+					// re-visit the current character again.
			
 
				+					if ((posn + 3) > length) {
			
 
				+						throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				+					}
			
 
				+					bytes [posn++] = (byte) (0xE0 | (pair >> 12));
			
 
				+					bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
			
 
				+					bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
			
 
				+					pair = '\0';
			
 
				+					continue;
			
 
				+				}
			
 
				+				pair = '\0';
			
 
				 			}
			
 
				+			++charIndex;
			
 
				 			--charCount;
			
 
				 
			
 
				 			// Encode the character pair value.
			
 
				-			if (pair < 0x0080) {
			
 
				+			if (code < 0x0080) {
			
 
				 				if (posn >= length)
			
 
				 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				-				bytes [posn++] = (byte)pair;
			
 
				-			} else if (pair < 0x0800) {
			
 
				+				bytes [posn++] = (byte)code;
			
 
				+			} else if (code < 0x0800) {
			
 
				 				if ((posn + 2) > length)
			
 
				 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				-				bytes [posn++] = (byte) (0xC0 | (pair >> 6));
			
 
				-				bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
			
 
				-			} else if (pair < 0x10000) {
			
 
				+				bytes [posn++] = (byte) (0xC0 | (code >> 6));
			
 
				+				bytes [posn++] = (byte) (0x80 | (code & 0x3F));
			
 
				+			} else if (code < 0x10000) {
			
 
				 				if ((posn + 3) > length)
			
 
				 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				-				bytes [posn++] = (byte) (0xE0 | (pair >> 12));
			
 
				-				bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
			
 
				-				bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
			
 
				+				bytes [posn++] = (byte) (0xE0 | (code >> 12));
			
 
				+				bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
			
 
				+				bytes [posn++] = (byte) (0x80 | (code & 0x3F));
			
 
				 			} else {
			
 
				 				if ((posn + 4) > length)
			
 
				 					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				-				bytes [posn++] = (byte) (0xF0 | (pair >> 18));
			
 
				-				bytes [posn++] = (byte) (0x80 | ((pair >> 12) & 0x3F));
			
 
				-				bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
			
 
				-				bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
			
 
				+				bytes [posn++] = (byte) (0xF0 | (code >> 18));
			
 
				+				bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
			
 
				+				bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
			
 
				+				bytes [posn++] = (byte) (0x80 | (code & 0x3F));
			
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		if (flush && pair >= 0xD800 && pair < 0xDC00) {
			
 
				-			// Flush the left-over surrogate pair start.
			
 
				-			if ((posn + 3) > length) {
			
 
				-				throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				+		if (flush) {
			
 
				+			if (pair != '\0') {
			
 
				+				// Flush the left-over incomplete surrogate.
			
 
				+				if ((posn + 3) > length) {
			
 
				+					throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
			
 
				+				}
			
 
				+				bytes [posn++] = (byte) (0xE0 | (pair >> 12));
			
 
				+				bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
			
 
				+				bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
			
 
				 			}
			
 
				-			bytes [posn++] = (byte) (0xE0 | (pair >> 12));
			
 
				-			bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
			
 
				-			bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
			
 
				-			leftOver = 0;
			
 
				+			leftOver = '\0';
			
 
				 		}
			
 
				-		else
			
 
				-			leftOver = pair;
			
 
				 
			
 
				 		// Return the final count to the caller.
			
 
				 		return posn - byteIndex;
			
 
				 	}
			
 
				 
			
 
				+	private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
			
 
				+	{
			
 
				+		throw new NotImplementedException ();
			
 
				+	}
			
 
				+
			
 
				 	// Get the bytes that result from encoding a character buffer.
			
 
				 	public override int GetBytes (char[] chars, int charIndex, int charCount,
			
 
				 								 byte[] bytes, int byteIndex)
			
 
				 	{
			
 
				-		uint leftOver = 0;
			
 
				+		char leftOver = '\0';
			
 
				 		return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
			
 
				 	}
			
 
				 
			
@@ -340,7 +387,7 @@ public class UTF8Encoding : Encoding
 
				 		unsafe {
			
 
				 			fixed (char* cptr = s) {
			
 
				 				fixed (byte *bptr = bytes) {
			
 
				-					uint dummy = 0;
			
 
				+					char dummy = '\0';
			
 
				 					return InternalGetBytes (
			
 
				 						cptr + charIndex, charCount,
			
 
				 						bptr + byteIndex, bytes.Length - byteIndex,
			
@@ -920,15 +967,15 @@ public class UTF8Encoding : Encoding
 
				 	private class UTF8Encoder : Encoder
			
 
				 	{
			
 
				 		private bool emitIdentifier;
			
 
				-		private uint leftOverForCount;
			
 
				-		private uint leftOverForConv;
			
 
				+		private char leftOverForCount;
			
 
				+		private char leftOverForConv;
			
 
				 
			
 
				 		// Constructor.
			
 
				 		public UTF8Encoder (bool emitIdentifier)
			
 
				 		{
			
 
				 			this.emitIdentifier = emitIdentifier;
			
 
				-			leftOverForCount = 0;
			
 
				-			leftOverForConv = 0;
			
 
				+			leftOverForCount = '\0';
			
 
				+			leftOverForConv = '\0';
			
 
				 		}
			
 
				 
			
 
				 		// Override inherited methods.