Browse Source

EncoderFallback support in UTF8Encoding. Fixed bug #565129 and regression in d961246.

Patch mostly by JB Evain.
Atsushi Eno 15 năm trước cách đây
mục cha
commit
f4aff310cf

+ 76 - 35
mcs/class/corlib/System.Text/UTF8Encoding.cs

@@ -31,7 +31,6 @@ using System.Runtime.InteropServices;
 
 [Serializable]
 [MonoLimitation ("Serialization format not compatible with .NET")]
-[MonoLimitation ("EncoderFallback is not handled")]
 [ComVisible (true)]
 public class UTF8Encoding : Encoding
 {
@@ -51,9 +50,9 @@ public class UTF8Encoding : Encoding
 	{
 		emitIdentifier = encoderShouldEmitUTF8Identifier;
 		if (throwOnInvalidBytes)
-			SetFallbackInternal (null, DecoderFallback.ExceptionFallback);
+			SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
 		else
-			SetFallbackInternal (null, DecoderFallback.StandardSafeFallback);
+			SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
 
 		web_name = body_name = header_name = "utf-8";
 		encoding_name = "Unicode (UTF-8)";
@@ -68,7 +67,7 @@ public class UTF8Encoding : Encoding
 
 	// Internal version of "GetByteCount" which can handle a rolling
 	// state between multiple calls to this method.
-	private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
+	private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
 	{
 		// Validate the parameters.
 		if (chars == null) {
@@ -92,15 +91,17 @@ public class UTF8Encoding : Encoding
 
 		unsafe {
 			fixed (char* cptr = chars) {
-				return InternalGetByteCount (cptr + index, count, ref leftOver, flush);
+				return InternalGetByteCount (cptr + index, count, fallback, ref leftOver, flush);
 			}
 		}
 	}
 
-	private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
+	private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
 	{
 		int length = 0;
 		char* end = chars + count;
+		char* start = chars;
+		EncoderFallbackBuffer buffer = null;
 		while (chars < end) {
 			if (leftOver == 0) {
 				for (; chars < end; chars++) {
@@ -126,7 +127,12 @@ public class UTF8Encoding : Encoding
 						// leading surrogate. In NET_2_0 it
 						// uses fallback. In NET_1_1 we output
 						// wrong surrogate.
-						length += 3;
+						char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
+						fixed (char *fb_chars = fallback_chars) {
+							char dummy = '\0';
+							length += InternalGetByteCount (fb_chars, fallback_chars.Length, fallback, ref dummy, true);
+						}
+
 						leftOver = '\0';
 					}
 				}
@@ -141,7 +147,11 @@ public class UTF8Encoding : Encoding
 					// invalid, but we have to do something.
 					// We write out the surrogate start and then
 					// re-visit the current character again.
-					length += 3;
+					char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
+					fixed (char *fb_chars = fallback_chars) {
+						char dummy = '\0';
+						length += InternalGetByteCount (fb_chars, fallback_chars.Length, fallback, ref dummy, true);
+					}
 				}
 				leftOver = '\0';
 			}
@@ -156,11 +166,27 @@ public class UTF8Encoding : Encoding
 		return length;
 	}
 
+	unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
+	{
+		if (buffer == null)
+			buffer = fallback.CreateFallbackBuffer ();
+
+		buffer.Fallback (*chars, (int) (chars - start));
+
+		char [] fallback_chars = new char [buffer.Remaining];
+		for (int i = 0; i < fallback_chars.Length; i++)
+			fallback_chars [i] = buffer.GetNextChar ();
+
+		buffer.Reset ();
+
+		return fallback_chars;
+	}
+
 	// Get the number of bytes needed to encode a character buffer.
 	public override int GetByteCount (char[] chars, int index, int count)
 	{
 		char dummy = '\0';
-		return InternalGetByteCount (chars, index, count, ref dummy, true);
+		return InternalGetByteCount (chars, index, count, EncoderFallback, ref dummy, true);
 	}
 
 
@@ -173,7 +199,7 @@ public class UTF8Encoding : Encoding
 		if (count == 0)
 			return 0;
 		char dummy = '\0';
-		return InternalGetByteCount (chars, count, ref dummy, true);
+		return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
 	}
 
 	#endregion
@@ -184,8 +210,9 @@ public class UTF8Encoding : Encoding
 	// state between multiple calls to this method.
 	private static int InternalGetBytes (char[] chars, int charIndex,
 					     int charCount, byte[] bytes,
-					     int byteIndex, ref char leftOver,
-					     bool flush)
+					     int byteIndex,
+						 EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
+						 ref char leftOver, bool flush)
 	{
 		// Validate the parameters.
 		if (chars == null) {
@@ -219,20 +246,23 @@ public class UTF8Encoding : Encoding
 				if (bytes.Length == byteIndex)
 					return InternalGetBytes (
 						cptr + charIndex, charCount, 
-						null, 0, ref leftOver, flush);
+						null, 0, fallback, ref buffer, ref leftOver, flush);
 				fixed (byte *bptr = bytes) {
 					return InternalGetBytes (
 						cptr + charIndex, charCount,
 						bptr + byteIndex, bytes.Length - byteIndex,
+						fallback, ref buffer,
 						ref leftOver, flush);
 				}
 			}
 		}
 	}
 
-	private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, ref char leftOver, bool flush)
+	private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
 	{
 		char* end = chars + count;
+		char* start = chars;
+		byte* start_bytes = bytes;
 		byte* end_bytes = bytes + bcount;
 		while (chars < end) {
 			if (leftOver == 0) {
@@ -265,12 +295,14 @@ public class UTF8Encoding : Encoding
 						// leading surrogate. In NET_2_0 it
 						// uses fallback. In NET_1_1 we output
 						// wrong surrogate.
-						if (bytes + 2 >= end_bytes)
+						char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer); 
+						char dummy = '\0';
+						if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
 							goto fail_no_space;
-						bytes [0] = (byte) (0xE0 | (ch >> 12));
-						bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
-						bytes [2] = (byte) (0x80 | (ch & 0x3F));
-						bytes += 3;
+						fixed (char *fb_chars = fallback_chars) {
+							bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
+						}
+
 						leftOver = '\0';
 					}
 				}
@@ -292,13 +324,15 @@ public class UTF8Encoding : Encoding
 					// invalid, but we have to do something.
 					// We write out the surrogate start and then
 					// re-visit the current character again.
-					int ch = leftOver;
-					if (bytes + 2 >= end_bytes)
+					char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer); 
+					char dummy = '\0';
+					if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
 						goto fail_no_space;
-					bytes [0] = (byte) (0xE0 | (ch >> 12));
-					bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
-					bytes [2] = (byte) (0x80 | (ch & 0x3F));
-					bytes += 3;
+					fixed (char *fb_chars = fallback_chars) {
+						InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
+					}
+
+					leftOver = '\0';
 				}
 				leftOver = '\0';
 			}
@@ -328,7 +362,8 @@ fail_no_space:
 								 byte[] bytes, int byteIndex)
 	{
 		char leftOver = '\0';
-		return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
+		EncoderFallbackBuffer buffer = null;
+		return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, EncoderFallback, ref buffer, ref leftOver, true);
 	}
 
 	// Convenience wrappers for "GetBytes".
@@ -358,14 +393,16 @@ fail_no_space:
 		unsafe {
 			fixed (char* cptr = s) {
 				char dummy = '\0';
+				EncoderFallbackBuffer buffer = null;
 				if (bytes.Length == byteIndex)
 					return InternalGetBytes (
 						cptr + charIndex, charCount,
-						null, 0, ref dummy, true);
+						null, 0, EncoderFallback, ref buffer, ref dummy, true);
 				fixed (byte *bptr = bytes) {
 					return InternalGetBytes (
 						cptr + charIndex, charCount,
 						bptr + byteIndex, bytes.Length - byteIndex,
+						EncoderFallback, ref buffer,
 						ref dummy, true);
 				}
 			}
@@ -389,10 +426,11 @@ fail_no_space:
 			return 0;
 
 		char dummy = '\0';
+		EncoderFallbackBuffer buffer = null;
 		if (byteCount == 0)
-			return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
+			return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
 		else
-			return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
+			return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
 	}
 
 	#endregion
@@ -836,7 +874,7 @@ fail_no_space:
 	// Get a UTF8-specific encoder that is attached to this instance.
 	public override Encoder GetEncoder ()
 	{
-		return new UTF8Encoder (emitIdentifier);
+		return new UTF8Encoder (EncoderFallback, emitIdentifier);
 	}
 
 	// Get the UTF8 preamble.
@@ -924,8 +962,9 @@ fail_no_space:
 		private char leftOverForConv;
 
 		// Constructor.
-		public UTF8Encoder (bool emitIdentifier)
+		public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
 		{
+			Fallback = fallback;
 //			this.emitIdentifier = emitIdentifier;
 			leftOverForCount = '\0';
 			leftOverForConv = '\0';
@@ -935,27 +974,29 @@ fail_no_space:
 		public override int GetByteCount (char[] chars, int index,
 					 int count, bool flush)
 		{
-			return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
+			return InternalGetByteCount (chars, index, count, Fallback, ref leftOverForCount, flush);
 		}
 		public override int GetBytes (char[] chars, int charIndex,
 					 int charCount, byte[] bytes, int byteIndex, bool flush)
 		{
 			int result;
-			result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
+			EncoderFallbackBuffer buffer = null;
+			result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, Fallback, ref buffer, ref leftOverForConv, flush);
 //			emitIdentifier = false;
 			return result;
 		}
 
 		public unsafe override int GetByteCount (char* chars, int count, bool flush)
 		{
-			return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
+			return InternalGetByteCount (chars, count, Fallback, ref leftOverForCount, flush);
 		}
 
 		public unsafe override int GetBytes (char* chars, int charCount,
 			byte* bytes, int byteCount, bool flush)
 		{
 			int result;
-			result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
+			EncoderFallbackBuffer buffer = null;
+			result = InternalGetBytes (chars, charCount, bytes, byteCount, Fallback, ref buffer, ref leftOverForConv, flush);
 //			emitIdentifier = false;
 			return result;
 		}

+ 28 - 0
mcs/class/corlib/Test/System.Text/UTF8EncodingTest.cs

@@ -1096,6 +1096,34 @@ namespace MonoTests.System.Text
 			} catch (ArgumentException) {
 			}
 		}
+		
+		[Test] // bug #565129
+		public void SufficientByteArray2 ()
+		{
+			var u = Encoding.UTF8;
+			Assert.AreEqual (3, u.GetByteCount ("\uFFFD"), "#1-1");
+			Assert.AreEqual (3, u.GetByteCount ("\uD800"), "#1-2");
+			Assert.AreEqual (3, u.GetByteCount ("\uDC00"), "#1-3");
+			Assert.AreEqual (4, u.GetByteCount ("\uD800\uDC00"), "#1-4");
+			byte [] bytes = new byte [10];
+			Assert.AreEqual (3, u.GetBytes ("\uDC00", 0, 1, bytes, 0), "#1-5"); // was bogus
+
+			Assert.AreEqual (3, u.GetBytes ("\uFFFD").Length, "#2-1");
+			Assert.AreEqual (3, u.GetBytes ("\uD800").Length, "#2-2");
+			Assert.AreEqual (3, u.GetBytes ("\uDC00").Length, "#2-3");
+			Assert.AreEqual (4, u.GetBytes ("\uD800\uDC00").Length, "#2-4");
+
+			for (char c = char.MinValue; c < char.MaxValue; c++) {
+				byte [] bIn;
+				bIn = u.GetBytes (c.ToString ());
+			}
+
+			try {
+				new UTF8Encoding (false, true).GetBytes (new char [] {'\uDF45', '\uD808'}, 0, 2);
+				Assert.Fail ("EncoderFallbackException is expected");
+			} catch (EncoderFallbackException) {
+			}
+		}
 
 #if NET_2_0
 		[Test] // bug #77550