Forráskód Böngészése

2007-10-25 Atsushi Enomoto <[email protected]>

	* DecoderReplacementFallbackBuffer.cs : Reset() should also clear the
	  fallback-assigned state. When no fallback is assigned, Remaining
	  is 0 and GetNextChar() just returns '\0'.
	* UnicodeEncoding.cs : handle the throwOnInvalidBytes .ctor argument.
	  Default replacement fallback buffer is now "\uFFFD".
	* UTF8Encoding.cs : couple of replacement buffer size fixes.
	  Default replacement fallback buffer is now "\uFFFD".
	* UTF32Encoding.cs : Default replacement is "\uFFFD" too here.
	  See http://support.microsoft.com/kb/940521/ for this change.

	* UnicodeEncodingTest.cs,
	  UTF8EncodingTest.cs,
	  DecoderReplacementFallbackBufferTest.cs,
	  DecoderReplacementFallbackTest.cs : default replacement buffer fix.
	  Added test for Reset() for replacement buffer.


svn path=/trunk/mcs/; revision=88150
Atsushi Eno 18 éve
szülő
commit
f4071fef2b

+ 11 - 0
mcs/class/corlib/System.Text/ChangeLog

@@ -1,3 +1,14 @@
+2007-10-25  Atsushi Enomoto  <[email protected]>
+
+	* DecoderReplacementFallbackBuffer.cs : Reset() should also reset the
+	  input buffer. When fallback is not assigned, just return '\0'.
+	* UnicodeEncoding.cs : handle throwOnInvalid .ctor argument.
+	  Default replacement fallback buffer is now "\uFFFD".
+	* UTF8Encoding.cs : couple of replacement buffer size fixes.
+	  Default replacement fallback buffer is now "\uFFFD".
+	* UTF32Encoding.cs : Default replacement is "\uFFFD" too here.
+	  See http://support.microsoft.com/kb/940521/ for this change.
+
 2007-08-15  Jb Evain  <[email protected]>
 
 	* StringBuilder: hide non 2.1 AppendFormat on 2.1 so that

+ 4 - 1
mcs/class/corlib/System.Text/DecoderReplacementFallbackBuffer.cs

@@ -53,7 +53,7 @@ namespace System.Text
 		}
 
 		public override int Remaining {
-			get { return replacement.Length - current; }
+			get { return fallback_assigned ? replacement.Length - current : 0; }
 		}
 
 		public override bool Fallback (byte [] bytesUnknown, int index)
@@ -72,6 +72,8 @@ namespace System.Text
 
 		public override char GetNextChar ()
 		{
+			if (!fallback_assigned)
+				return '\0';
 			if (current >= replacement.Length)
 				return char.MinValue;
 			return replacement [current++];
@@ -87,6 +89,7 @@ namespace System.Text
 
 		public override void Reset ()
 		{
+			fallback_assigned = false;
 			current = 0;
 		}
 	}

+ 2 - 2
mcs/class/corlib/System.Text/UTF32Encoding.cs

@@ -69,8 +69,8 @@ public sealed class UTF32Encoding : Encoding
 			SetFallbackInternal (EncoderFallback.ExceptionFallback,
 				DecoderFallback.ExceptionFallback);
 		else
-			SetFallbackInternal (EncoderFallback.ReplacementFallback,
-				DecoderFallback.ReplacementFallback);
+			SetFallbackInternal (new EncoderReplacementFallback ("\uFFFD"),
+				new DecoderReplacementFallback ("\uFFFD"));
 
 		if (bigEndian){
 			body_name = "utf-32BE";

+ 29 - 21
mcs/class/corlib/System.Text/UTF8Encoding.cs

@@ -59,7 +59,7 @@ public class UTF8Encoding : Encoding
 		if (throwOnInvalidBytes)
 			SetFallbackInternal (null, new DecoderExceptionFallback ());
 		else
-			SetFallbackInternal (null, new DecoderReplacementFallback (String.Empty));
+			SetFallbackInternal (null, new DecoderReplacementFallback ("\uFFFD"));
 #else
 		throwOnInvalid = throwOnInvalidBytes;
 #endif
@@ -585,7 +585,7 @@ Char.IsLetterOrDigit (pair);
 				} else {
 					// Invalid UTF-8 start character.
 #if NET_2_0
-					length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
+					length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
 #else
 					if (throwOnInvalid)
 						throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -619,7 +619,7 @@ Char.IsLetterOrDigit (pair);
 							}
 							if (overlong) {
 #if NET_2_0
-								length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
+								length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
 #else
 								if (throwOnInvalid)
 									throw new ArgumentException (_("Overlong"), leftBits.ToString ());
@@ -631,7 +631,7 @@ Char.IsLetterOrDigit (pair);
 							length += 2;
 						} else {
 #if NET_2_0
-							length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
+							length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
 #else
 							if (throwOnInvalid)
 								throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -642,7 +642,7 @@ Char.IsLetterOrDigit (pair);
 				} else {
 					// Invalid UTF-8 sequence: clear and restart.
 #if NET_2_0
-					length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
+					length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
 #else
 					if (throwOnInvalid)
 						throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -657,7 +657,7 @@ Char.IsLetterOrDigit (pair);
 			// We had left-over bytes that didn't make up
 			// a complete UTF-8 character sequence.
 #if NET_2_0
-			length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index);
+			length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
 #else
 			if (throwOnInvalid)
 				throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -670,7 +670,7 @@ Char.IsLetterOrDigit (pair);
 
 #if NET_2_0
 	// for GetCharCount()
-	static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int index)
+	static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
 	{
 		if (buffer == null) {
 			DecoderFallback fb = provider as DecoderFallback;
@@ -681,13 +681,18 @@ Char.IsLetterOrDigit (pair);
 		}
 		if (bufferArg == null)
 			bufferArg = new byte [1];
-		bufferArg [0] = bytes [index];
-		buffer.Fallback (bufferArg, 0);
-		return buffer.Remaining;
+		int ret = 0;
+		for (int i = 0; i < size; i++) {
+			bufferArg [0] = bytes [(int) index + i];
+			buffer.Fallback (bufferArg, 0);
+			ret += buffer.Remaining;
+			buffer.Reset ();
+		}
+		return ret;
 	}
 
 	// for GetChars()
-	static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int byteIndex,
+	static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
 		char* chars, ref int charIndex)
 	{
 		if (buffer == null) {
@@ -699,10 +704,13 @@ Char.IsLetterOrDigit (pair);
 		}
 		if (bufferArg == null)
 			bufferArg = new byte [1];
-		bufferArg [0] = bytes [byteIndex];
-		buffer.Fallback (bufferArg, 0);
-		while (buffer.Remaining > 0)
-			chars [charIndex++] = buffer.GetNextChar ();
+		for (int i = 0; i < size; i++) {
+			bufferArg [0] = bytes [byteIndex + i];
+			buffer.Fallback (bufferArg, 0);
+			while (buffer.Remaining > 0)
+				chars [charIndex++] = buffer.GetNextChar ();
+			buffer.Reset ();
+		}
 	}
 #endif
 
@@ -853,7 +861,7 @@ Char.IsLetterOrDigit (pair);
 				} else {
 					// Invalid UTF-8 start character.
 #if NET_2_0
-					Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
+					Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
 #else
 					if (throwOnInvalid)
 						throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -887,7 +895,7 @@ Char.IsLetterOrDigit (pair);
 							}
 							if (overlong) {
 #if NET_2_0
-								Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
+								Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
 #else
 								if (throwOnInvalid)
 									throw new ArgumentException (_("Overlong"), leftBits.ToString ());
@@ -896,7 +904,7 @@ Char.IsLetterOrDigit (pair);
 							else if ((leftBits & 0xF800) == 0xD800) {
 								// UTF-8 doesn't use surrogate characters
 #if NET_2_0
-								Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
+								Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
 #else
 								if (throwOnInvalid)
 									throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -921,7 +929,7 @@ Char.IsLetterOrDigit (pair);
 								(char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
 						} else {
 #if NET_2_0
-							Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
+							Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
 #else
 							if (throwOnInvalid)
 								throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -932,7 +940,7 @@ Char.IsLetterOrDigit (pair);
 				} else {
 					// Invalid UTF-8 sequence: clear and restart.
 #if NET_2_0
-					Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
+					Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
 #else
 					if (throwOnInvalid)
 						throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
@@ -946,7 +954,7 @@ Char.IsLetterOrDigit (pair);
 			// We had left-over bytes that didn't make up
 			// a complete UTF-8 character sequence.
 #if NET_2_0
-			Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
+			Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
 #else
 			if (throwOnInvalid)
 				throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");

+ 17 - 8
mcs/class/corlib/System.Text/UnicodeEncoding.cs

@@ -58,8 +58,25 @@ public class UnicodeEncoding : Encoding
 		byteOrderMark = true;
 	}
 	public UnicodeEncoding (bool bigEndian, bool byteOrderMark)
+		: this (bigEndian, byteOrderMark, false)
+	{
+	}
+
+#if NET_2_0
+	public
+#endif
+	UnicodeEncoding (bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
 		: base ((bigEndian ? BIG_UNICODE_CODE_PAGE : UNICODE_CODE_PAGE))
 	{
+#if NET_2_0
+		if (throwOnInvalidBytes)
+			SetFallbackInternal (null, new DecoderExceptionFallback ());
+		else
+			SetFallbackInternal (null, new DecoderReplacementFallback ("\uFFFD"));
+#else
+		throwOnInvalid = throwOnInvalidBytes;
+#endif
+
 		this.bigEndian = bigEndian;
 		this.byteOrderMark = byteOrderMark;
 
@@ -82,14 +99,6 @@ public class UnicodeEncoding : Encoding
 		windows_code_page = UNICODE_CODE_PAGE;
 	}
 
-#if NET_2_0
-	[MonoTODO ("Implement throwOnInvalidBytes")]
-	public UnicodeEncoding (bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
-		: this (bigEndian, byteOrderMark)
-	{
-	}
-#endif
-
 	// Get the number of bytes needed to encode a character buffer.
 	public override int GetByteCount (char[] chars, int index, int count)
 	{

+ 8 - 0
mcs/class/corlib/Test/System.Text/ChangeLog

@@ -1,3 +1,11 @@
+2007-10-25  Atsushi Enomoto  <[email protected]>
+
+	* UnicodeEncodingTest.cs,
+	  UTF8EncodingTest.cs,
+	  DecoderReplacementFallbackBufferTest.cs,
+	  DecoderReplacementFallbackTest.cs : default replacement buffer fix.
+	  Added test for Reset() for replacement buffer.
+
 2007-10-16  Gert Driesen  <[email protected]>
 
 	* TestEncoding.cs: Also make class available on 1.0 profile.

+ 25 - 1
mcs/class/corlib/Test/System.Text/DecoderReplacementFallbackBufferTest.cs

@@ -40,9 +40,19 @@ namespace MonoTests.System.Text
 		}
 
 		[Test]
-		public void FallbackEmptyForEncodingUTF8 ()
+		public void FallbackDefaultEncodingUTF8 ()
 		{
 			Buffer b = Encoding.UTF8.DecoderFallback.CreateFallbackBuffer () as Buffer;
+			Assert.IsTrue (b.Fallback (new byte [] {}, 0), "#1");
+			Assert.IsFalse (b.MovePrevious (), "#2");
+			Assert.AreEqual (1, b.Remaining, "#3");
+			Assert.AreEqual ('\uFFFD', b.GetNextChar (), "#4");
+		}
+
+		[Test]
+		public void FallbackEmptyForEncodingUTF8 ()
+		{
+			Buffer b = new DecoderReplacementFallbackBuffer (new DecoderReplacementFallback (String.Empty));
 			Assert.IsFalse (b.Fallback (new byte [] {}, 0), "#1");
 			Assert.IsFalse (b.MovePrevious (), "#2");
 			Assert.AreEqual (0, b.Remaining, "#3");
@@ -85,6 +95,20 @@ namespace MonoTests.System.Text
 			Assert.IsFalse (b.MovePrevious (), "#8");
 //			Assert.AreEqual ('?', b.GetNextChar (), "#9");
 		}
+
+		[Test]
+		public void Reset ()
+		{
+			DecoderReplacementFallback f = new DecoderReplacementFallback ("X");
+			DecoderReplacementFallbackBuffer b = new DecoderReplacementFallbackBuffer (f);
+			b.Fallback (new byte [0], 0);
+			Assert.AreEqual (1, b.Remaining, "#1");
+			b.Reset ();
+			Assert.AreEqual (0, b.Remaining, "#2");
+			b.Fallback (new byte [0], 0); // do not raise an error
+			b.Reset ();
+			Assert.AreEqual (0, (int) b.GetNextChar (), "#3");
+		}
 	}
 }
 

+ 4 - 2
mcs/class/corlib/Test/System.Text/DecoderReplacementFallbackTest.cs

@@ -33,8 +33,10 @@ namespace MonoTests.System.Text
 
 			f = Encoding.UTF8.DecoderFallback as DecoderReplacementFallback;
 			Assert.IsNotNull (f, "#5");
-			Assert.AreEqual (String.Empty, f.DefaultString, "#6");
-			Assert.AreEqual (0, f.MaxCharCount, "#7");
+			// This behavior was introduced as
+			// http://support.microsoft.com/kb/940521/
+			Assert.AreEqual ("\uFFFD", f.DefaultString, "#6");
+			Assert.AreEqual (1, f.MaxCharCount, "#7");
 
 			// after beta2 this test became invalid.
 			//f = new MyEncoding ().DecoderFallback as DecoderReplacementFallback;

+ 22 - 7
mcs/class/corlib/Test/System.Text/UTF8EncodingTest.cs

@@ -155,16 +155,19 @@ namespace MonoTests.System.Text
 			UTF8Encoding u = new UTF8Encoding (true, false);
 
 			byte[] data = new byte [] { 0xC0, 0xAF };
+			AssertEquals ("#A0", 2, u.GetCharCount (data));
 			string s = u.GetString (data);
-			AssertEquals ("#A1", 0, s.Length);
+			AssertEquals ("#A1", "\uFFFD\uFFFD", s);
 
 			data = new byte [] { 0x30, 0x31, 0xC0, 0xAF, 0x30, 0x32 };
 			s = u.GetString (data);
-			AssertEquals ("#B1", 4, s.Length);
+			AssertEquals ("#B1", 6, s.Length);
 			AssertEquals ("#B2", 0x30, (int) s [0]);
 			AssertEquals ("#B3", 0x31, (int) s [1]);
-			AssertEquals ("#B4", 0x30, (int) s [2]);
-			AssertEquals ("#B5", 0x32, (int) s [3]);
+			AssertEquals ("#B4", 0xFFFD, (int) s [2]);
+			AssertEquals ("#B5", 0xFFFD, (int) s [3]);
+			AssertEquals ("#B6", 0x30, (int) s [4]);
+			AssertEquals ("#B7", 0x32, (int) s [5]);
 		}
 
 		// UTF8 decoding tests from http://www.cl.cam.ac.uk/~mgk25/
@@ -1078,16 +1081,28 @@ namespace MonoTests.System.Text
 		public void DecoderFallbackSimple ()
 		{
 			UTF8Encoding e = new UTF8Encoding (false, false);
-			AssertType.AreEqual (0, e.GetDecoder ().GetCharCount (
+			AssertType.AreEqual (1, e.GetDecoder ().GetCharCount (
 					new byte [] {(byte) 183}, 0, 1),
 					"#1");
-			AssertType.AreEqual (0, e.GetDecoder().GetChars (
+			AssertType.AreEqual (1, e.GetDecoder().GetChars (
 					new byte [] {(byte) 183}, 0, 1,
 					new char [100], 0),
 					"#2");
-			AssertType.AreEqual (0, e.GetString (new byte [] {(byte) 183}).Length,
+			AssertType.AreEqual (1, e.GetString (new byte [] {(byte) 183}).Length,
 					"#3");
 		}
+
+		[Test]
+		public void FallbackDefaultEncodingUTF8 ()
+		{
+			DecoderReplacementFallbackBuffer b =
+				Encoding.UTF8.DecoderFallback.CreateFallbackBuffer ()
+				as DecoderReplacementFallbackBuffer;
+			AssertType.IsTrue (b.Fallback (new byte [] {}, 0), "#1");
+			AssertType.IsFalse (b.MovePrevious (), "#2");
+			AssertType.AreEqual (1, b.Remaining, "#3");
+			AssertType.AreEqual ('\uFFFD', b.GetNextChar (), "#4");
+		}
 #endif
 	}
 }

+ 2 - 1
mcs/class/corlib/Test/System.Text/UnicodeEncodingTest.cs

@@ -172,7 +172,8 @@ namespace MonoTests.System.Text
                         UnicodeEncoding UnicodeEnc = new UnicodeEncoding ();
 #if NET_2_0
                         // where is this extra 1 coming from?
-                        Assertion.AssertEquals ("UTF #1", 25, UnicodeEnc.GetMaxCharCount(51));
+                        Assertion.AssertEquals ("UTF #1", 26, UnicodeEnc.GetMaxCharCount(50));
+                        Assertion.AssertEquals ("UTF #2", 27, UnicodeEnc.GetMaxCharCount(51));
 #else
                         Assertion.AssertEquals ("UTF #1", 25, UnicodeEnc.GetMaxCharCount(50));
 #endif