UTF32Encoding.cs 50 KB


  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. //
  5. // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  6. //
  7. using System.Diagnostics;
  8. using System.Runtime.InteropServices;
  9. namespace System.Text
  10. {
  11. // Encodes text into and out of UTF-32. UTF-32 is a way of writing
  12. // Unicode characters with a single storage unit (32 bits) per character.
  13. //
  14. // The UTF-32 byte order mark is simply the Unicode byte order mark
  15. // (0x00FEFF) written in UTF-32 (0x0000FEFF or 0xFFFE0000). The byte order
  16. // mark is used mostly to distinguish UTF-32 text from other encodings, and doesn't
  17. // switch the byte orderings.
  18. public sealed class UTF32Encoding : Encoding
  19. {
  20. /*
  21. words bits UTF-32 representation
  22. ----- ---- -----------------------------------
  23. 1 16 00000000 00000000 xxxxxxxx xxxxxxxx
  24. 2 21 00000000 000xxxxx hhhhhhll llllllll
  25. ----- ---- -----------------------------------
  26. Surrogate:
  27. Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  28. */
  29. // Used by Encoding.UTF32/BigEndianUTF32 for lazy initialization
  30. // The initialization code will not be run until a static member of the class is referenced
  internal static readonly UTF32Encoding s_default = new UTF32Encoding(bigEndian: false, byteOrderMark: true);
  internal static readonly UTF32Encoding s_bigEndianDefault = new UTF32Encoding(bigEndian: true, byteOrderMark: true);

  // Byte order selected at construction; when true the most significant byte is written first.
  private readonly bool _bigEndian;

  // Holds the byteOrderMark constructor argument.
  private readonly bool _emitUTF32ByteOrderMark;

  // Holds the throwOnInvalidCharacters constructor argument; when set, SetDefaultFallbacks
  // installs the exception fallbacks instead of the U+FFFD replacement fallbacks.
  private readonly bool _isThrowException;
  // Creates a little-endian UTF-32 encoding with the byte-order-mark option enabled.
  public UTF32Encoding()
      : this(bigEndian: false, byteOrderMark: true)
  {
  }
  // Creates a UTF-32 encoding with the requested byte order and byte-order-mark setting.
  // The base class receives code page 12001 for big-endian and 12000 for little-endian.
  public UTF32Encoding(bool bigEndian, bool byteOrderMark)
      : base(bigEndian ? 12001 : 12000)
  {
      _emitUTF32ByteOrderMark = byteOrderMark;
      _bigEndian = bigEndian;
  }
  // Creates a UTF-32 encoding and additionally selects whether invalid input throws
  // (throwOnInvalidCharacters == true) or is handled by the replacement fallbacks.
  public UTF32Encoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidCharacters)
      : this(bigEndian, byteOrderMark)
  {
      _isThrowException = throwOnInvalidCharacters;

      // The fallbacks were already assigned while _isThrowException was still false,
      // so they have to be assigned again when exceptions were requested.
      if (throwOnInvalidCharacters)
      {
          SetDefaultFallbacks();
      }
  }
  // Assigns the encoder/decoder fallbacks that match _isThrowException:
  // exception fallbacks when throwing, otherwise replacement with U+FFFD.
  internal override void SetDefaultFallbacks()
  {
      if (!_isThrowException)
      {
          // Invalid data is replaced by the Unicode replacement character.
          this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
          this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
          return;
      }

      this.encoderFallback = EncoderFallback.ExceptionFallback;
      this.decoderFallback = DecoderFallback.ExceptionFallback;
  }
  67. // The following methods are copied from EncodingNLS.cs.
  68. // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
  69. // These should be kept in sync for the following classes:
  70. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  71. // Returns the number of bytes required to encode a range of characters in
  72. // a character array.
  73. //
  74. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  75. // So if you fix this, fix the others. Currently those include:
  76. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  77. // parent method is safe
  // Validates the array range and forwards to the pointer-based counter (no encoder state).
  // Throws ArgumentNullException for a null array and ArgumentOutOfRangeException for a
  // negative index/count or a range that does not fit inside the array.
  public override unsafe int GetByteCount(char[] chars, int index, int count)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > chars.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Nothing to count; also sidesteps pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      fixed (char* pinnedChars = chars)
      {
          return GetByteCount(pinnedChars + index, count, null);
      }
  }
  94. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  95. // So if you fix this, fix the others. Currently those include:
  96. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  97. // parent method is safe
  // Counts the bytes needed to encode the whole string (no encoder state).
  // Throws ArgumentNullException when s is null.
  public override unsafe int GetByteCount(string s)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      int length = s.Length;
      fixed (char* pinnedChars = s)
      {
          return GetByteCount(pinnedChars, length, null);
      }
  }
  106. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  107. // So if you fix this, fix the others. Currently those include:
  108. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments, then counts without any encoder state.
  // Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
  // for a negative count.
  [CLSCompliant(false)]
  public override unsafe int GetByteCount(char* chars, int count)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetByteCount(chars, count, null);
  }
  120. // Parent method is safe.
  121. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  122. // So if you fix this, fix the others. Currently those include:
  123. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Encodes charCount characters of s, starting at charIndex, into bytes at byteIndex.
  // All space from byteIndex to the end of the array is offered to the encoder.
  // Throws ArgumentNullException for a null string/array and ArgumentOutOfRangeException
  // for negative or out-of-bounds indices and counts.
  public override unsafe int GetBytes(string s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount > s.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
      }

      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      int availableBytes = bytes.Length - byteIndex;

      // MemoryMarshal.GetReference is used for the destination rather than pinning the array directly.
      fixed (char* pinnedChars = s)
      fixed (byte* pinnedBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
      {
          return GetBytes(pinnedChars + charIndex, charCount, pinnedBytes + byteIndex, availableBytes, null);
      }
  }
  139. // Encodes a range of characters in a character array into a range of bytes
  140. // in a byte array. An exception occurs if the byte array is not large
  141. // enough to hold the complete encoding of the characters. The
  142. // GetByteCount method can be used to determine the exact number of
  143. // bytes that will be produced for a given range of characters.
  144. // Alternatively, the GetMaxByteCount method can be used to
  145. // determine the maximum number of bytes that will be produced for a given
  146. // number of characters, regardless of the actual character values.
  147. //
  148. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  149. // So if you fix this, fix the others. Currently those include:
  150. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  151. // parent method is safe
  // Encodes charCount characters of chars, starting at charIndex, into bytes at byteIndex.
  // Returns the value produced by the pointer-based overload, or 0 when charCount is 0.
  // Throws ArgumentNullException for a null array and ArgumentOutOfRangeException
  // for negative or out-of-bounds indices and counts.
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount > chars.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to encode; also sidesteps pinning an empty source array.
      if (charCount == 0)
      {
          return 0;
      }

      // Space left in the destination after byteIndex, not the size of the whole array.
      int availableBytes = bytes.Length - byteIndex;

      fixed (char* pinnedChars = chars)
      fixed (byte* pinnedBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
      {
          return GetBytes(pinnedChars + charIndex, charCount, pinnedBytes + byteIndex, availableBytes, null);
      }
  }
  173. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  174. // So if you fix this, fix the others. Currently those include:
  175. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments, then encodes without any encoder state.
  // Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
  // for a negative count.
  [CLSCompliant(false)]
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
  {
      // bytes is reported first when both pointers are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetBytes(chars, charCount, bytes, byteCount, null);
  }
  186. // Returns the number of characters produced by decoding a range of bytes
  187. // in a byte array.
  188. //
  189. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  190. // So if you fix this, fix the others. Currently those include:
  191. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  192. // parent method is safe
  // Validates the array range and forwards to the pointer-based counter (no decoder state).
  // Throws ArgumentNullException for a null array and ArgumentOutOfRangeException for a
  // negative index/count or a range that does not fit inside the array.
  public override unsafe int GetCharCount(byte[] bytes, int index, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Nothing to count; also sidesteps pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      fixed (byte* pinnedBytes = bytes)
      {
          return GetCharCount(pinnedBytes + index, count, null);
      }
  }
  209. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  210. // So if you fix this, fix the others. Currently those include:
  211. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments, then counts without any decoder state.
  // Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
  // for a negative count.
  [CLSCompliant(false)]
  public override unsafe int GetCharCount(byte* bytes, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetCharCount(bytes, count, null);
  }
  222. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  223. // So if you fix this, fix the others. Currently those include:
  224. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  225. // parent method is safe
  // Decodes byteCount bytes of bytes, starting at byteIndex, into chars at charIndex.
  // Returns the value produced by the pointer-based overload, or 0 when byteCount is 0.
  // Throws ArgumentNullException for a null array and ArgumentOutOfRangeException
  // for negative or out-of-bounds indices and counts.
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount > bytes.Length - byteIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (charIndex < 0 || charIndex > chars.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to decode; also sidesteps pinning an empty source array.
      if (byteCount == 0)
      {
          return 0;
      }

      // Space left in the destination after charIndex, not the size of the whole array.
      int availableChars = chars.Length - charIndex;

      fixed (byte* pinnedBytes = bytes)
      fixed (char* pinnedChars = &MemoryMarshal.GetReference((Span<char>)chars))
      {
          return GetChars(pinnedBytes + byteIndex, byteCount, pinnedChars + charIndex, availableChars, null);
      }
  }
  247. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  248. // So if you fix this, fix the others. Currently those include:
  249. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments, then decodes without any decoder state.
  // Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
  // for a negative count.
  [CLSCompliant(false)]
  public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
  {
      // bytes is reported first when both pointers are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // charCount is reported first when both counts are negative.
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetChars(bytes, byteCount, chars, charCount, null);
  }
  260. // Returns a string containing the decoded representation of a range of
  261. // bytes in a byte array.
  262. //
  263. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  264. // So if you fix this, fix the others. Currently those include:
  265. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  266. // parent method is safe
  // Decodes count bytes starting at index into a new string.
  // Returns string.Empty for an empty range; otherwise the string is built by
  // string.CreateStringFromEncoding using this encoding.
  // Throws ArgumentNullException for a null array and ArgumentOutOfRangeException for a
  // negative index/count or a range that does not fit inside the array.
  public override unsafe string GetString(byte[] bytes, int index, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Avoid pinning an empty input buffer.
      if (count == 0)
      {
          return string.Empty;
      }

      fixed (byte* pinnedBytes = bytes)
      {
          return string.CreateStringFromEncoding(pinnedBytes + index, count, this);
      }
  }
  282. //
  283. // End of standard methods copied from EncodingNLS.cs
  284. //
  // Counts the bytes required to encode 'count' UTF-16 code units starting at 'chars'.
  // Each non-surrogate character and each high/low surrogate pair contributes 4 bytes.
  // Unpaired surrogates are passed to the encoder fallback, and the characters the fallback
  // buffer then yields go through the same loop.
  //
  // encoder: optional encoder state. When supplied, its _charLeftOver seeds the pending high
  //          surrogate and its FallbackBuffer is used; when null, a new fallback buffer is
  //          created from this.encoderFallback and a trailing high surrogate is always
  //          treated as unpaired.
  //
  // Throws ArgumentException when the encoder's fallback buffer still holds data, and
  // ArgumentOutOfRangeException when the total does not fit in an int.
  internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS? encoder)
  {
      Debug.Assert(chars != null, "[UTF32Encoding.GetByteCount]chars!=null");
      Debug.Assert(count >= 0, "[UTF32Encoding.GetByteCount]count >=0");

      char* end = chars + count;
      char* charStart = chars;

      // Accumulate in 64 bits. At 4 bytes per character a 32-bit total can wrap all the way
      // round to a non-negative value (0x40000000 chars * 4 == 2^32, which truncates to 0),
      // and a "< 0" test on an int would then miss the overflow.
      long byteCount = 0;

      char highSurrogate = '\0';

      // For fallback we may need a fallback buffer
      EncoderFallbackBuffer? fallbackBuffer = null;
      char* charsForFallback;

      if (encoder != null)
      {
          highSurrogate = encoder._charLeftOver;
          fallbackBuffer = encoder.FallbackBuffer;

          // We mustn't have left over fallback data when counting
          if (fallbackBuffer.Remaining > 0)
              throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType().ToString() ?? string.Empty));
      }
      else
      {
          fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
      }

      // Give the fallback buffer the input bounds and encoder it needs.
      fallbackBuffer.InternalInitialize(charStart, end, encoder, false);

      char ch;
  TryAgain:

      // Characters pending in the fallback buffer are consumed before further input.
      while (((ch = fallbackBuffer.InternalGetNextChar()) != 0) || chars < end)
      {
          if (ch == 0)
          {
              // No fallback character pending, take the next input character.
              ch = *chars;
              chars++;
          }

          // Do we need a low surrogate?
          if (highSurrogate != '\0')
          {
              // The previous character was a high surrogate, so a low surrogate is expected here.
              if (char.IsLowSurrogate(ch))
              {
                  highSurrogate = '\0';

                  // One surrogate pair becomes a single 4-byte UTF-32 unit.
                  byteCount += 4;
                  continue;
              }

              // Missing low surrogate: step back so the current character is processed again
              // after the high surrogate has been handed to the fallback.
              // The high surrogate may have come from the encoder, but nothing else did.
              Debug.Assert(chars > charStart,
                  "[UTF32Encoding.GetByteCount]Expected chars to have advanced if no low surrogate");
              chars--;

              // Do the fallback
              charsForFallback = chars;
              fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
              chars = charsForFallback;

              // The old high surrogate has been dealt with.
              highSurrogate = '\0';
              continue;
          }

          // Do we have another high surrogate?
          if (char.IsHighSurrogate(ch))
          {
              // Remember it; the next character decides whether it is paired.
              highSurrogate = ch;
              continue;
          }

          // Check for illegal characters
          if (char.IsLowSurrogate(ch))
          {
              // Leading low surrogate with no high surrogate before it: fall back.
              charsForFallback = chars;
              fallbackBuffer.InternalFallback(ch, ref charsForFallback);
              chars = charsForFallback;

              // Try again with fallback buffer
              continue;
          }

          // Ordinary character: 4 bytes in UTF-32.
          byteCount += 4;
      }

      // A high surrogate left at the end is unpaired unless a non-flushing encoder keeps it.
      if ((encoder == null || encoder.MustFlush) && highSurrogate > 0)
      {
          charsForFallback = chars;
          fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
          chars = charsForFallback;
          highSurrogate = (char)0;

          // Re-enter the loop to count whatever the fallback produced.
          goto TryAgain;
      }

      // Check for overflows.
      if (byteCount > int.MaxValue)
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);

      // Shouldn't have anything in fallback buffer for GetByteCount
      // (don't have to check _throwOnOverflow for count)
      Debug.Assert(fallbackBuffer.Remaining == 0,
          "[UTF32Encoding.GetByteCount]Expected empty fallback buffer at end");

      // Safe narrowing: the range check above guarantees the value fits in an int.
      return (int)byteCount;
  }
  // Encodes up to charCount UTF-16 code units from 'chars' into UTF-32 at 'bytes', writing
  // at most byteCount bytes, in the byte order selected by _bigEndian.
  // Unpaired surrogates are passed to the encoder fallback, and the characters the fallback
  // buffer then yields are encoded by the same loop.
  //
  // encoder: optional encoder state. When supplied, its _charLeftOver seeds the pending high
  //          surrogate, and on exit _charLeftOver and _charsUsed are updated. When null, a
  //          new fallback buffer is created from this.encoderFallback.
  //
  // Returns the number of bytes written. When the output runs out of room the current
  // character (or pair) is un-consumed and ThrowBytesOverflow is called; if that call
  // returns, encoding stops with what has been written so far.
  // Throws ArgumentException when a throw-on-overflow encoder still has fallback data pending.
  internal override unsafe int GetBytes(char* chars, int charCount,
      byte* bytes, int byteCount, EncoderNLS? encoder)
  {
      Debug.Assert(chars != null, "[UTF32Encoding.GetBytes]chars!=null");
      Debug.Assert(bytes != null, "[UTF32Encoding.GetBytes]bytes!=null");
      Debug.Assert(byteCount >= 0, "[UTF32Encoding.GetBytes]byteCount >=0");
      Debug.Assert(charCount >= 0, "[UTF32Encoding.GetBytes]charCount >=0");

      char* charStart = chars;
      char* charEnd = chars + charCount;
      byte* byteStart = bytes;
      byte* byteEnd = bytes + byteCount;

      char highSurrogate = '\0';

      // For fallback we may need a fallback buffer
      EncoderFallbackBuffer? fallbackBuffer = null;
      char* charsForFallback;

      if (encoder != null)
      {
          highSurrogate = encoder._charLeftOver;
          fallbackBuffer = encoder.FallbackBuffer;

          // We mustn't have left over fallback data when not converting
          if (encoder._throwOnOverflow && fallbackBuffer.Remaining > 0)
              throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType()));
      }
      else
      {
          fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
      }

      // Give the fallback buffer the input bounds and encoder it needs.
      fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);

      char ch;
  TryAgain:

      // Characters pending in the fallback buffer are consumed before further input.
      while (((ch = fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
      {
          if (ch == 0)
          {
              // No fallback character pending, take the next input character.
              ch = *chars;
              chars++;
          }

          // Do we need a low surrogate?
          if (highSurrogate != '\0')
          {
              // The previous character was a high surrogate, so a low surrogate is expected here.
              if (char.IsLowSurrogate(ch))
              {
                  // Combine the pair into a single value via GetSurrogate (defined outside this block).
                  uint iTemp = GetSurrogate(highSurrogate, ch);
                  highSurrogate = '\0';

                  // One surrogate pair becomes a single 4-byte UTF-32 unit.
                  if (bytes + 3 >= byteEnd)
                  {
                      // Don't have 4 bytes
                      if (fallbackBuffer.bFallingBack)
                      {
                          fallbackBuffer.MovePrevious(); // Aren't using these 2 fallback chars
                          fallbackBuffer.MovePrevious();
                      }
                      else
                      {
                          // If we don't have enough room, then either we should've advanced a while
                          // or we should have bytes==byteStart and throw below
                          Debug.Assert(chars > charStart + 1 || bytes == byteStart,
                              "[UTF32Encoding.GetBytes]Expected chars to have advanced when no room to add surrogate pair");
                          chars -= 2; // Aren't using those 2 chars
                      }

                      ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)

                      // highSurrogate is already '\0' here, so nothing is carried over:
                      // we backed up to the start of the pair.
                      break;
                  }

                  if (_bigEndian)
                  {
                      *(bytes++) = (byte)(0x00);
                      *(bytes++) = (byte)(iTemp >> 16); // Implies & 0xFF, which isn't needed cause high are all 0
                      *(bytes++) = (byte)(iTemp >> 8); // Implies & 0xFF
                      *(bytes++) = (byte)(iTemp); // Implies & 0xFF
                  }
                  else
                  {
                      *(bytes++) = (byte)(iTemp); // Implies & 0xFF
                      *(bytes++) = (byte)(iTemp >> 8); // Implies & 0xFF
                      *(bytes++) = (byte)(iTemp >> 16); // Implies & 0xFF, which isn't needed cause high are all 0
                      *(bytes++) = (byte)(0x00);
                  }
                  continue;
              }

              // Missing low surrogate: step back so the current character is processed again
              // after the high surrogate has been handed to the fallback.
              // The high surrogate may have come from the encoder, but nothing else did.
              Debug.Assert(chars > charStart,
                  "[UTF32Encoding.GetBytes]Expected chars to have advanced if no low surrogate");
              chars--;

              // Do the fallback
              charsForFallback = chars;
              fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
              chars = charsForFallback;

              // The old high surrogate has been dealt with.
              highSurrogate = '\0';
              continue;
          }

          // Do we have another high surrogate?, if so remember it
          if (char.IsHighSurrogate(ch))
          {
              // The next character decides whether it is paired.
              highSurrogate = ch;
              continue;
          }

          // Check for illegal characters (low surrogate)
          if (char.IsLowSurrogate(ch))
          {
              // Leading low surrogate with no high surrogate before it: fall back.
              charsForFallback = chars;
              fallbackBuffer.InternalFallback(ch, ref charsForFallback);
              chars = charsForFallback;

              // Try again with fallback buffer
              continue;
          }

          // Ordinary character: needs 4 output bytes.
          if (bytes + 3 >= byteEnd)
          {
              // Don't have 4 bytes
              if (fallbackBuffer.bFallingBack)
              {
                  fallbackBuffer.MovePrevious(); // Aren't using this fallback char
              }
              else
              {
                  // Must've advanced already
                  Debug.Assert(chars > charStart,
                      "[UTF32Encoding.GetBytes]Expected chars to have advanced if normal character");
                  chars--; // Aren't using this char
              }

              ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
              break; // Didn't throw, stop
          }

          if (_bigEndian)
          {
              *(bytes++) = (byte)(0x00);
              *(bytes++) = (byte)(0x00);
              *(bytes++) = (byte)((uint)ch >> 8); // Implies & 0xFF
              *(bytes++) = (byte)(ch); // Implies & 0xFF
          }
          else
          {
              *(bytes++) = (byte)(ch); // Implies & 0xFF
              *(bytes++) = (byte)((uint)ch >> 8); // Implies & 0xFF
              *(bytes++) = (byte)(0x00);
              *(bytes++) = (byte)(0x00);
          }
      }

      // A high surrogate left at the end is unpaired unless a non-flushing encoder keeps it.
      if ((encoder == null || encoder.MustFlush) && highSurrogate > 0)
      {
          charsForFallback = chars;
          fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
          chars = charsForFallback;
          highSurrogate = (char)0;

          // Re-enter the loop to encode whatever the fallback produced.
          goto TryAgain;
      }

      // Fix our encoder if we have one
      Debug.Assert(highSurrogate == 0 || (encoder != null && !encoder.MustFlush),
          "[UTF32Encoding.GetBytes]Expected encoder to be flushed.");

      if (encoder != null)
      {
          // Remember our left over surrogate (or 0 if flushing)
          encoder._charLeftOver = highSurrogate;

          // Number of input characters consumed by this call.
          encoder._charsUsed = (int)(chars - charStart);
      }

      // Number of bytes written.
      return (int)(bytes - byteStart);
  }
  // Counts the UTF-16 code units produced by decoding 'count' UTF-32 bytes at 'bytes'.
  // Bytes are assembled four at a time into a 32-bit value in the byte order selected by
  // _bigEndian. Values of 0x10000 and above count as 2 chars (a surrogate pair), other
  // valid values as 1. Values above 0x10FFFF or in 0xD800..0xDFFF are passed to the
  // decoder fallback, whose returned count is added.
  //
  // baseDecoder: optional decoder state (cast to UTF32Decoder). When supplied, its
  //              readByteCount/iChar seed the partially read value and its FallbackBuffer
  //              is used; when null, a new fallback buffer is created from
  //              this.decoderFallback and trailing partial input is always sent to the fallback.
  //
  // Throws ArgumentOutOfRangeException when the running count overflows an int.
  internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS? baseDecoder)
  {
      Debug.Assert(bytes != null, "[UTF32Encoding.GetCharCount]bytes!=null");
      Debug.Assert(count >= 0, "[UTF32Encoding.GetCharCount]count >=0");

      UTF32Decoder? decoder = (UTF32Decoder?)baseDecoder;

      // None so far!
      int charCount = 0;
      byte* end = bytes + count;
      byte* byteStart = bytes;

      // Set up decoder
      int readCount = 0;
      uint iChar = 0;

      // For fallback we may need a fallback buffer
      DecoderFallbackBuffer? fallbackBuffer = null;

      // See if there's anything in our decoder
      if (decoder != null)
      {
          readCount = decoder.readByteCount;
          iChar = (uint)decoder.iChar;
          fallbackBuffer = decoder.FallbackBuffer;

          // Shouldn't have anything in fallback buffer for GetCharCount
          // (don't have to check _throwOnOverflow for chars or count)
          Debug.Assert(fallbackBuffer.Remaining == 0,
              "[UTF32Encoding.GetCharCount]Expected empty fallback buffer at start");
      }
      else
      {
          fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
      }

      // Give the fallback buffer the start of the input; no output pointer when counting.
      fallbackBuffer.InternalInitialize(byteStart, null);

      // Loop through our input, 4 bytes at a time. The charCount >= 0 test stops the
      // loop as soon as the running count wraps negative.
      while (bytes < end && charCount >= 0)
      {
          // Get our next character
          if (_bigEndian)
          {
              // Scoot left and add it to the bottom
              iChar <<= 8;
              iChar += *(bytes++);
          }
          else
          {
              // Scoot right and add it to the top
              iChar >>= 8;
              iChar += (uint)(*(bytes++)) << 24;
          }

          readCount++;

          // See if we have all the bytes yet
          if (readCount < 4)
              continue;

          // Have the bytes
          readCount = 0;

          // Outside the Unicode range, or a surrogate code point: not valid UTF-32.
          if (iChar > 0x10FFFF || (iChar >= 0xD800 && iChar <= 0xDFFF))
          {
              // Rebuild the 4 offending bytes in their original input order for the fallback.
              byte[] fallbackBytes;
              if (_bigEndian)
              {
                  fallbackBytes = new byte[] {
                      unchecked((byte)(iChar >> 24)), unchecked((byte)(iChar >> 16)),
                      unchecked((byte)(iChar >> 8)), unchecked((byte)(iChar)) };
              }
              else
              {
                  fallbackBytes = new byte[] {
                      unchecked((byte)(iChar)), unchecked((byte)(iChar >> 8)),
                      unchecked((byte)(iChar >> 16)), unchecked((byte)(iChar >> 24)) };
              }

              charCount += fallbackBuffer.InternalFallback(fallbackBytes, bytes);

              // Ignore the illegal character
              iChar = 0;
              continue;
          }

          // Ok, we have something we can add to our output
          if (iChar >= 0x10000)
          {
              // Surrogates take 2
              charCount++;
          }

          // Add the rest of the surrogate or our normal character
          charCount++;

          // iChar is back to 0
          iChar = 0;
      }

      // See if we have something left over that has to be decoded
      if (readCount > 0 && (decoder == null || decoder.MustFlush))
      {
          // Fewer than 4 bytes remain: extract them from iChar and hand them to the fallback.
          byte[] fallbackBytes = new byte[readCount];
          if (_bigEndian)
          {
              // Big-endian reads accumulate at the low end of iChar.
              while (readCount > 0)
              {
                  fallbackBytes[--readCount] = unchecked((byte)iChar);
                  iChar >>= 8;
              }
          }
          else
          {
              // Little-endian reads accumulate at the high end of iChar.
              while (readCount > 0)
              {
                  fallbackBytes[--readCount] = unchecked((byte)(iChar >> 24));
                  iChar <<= 8;
              }
          }

          charCount += fallbackBuffer.InternalFallback(fallbackBytes, bytes);
      }

      // Check for overflows. This reports the char-count resource: the result that
      // overflowed is a number of chars, not a number of bytes.
      if (charCount < 0)
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetCharCountOverflow);

      // Shouldn't have anything in fallback buffer for GetCharCount
      // (don't have to check _throwOnOverflow for chars or count)
      Debug.Assert(fallbackBuffer.Remaining == 0,
          "[UTF32Encoding.GetCharCount]Expected empty fallback buffer at end");

      // Return our count
      return charCount;
  }
  // Decodes UTF-32 encoded bytes into UTF-16 chars.
  //
  // bytes/byteCount  - input buffer and its length in bytes.
  // chars/charCount  - output buffer and its capacity in chars.
  // baseDecoder      - optional stateful decoder; when supplied, partially read code points
  //                    (iChar/readByteCount) are loaded from it on entry and stored back on exit,
  //                    together with the number of input bytes consumed (_bytesUsed).
  //
  // Returns the number of chars written to the output buffer.
  //
  // Every 4 input bytes form one code point (byte order chosen by _bigEndian). Values above 0x10FFFF
  // or inside the surrogate range 0xD800-0xDFFF are handed to the decoder fallback buffer. Values
  // >= 0x10000 are written as a surrogate pair. When the output buffer cannot hold the next result,
  // the input pointer is rewound by 4 bytes and ThrowCharsOverflow decides whether to throw or stop.
  internal override unsafe int GetChars(byte* bytes, int byteCount,
                                        char* chars, int charCount, DecoderNLS? baseDecoder)
  {
      Debug.Assert(chars != null, "[UTF32Encoding.GetChars]chars!=null");
      Debug.Assert(bytes != null, "[UTF32Encoding.GetChars]bytes!=null");
      Debug.Assert(byteCount >= 0, "[UTF32Encoding.GetChars]byteCount >=0");
      Debug.Assert(charCount >= 0, "[UTF32Encoding.GetChars]charCount >=0");

      UTF32Decoder? decoder = (UTF32Decoder?)baseDecoder;

      // Remember the buffer boundaries so we can compute counts and detect overflow
      char* charStart = chars;
      char* charEnd = chars + charCount;
      byte* byteStart = bytes;
      byte* byteEnd = bytes + byteCount;

      // Bytes accumulated so far for the current code point, and how many of them we have
      int readCount = 0;
      uint iChar = 0;

      // For fallback we may need a fallback buffer
      DecoderFallbackBuffer? fallbackBuffer = null;
      char* charsForFallback;

      // See if there's anything in our decoder
      if (decoder != null)
      {
          readCount = decoder.readByteCount;
          iChar = (uint)decoder.iChar;
          Debug.Assert(baseDecoder != null);
          fallbackBuffer = baseDecoder.FallbackBuffer;

          // Shouldn't have anything in fallback buffer for GetChars
          // (don't have to check _throwOnOverflow for chars)
          Debug.Assert(fallbackBuffer.Remaining == 0,
              "[UTF32Encoding.GetChars]Expected empty fallback buffer at start");
      }
      else
      {
          fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
      }

      // Tell the fallback buffer where the input starts and where the output ends
      fallbackBuffer.InternalInitialize(bytes, chars + charCount);

      // Loop through our input, 4 bytes (one UTF-32 code unit) at a time
      while (bytes < byteEnd)
      {
          // Get our next byte and merge it into the code point being assembled
          if (_bigEndian)
          {
              // Scoot left and add it to the bottom
              iChar <<= 8;
              iChar += *(bytes++);
          }
          else
          {
              // Scoot right and add it to the top
              iChar >>= 8;
              iChar += (uint)(*(bytes++)) << 24;
          }

          readCount++;

          // See if we have all the bytes yet
          if (readCount < 4)
              continue;

          // Have the bytes
          readCount = 0;

          // See if it's a valid code point (in range and not a surrogate value)
          if (iChar > 0x10FFFF || (iChar >= 0xD800 && iChar <= 0xDFFF))
          {
              // Need to fall back these 4 bytes; rebuild them in their original input order
              byte[] fallbackBytes;
              if (_bigEndian)
              {
                  fallbackBytes = new byte[] {
                      unchecked((byte)(iChar >> 24)), unchecked((byte)(iChar >> 16)),
                      unchecked((byte)(iChar >> 8)), unchecked((byte)(iChar)) };
              }
              else
              {
                  fallbackBytes = new byte[] {
                      unchecked((byte)(iChar)), unchecked((byte)(iChar >> 8)),
                      unchecked((byte)(iChar >> 16)), unchecked((byte)(iChar >> 24)) };
              }

              // Chars won't be updated unless this works.
              charsForFallback = chars;
              bool fallbackResult = fallbackBuffer.InternalFallback(fallbackBytes, bytes, ref charsForFallback);
              chars = charsForFallback;

              if (!fallbackResult)
              {
                  // Couldn't fallback, throw or wait til next time
                  // We either read enough bytes for bytes-=4 to work, or we're
                  // going to throw in ThrowCharsOverflow because chars == charStart
                  Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
                      "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (bad surrogate)");
                  bytes -= 4; // get back to where we were
                  iChar = 0; // Remembering nothing
                  fallbackBuffer.InternalReset();
                  ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
                  break; // Stop here, didn't throw
              }

              // Ignore the illegal character
              iChar = 0;
              continue;
          }

          // Ok, we have something we can add to our output
          if (iChar >= 0x10000)
          {
              // Surrogates take 2 chars, so we need room for both
              if (chars >= charEnd - 1)
              {
                  // Throwing or stopping
                  // We either read enough bytes for bytes-=4 to work, or we're
                  // going to throw in ThrowCharsOverflow because chars == charStart
                  Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
                      "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (surrogate)");
                  bytes -= 4; // get back to where we were
                  iChar = 0; // Remembering nothing
                  ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
                  break; // Stop here, didn't throw
              }

              // Emit the high surrogate now; the low surrogate is written by the common path below
              *(chars++) = GetHighSurrogate(iChar);
              iChar = GetLowSurrogate(iChar);
          }
          // Bounds check for normal character
          else if (chars >= charEnd)
          {
              // Throwing or stopping
              // We either read enough bytes for bytes-=4 to work, or we're
              // going to throw in ThrowCharsOverflow because chars == charStart
              Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
                  "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (normal char)");
              bytes -= 4; // get back to where we were
              iChar = 0; // Remembering nothing
              ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
              break; // Stop here, didn't throw
          }

          // Add the rest of the surrogate or our normal character
          *(chars++) = (char)iChar;

          // iChar is back to 0
          iChar = 0;
      }

      // See if we have something left over that has to be decoded
      if (readCount > 0 && (decoder == null || decoder.MustFlush))
      {
          // Oops, there's something left over with no place to go.
          byte[] fallbackBytes = new byte[readCount];
          int tempCount = readCount;

          // Unpack from a scratch copy: if the fallback cannot complete and we do not throw,
          // iChar/readCount are stored back into the decoder below and must still describe
          // the pending bytes so the next call can retry with the real data.
          uint pendingBits = iChar;
          if (_bigEndian)
          {
              // Most recently read byte sits in the lowest 8 bits
              while (tempCount > 0)
              {
                  fallbackBytes[--tempCount] = unchecked((byte)pendingBits);
                  pendingBits >>= 8;
              }
          }
          else
          {
              // Most recently read byte sits in the highest 8 bits
              while (tempCount > 0)
              {
                  fallbackBytes[--tempCount] = unchecked((byte)(pendingBits >> 24));
                  pendingBits <<= 8;
              }
          }

          charsForFallback = chars;
          bool fallbackResult = fallbackBuffer.InternalFallback(fallbackBytes, bytes, ref charsForFallback);
          chars = charsForFallback;

          if (!fallbackResult)
          {
              // Couldn't fallback.
              fallbackBuffer.InternalReset();
              ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output

              // Stop here, didn't throw; the pending bytes stay in iChar/readCount for next time
          }
          else
          {
              // Don't clear our decoder unless we could fall it back.
              // If we caught the if above, then we're a convert() and will catch this next time.
              readCount = 0;
              iChar = 0;
          }
      }

      // Remember any left over stuff, clearing buffer as well for MustFlush
      if (decoder != null)
      {
          decoder.iChar = (int)iChar;
          decoder.readByteCount = readCount;
          decoder._bytesUsed = (int)(bytes - byteStart);
      }

      // Shouldn't have anything in fallback buffer for GetChars
      // (don't have to check _throwOnOverflow for chars)
      Debug.Assert(fallbackBuffer.Remaining == 0,
          "[UTF32Encoding.GetChars]Expected empty fallback buffer at end");

      // Return our count
      return (int)(chars - charStart);
  }
  // Combines a UTF-16 surrogate pair into the single code point it represents:
  // (cHigh - 0xD800) * 0x400 + (cLow - 0xDC00) + 0x10000.
  private uint GetSurrogate(char cHigh, char cLow)
  {
      uint highOffset = (uint)cHigh - 0xD800;
      uint lowOffset = (uint)cLow - 0xDC00;
      return (highOffset * 0x400) + lowOffset + 0x10000;
  }
  // Returns the high (leading) surrogate for a code point: the offset from 0x10000,
  // shifted down by 10 bits (same as dividing by 0x400), added to 0xD800.
  private char GetHighSurrogate(uint iChar)
  {
      uint offset = iChar - 0x10000;
      return (char)((offset >> 10) + 0xD800);
  }
  // Returns the low (trailing) surrogate for a code point: the low 10 bits of the offset
  // from 0x10000 (same as the remainder modulo 0x400), added to 0xDC00.
  private char GetLowSurrogate(uint iChar)
  {
      uint offset = iChar - 0x10000;
      return (char)((offset & 0x3FF) + 0xDC00);
  }
  // Creates a new UTF32Decoder bound to this encoding instance.
  public override Decoder GetDecoder() => new UTF32Decoder(this);
  // Creates a new EncoderNLS bound to this encoding instance.
  public override Encoder GetEncoder() => new EncoderNLS(this);
  // Returns an upper bound on the number of bytes needed to encode charCount chars.
  // Throws ArgumentOutOfRangeException when charCount is negative or the result exceeds int.MaxValue.
  public override int GetMaxByteCount(int charCount)
  {
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount),
              SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Only scale by the fallback size when a fallback can emit more than one char per input char
      long fallbackFactor = EncoderFallback.MaxCharCount > 1 ? EncoderFallback.MaxCharCount : 1;

      // (chars + 1 for a left over high surrogate) * fallback factor * 4 bytes per char
      long byteCount = ((long)charCount + 1) * fallbackFactor * 4;

      if (byteCount > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
      }

      return (int)byteCount;
  }
  // Returns an upper bound on the number of chars produced by decoding byteCount bytes.
  // Throws ArgumentOutOfRangeException when byteCount is negative or the result exceeds int.MaxValue.
  public override int GetMaxCharCount(int byteCount)
  {
      if (byteCount < 0)
          throw new ArgumentOutOfRangeException(nameof(byteCount),
              SR.ArgumentOutOfRange_NeedNonNegNum);

      // A supplementary character becomes 2 surrogate characters, so 4 input bytes becomes 2 chars,
      // plus we may have 1 surrogate char left over if the decoder has 3 bytes in it already for a non-bmp char.
      // Have to add another one because 1/2 == 0, but 3 bytes left over could be 2 char surrogate pair.
      // Computed in 64 bits so the fallback multiplication below cannot wrap around silently
      // and the range check at the end is meaningful.
      long charCount = ((long)byteCount / 2) + 2;

      // Also consider fallback because our input bytes could be out of range of unicode.
      // Since fallback would fallback 4 bytes at a time, we'll only fall back 1/2 of MaxCharCount.
      if (DecoderFallback.MaxCharCount > 2)
      {
          // Multiply times fallback size
          charCount *= DecoderFallback.MaxCharCount;

          // We were already figuring 2 chars per 4 bytes, but fallback will be different #
          charCount /= 2;
      }

      if (charCount > 0x7fffffff)
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);

      return (int)charCount;
  }
  // Returns the UTF-32 byte order mark for this instance, or an empty array when
  // no byte order mark is emitted.
  public override byte[] GetPreamble()
  {
      if (!_emitUTF32ByteOrderMark)
      {
          return Array.Empty<byte>();
      }

      // A new array is allocated on every call to prevent users from modifying a shared one.
      return _bigEndian
          ? new byte[4] { 0x00, 0x00, 0xFE, 0xFF }  // big endian: 00 00 FE FF
          : new byte[4] { 0xFF, 0xFE, 0x00, 0x00 }; // little endian: FF FE 00 00
  }
  // Span view of the byte order mark for this instance (empty when none is emitted).
  public override ReadOnlySpan<byte> Preamble
  {
      get
      {
          // In case a derived UTF32Encoding overrode GetPreamble, defer to it
          if (GetType() != typeof(UTF32Encoding))
          {
              return new ReadOnlySpan<byte>(GetPreamble());
          }

          if (!_emitUTF32ByteOrderMark)
          {
              return default;
          }

          // Constant arrays converted straight to a span: uses C# compiler's optimization for static byte[] data
          if (_bigEndian)
          {
              return (ReadOnlySpan<byte>)new byte[4] { 0x00, 0x00, 0xFE, 0xFF };
          }

          return (ReadOnlySpan<byte>)new byte[4] { 0xFF, 0xFE, 0x00, 0x00 };
      }
  }
  // Two UTF32Encoding instances are equal when their byte order mark setting, byte order,
  // encoder fallback and decoder fallback all match. Anything else is not equal.
  public override bool Equals(object? value)
  {
      if (!(value is UTF32Encoding other))
      {
          return false;
      }

      if (_emitUTF32ByteOrderMark != other._emitUTF32ByteOrderMark)
      {
          return false;
      }

      if (_bigEndian != other._bigEndian)
      {
          return false;
      }

      if (!EncoderFallback.Equals(other.EncoderFallback))
      {
          return false;
      }

      return DecoderFallback.Equals(other.DecoderFallback);
  }
  // Hash built from both fallbacks, the code page, and the two configuration flags.
  // Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
  public override int GetHashCode()
  {
      int hash = this.EncoderFallback.GetHashCode();
      hash += this.DecoderFallback.GetHashCode();
      hash += CodePage;
      hash += _emitUTF32ByteOrderMark ? 4 : 0;
      hash += _bigEndian ? 8 : 0;
      return hash;
  }
  // Stateful decoder that keeps the bytes of a partially read UTF-32 code point between calls.
  private sealed class UTF32Decoder : DecoderNLS
  {
      // Bits of the code point collected so far, and how many bytes they came from.
      // Both start at zero (the default for int fields).
      internal int iChar;
      internal int readByteCount;

      public UTF32Decoder(UTF32Encoding encoding)
          : base(encoding)
      {
          // Nothing else to do here; per the original note, the base constructor calls Reset.
      }

      // Drops any buffered bytes and resets the fallback buffer if one exists.
      public override void Reset()
      {
          iChar = 0;
          readByteCount = 0;
          _fallbackBuffer?.Reset();
      }

      // Anything left in our decoder?
      // readByteCount is the flag; iChar == 0 alone doesn't mean much.
      internal override bool HasState
      {
          get { return readByteCount != 0; }
      }
  }
  992. }
  993. }