unicode.h 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. //-----------------------------------------------------------------------------
  2. // Copyright (c) 2012 GarageGames, LLC
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to
  6. // deal in the Software without restriction, including without limitation the
  7. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. // sell copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. // IN THE SOFTWARE.
  21. //-----------------------------------------------------------------------------
  22. #ifndef _UNICODE_H_
  23. #define _UNICODE_H_
  24. #ifndef _TORQUE_TYPES_H_
  25. #include "platform/types.h"
  26. #endif
  27. /// Unicode conversion utility functions
  28. ///
  29. /// Some definitions first:
  30. /// - <b>Code Point</b>: a single character of Unicode text. Used to disambiguate from the C char type.
  31. /// - <b>UTF-32</b>: a Unicode encoding format where one code point is always 32 bits wide.
  32. /// This format can in theory contain any Unicode code point that will ever be needed, now or in the future. 4billion+ code points should be enough, right?
  33. /// - <b>UTF-16</b>: a variable length Unicode encoding format where one code point can be
  34. /// either one or two 16-bit code units long.
  35. /// - <b>UTF-8</b>: a variable length Unicode encoding format where one code point can be
  36. /// up to four 8-bit code units long. The first bit of a single byte UTF-8 code point is 0.
  37. /// The first few bits of a multi-byte code point determine the length of the code point.
  38. /// @see http://en.wikipedia.org/wiki/UTF-8
  39. /// - <b>Surrogate Pair</b>: a pair of special UTF-16 code units, that encode a code point
  40. /// that is too large to fit into 16 bits. The surrogate values sit in a special reserved range of Unicode.
  41. /// - <b>Code Unit</b>: a single unit of a variable length Unicode encoded code point.
  42. /// UTF-8 has 8 bit wide code units. UTF-16 has 16 bit wide code units.
  43. /// - <b>BMP</b>: "Basic Multilingual Plane". Unicode values U+0000 - U+FFFF. This range
  44. /// of Unicode contains all the characters for all the languages of the world, that one would
  45. /// usually be interested in. All code points in the BMP are 16 bits wide or less.
  46. /// The current implementation of these conversion functions deals only with the BMP.
  47. /// Any code points above 0xFFFF, the top of the BMP, are replaced with the
  48. /// standard unicode replacement character: 0xFFFD.
  49. /// Any UTF16 surrogates are read correctly, but replaced.
  50. /// UTF-8 code points up to 6 code units wide will be read, but 5+ is illegal,
  51. /// and 4+ is above the BMP, and will be replaced.
  52. /// This means that UTF-8 output is clamped to 3 code units ( bytes ) per code point.
  53. //-----------------------------------------------------------------------------
  54. /// Functions that convert buffers of unicode code points, allocating a buffer.
  55. /// - These functions allocate their own return buffers. You are responsible for
  56. /// calling delete[] on these buffers.
  57. /// - Because they allocate memory, do not use these functions in a tight loop.
  58. /// - These are useful when you need a new long term copy of a string.
  59. UTF16* createUTF16string( const UTF8 *unistring);
  60. UTF8* createUTF8string( const UTF16 *unistring);
  61. //-----------------------------------------------------------------------------
  62. /// Functions that convert buffers of unicode code points, into a provided buffer.
  63. /// - These functions are useful for working on existing buffers.
  64. /// - These cannot convert a buffer in place. If unistring is the same memory as
  65. /// outbuffer, the behavior is undefined.
  66. /// - The converter clamps output to the BMP (Basic Multilingual Plane) .
  67. /// - Conversion to UTF-8 requires a buffer of 3 bytes (U8's) per character, + 1.
  68. /// - Conversion to UTF-16 requires a buffer of 1 U16 (2 bytes) per character, + 1.
  69. /// - Conversion to UTF-32 requires a buffer of 1 U32 (4 bytes) per character, + 1.
  70. /// - UTF-8 only requires 3 bytes per character in the worst case.
  71. /// - Output is null terminated. Be sure to provide 1 extra byte, U16 or U32 for
  72. /// the null terminator, or you will see truncated output.
  73. /// - If the provided buffer is too small, the output will be truncated.
  74. U32 convertUTF8toUTF16N(const UTF8 *unistring, UTF16 *outbuffer, U32 len);
  75. U32 convertUTF16toUTF8N( const UTF16 *unistring, UTF8 *outbuffer, U32 len);
  76. /// Safe conversion function for statically sized buffers.
  77. template <size_t N>
  78. inline U32 convertUTF8toUTF16(const UTF8 *unistring, UTF16 (&outbuffer)[N])
  79. {
  80. return convertUTF8toUTF16N(unistring, outbuffer, (U32) N);
  81. }
  82. /// Safe conversion function for statically sized buffers.
  83. template <size_t N>
  84. inline U32 convertUTF16toUTF8(const UTF16 *unistring, UTF8 (&outbuffer)[N])
  85. {
  86. return convertUTF16toUTF8N(unistring, outbuffer, (U32) N);
  87. }
  88. //-----------------------------------------------------------------------------
  89. /// Functions that convert one Unicode code point at a time
  90. /// - Since these functions are designed to be used in tight loops, they do not
  91. /// allocate buffers.
  92. /// - oneUTF8toUTF32() and oneUTF16toUTF32() return the converted Unicode code point
  93. /// in *codepoint, and set *unitsWalked to the \# of code units *codepoint took up.
  94. /// The next Unicode code point should start at *(codepoint + *unitsWalked).
  95. /// - oneUTF32toUTF8() requires a 3 byte buffer, and returns the \# of bytes used.
  96. UTF32 oneUTF8toUTF32( const UTF8 *codepoint, U32 *unitsWalked = NULL);
  97. UTF32 oneUTF16toUTF32(const UTF16 *codepoint, U32 *unitsWalked = NULL);
  98. UTF16 oneUTF32toUTF16(const UTF32 codepoint);
  99. U32 oneUTF32toUTF8( const UTF32 codepoint, UTF8 *threeByteCodeunitBuf);
  100. //-----------------------------------------------------------------------------
  101. /// Functions that calculate the length of unicode strings.
  102. /// - Since calculating the length of a UTF8 string is nearly as expensive as
  103. /// converting it to another format, a dStrlen for UTF8 is not provided here.
  104. /// - If *unistring does not point to a null terminated string of the correct type,
  105. /// the behavior is undefined.
  106. U32 dStrlen(const UTF16 *unistring);
  107. U32 dStrlen(const UTF32 *unistring);
  108. //-----------------------------------------------------------------------------
  109. /// Scanning for characters in unicode strings
  110. UTF16* dStrrchr(UTF16* unistring, U32 c);
  111. const UTF16* dStrrchr(const UTF16* unistring, U32 c);
  112. UTF16* dStrchr(UTF16* unistring, U32 c);
  113. const UTF16* dStrchr(const UTF16* unistring, U32 c);
  114. //-----------------------------------------------------------------------------
  115. /// Functions that scan for characters in a utf8 string.
  116. /// - this is useful for getting a character-wise offset into a UTF8 string,
  117. /// as opposed to a byte-wise offset into a UTF8 string: foo[i]
  118. const UTF8* getNthCodepoint(const UTF8 *unistring, const U32 n);
  119. //------------------------------------------------------------------------------
  120. /// Functions to read and validate UTF BOMs (Byte Order Mark)
  121. /// For reference: http://en.wikipedia.org/wiki/Byte_Order_Mark
  122. bool chompUTF8BOM( const char *inString, char **outStringPtr );
  123. bool isValidUTF8BOM( U8 bom[4] );
  124. #endif // _UNICODE_H_