| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459 |
- /**************************************************************************
- Filename : OVR_UTF8Util.cpp
- Content : UTF8 Unicode character encoding/decoding support
- Created : September 19, 2012
- Notes :
- Notes : Much useful info at "UTF-8 and Unicode FAQ"
- http://www.cl.cam.ac.uk/~mgk25/unicode.html
- Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved.
- Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License");
- you may not use the Oculus VR Rift SDK except in compliance with the License,
- which is provided at the time of installation or download, or which
- otherwise accompanies this software in either electronic or hard copy form.
- You may obtain a copy of the License at
- http://www.oculusvr.com/licenses/LICENSE-3.2
- Unless required by applicable law or agreed to in writing, the Oculus VR SDK
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ************************************************************************************/
- #include "OVR_UTF8Util.h"
- #include <wchar.h>
- #include <string.h>
// Compile-time stand-in for sizeof(wchar_t) that can be used in #if expressions.
// A value already supplied for OVR_WCHAR_SIZE is left untouched.
#if !defined(OVR_WCHAR_SIZE)
    #if !defined(__WCHAR_MAX__)
        // No compiler-provided wchar_t limit to inspect: choose by platform macro.
        #if defined(OVR_OS_UNIX)
            #define OVR_WCHAR_SIZE 4
        #else
            #define OVR_WCHAR_SIZE 2
        #endif
    #elif (__WCHAR_MAX__ == 127) || (__WCHAR_MAX__ == 255)
        #define OVR_WCHAR_SIZE 1 // signed or unsigned 8-bit wchar_t
    #elif (__WCHAR_MAX__ == 32767) || (__WCHAR_MAX__ == 65535)
        #define OVR_WCHAR_SIZE 2 // signed or unsigned 16-bit wchar_t
    #else
        #define OVR_WCHAR_SIZE 4 // any other limit is treated as 32-bit
    #endif
#endif
- namespace OVR { namespace UTF8Util {
- size_t Strlcpy(char* pDestUTF8, size_t destCharCount, const wchar_t* pSrcUCS, size_t sourceLength)
- {
- if (sourceLength == (size_t)-1)
- sourceLength = wcslen(pSrcUCS);
- size_t destLength = 0, requiredLength = 0;
- for (size_t i = 0; (i < sourceLength); ++i)
- {
- char buff[6]; // longest utf8 encoding just to be safe
- intptr_t count = 0;
- EncodeChar(buff, &count, pSrcUCS[i]);
- // We check requiredLength instead of destLength because we want to make sure that the first time
- // that we fail the 'if' below, we don't succeed at it the next time we execute it (which could
- // otherwise happen if count were a lower number on the subsequent pass -- low enough that the if
- // would succeed).
- if ((requiredLength + count) < destCharCount) // If there is enough space to append count bytes (leaving room for a trailing '\0')...
- {
- memcpy(pDestUTF8 + destLength, buff, count);
- destLength += (size_t)count;
- }
- requiredLength += (size_t)count;
- }
- if (destLength < destCharCount) // Should be true for all cases other than destCharCount == 0.
- pDestUTF8[destLength] = '\0';
- return requiredLength; // Return the intended strlen of pDestUTF8.
- }
- size_t Strlcpy(wchar_t* pDestUCS, size_t destCharCount, const char* pSrcUTF8, size_t sourceLength)
- {
- if (sourceLength == (size_t)-1)
- sourceLength = strlen(pSrcUTF8);
- size_t destLength = 0, requiredLength = 0;
-
- for (const char* pSrcUTF8End = (pSrcUTF8 + sourceLength); pSrcUTF8 < pSrcUTF8End; )
- {
- uint32_t c = DecodeNextChar_Advance0(&pSrcUTF8);
- OVR_ASSERT_M(pSrcUTF8 <= (pSrcUTF8 + sourceLength), "Strlcpy sourceLength was not on a UTF8 boundary.");
- #if (OVR_WCHAR_SIZE == 2)
- if (c >= 0x0000FFFF)
- c = 0x0000FFFD;
- #endif
- if((destLength + 1) < destCharCount) // If there is enough space to append a wchar_t (leaving room for a trailing '\0')...
- {
- pDestUCS[destLength] = wchar_t(c);
- destLength++;
- }
- requiredLength++;
- }
- if (destLength < destCharCount)
- pDestUCS[destLength] = L'\0';
- return requiredLength; // Return the intended wcslen of pDestUCS.
- }
- intptr_t GetLength(const char* buf, intptr_t buflen)
- {
- const char* p = buf;
- intptr_t length = 0;
- if (buflen != -1)
- {
- while (p - buf < buflen)
- {
- // We should be able to have ASStrings with 0 in the middle.
- UTF8Util::DecodeNextChar_Advance0(&p);
- length++;
- }
- }
- else
- {
- while (UTF8Util::DecodeNextChar_Advance0(&p))
- length++;
- }
-
- return length;
- }
- uint32_t GetCharAt(intptr_t index, const char* putf8str, intptr_t length)
- {
- const char* buf = putf8str;
- uint32_t c = 0;
- if (length != -1)
- {
- while (buf - putf8str < length)
- {
- c = UTF8Util::DecodeNextChar_Advance0(&buf);
- if (index == 0)
- return c;
- index--;
- }
- return c;
- }
- do
- {
- c = UTF8Util::DecodeNextChar_Advance0(&buf);
- index--;
- if (c == 0)
- {
- // We've hit the end of the string; don't go further.
- OVR_ASSERT(index == 0);
- return c;
- }
- } while (index >= 0);
- return c;
- }
- intptr_t GetByteIndex(intptr_t index, const char *putf8str, intptr_t length)
- {
- const char* buf = putf8str;
- if (length != -1)
- {
- while ((buf - putf8str) < length && index > 0)
- {
- UTF8Util::DecodeNextChar_Advance0(&buf);
- index--;
- }
- return buf-putf8str;
- }
- while (index > 0)
- {
- uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf);
- index--;
- if (c == 0)
- return buf-putf8str;
- };
- return buf-putf8str;
- }
// Returns how many bytes the UTF-8 encoding of ucs_character occupies (1..6),
// or 0 when the value is above 0x7FFFFFFF and cannot be encoded.
int GetEncodeCharSize(uint32_t ucs_character)
{
    // kLargestForSize[n - 1] is the largest value that fits in an n-byte sequence.
    static const uint32_t kLargestForSize[] =
    {
        0x0000007F, // 1 byte
        0x000007FF, // 2 bytes
        0x0000FFFF, // 3 bytes
        0x001FFFFF, // 4 bytes
        0x03FFFFFF, // 5 bytes
        0x7FFFFFFF  // 6 bytes
    };
    const int kMaxSize = (int)(sizeof(kLargestForSize) / sizeof(kLargestForSize[0]));

    for (int size = 1; size <= kMaxSize; ++size)
    {
        if (ucs_character <= kLargestForSize[size - 1])
            return size;
    }

    return 0;
}
// Decodes one character from a UTF-8 byte stream and advances *putf8Buffer.
//
//   putf8Buffer - in/out pointer to the current read position.
//
// Returns the decoded value, 0 at end of data, or U+FFFD for malformed input.
//
// Pointer advancement:
//   - The first byte is always consumed, even when it is the terminating zero
//     (hence "Advance0").
//   - Continuation bytes are consumed only when valid. A zero byte or a
//     non-continuation byte found where a continuation byte is expected is left
//     unconsumed; the function returns 0 or U+FFFD respectively.
//
// Validation:
//   - Overlong encodings (a value stored in more bytes than necessary) yield U+FFFD, so
//     that a character has exactly one accepted encoding.
//   - Bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF) yield U+FFFD.
//   - The historical 5- and 6-byte forms are accepted.
//   - Values 0xD800..0xDFFF, 0xFFFE and 0xFFFF are deliberately NOT rejected.
//
// Implemented without function-local #defines so that no helper macros remain defined
// for the rest of the translation unit.
uint32_t DecodeNextChar_Advance0(const char** putf8Buffer)
{
    // U+FFFD is itself a valid character (REPLACEMENT CHARACTER); it is used here as the
    // "could not decode" marker.
    const uint32_t kInvalidChar = 0x0FFFD;

    const unsigned char lead = static_cast<unsigned char>(**putf8Buffer);
    (*putf8Buffer)++;

    if (lead == 0)
        return 0; // End of buffer.

    if ((lead & 0x80) == 0)
        return lead; // Conventional 7-bit ASCII.

    // Classify the lead byte: payload bits it carries, number of continuation bytes that
    // follow, and the smallest value that legitimately needs a sequence of this length.
    uint32_t uc;
    uint32_t smallestValid;
    int      continuationCount;

    if ((lead & 0xE0) == 0xC0)      // 110xxxxx: two-byte sequence
    {
        uc = lead & 0x1F;  continuationCount = 1;  smallestValid = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0) // 1110xxxx: three-byte sequence
    {
        uc = lead & 0x0F;  continuationCount = 2;  smallestValid = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0) // 11110xxx: four-byte sequence
    {
        uc = lead & 0x07;  continuationCount = 3;  smallestValid = 0x010000;
    }
    else if ((lead & 0xFC) == 0xF8) // 111110xx: five-byte sequence
    {
        uc = lead & 0x03;  continuationCount = 4;  smallestValid = 0x0200000;
    }
    else if ((lead & 0xFE) == 0xFC) // 1111110x: six-byte sequence
    {
        uc = lead & 0x01;  continuationCount = 5;  smallestValid = 0x04000000;
    }
    else
    {
        return kInvalidChar; // Stray continuation byte, 0xFE or 0xFF.
    }

    for (int i = 0; i < continuationCount; ++i)
    {
        const unsigned char next = static_cast<unsigned char>(**putf8Buffer);

        if (next == 0)
            return 0; // End of buffer inside a sequence; do not advance past the zero.

        if ((next & 0xC0) != 0x80)
            return kInvalidChar; // Not a 10xxxxxx byte; leave it for the next call.

        (*putf8Buffer)++;
        uc = (uc << 6) | (next & 0x3F);
    }

    if (uc < smallestValid)
        return kInvalidChar; // Overlong encoding.

    return uc;
}
// Appends the UTF-8 encoding of ucs_character to a buffer.
//
//   pbuffer       - destination buffer; the caller guarantees enough room (up to 6 bytes).
//   pindex        - in/out write offset into pbuffer; advanced by the bytes written.
//   ucs_character - value to encode.
//
// Values above 0x7FFFFFFF cannot be encoded: nothing is written and *pindex is unchanged.
void EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character)
{
    // Plain single-byte ASCII.
    if (ucs_character <= 0x7F)
    {
        pbuffer[(*pindex)++] = (char) ucs_character;
        return;
    }

    // Pick the lead-byte tag and the number of continuation bytes for the value's range.
    uint32_t leadTag;
    int      continuationCount;

    if (ucs_character <= 0x7FF)
    {
        leadTag = 0xC0;  continuationCount = 1;
    }
    else if (ucs_character <= 0xFFFF)
    {
        leadTag = 0xE0;  continuationCount = 2;
    }
    else if (ucs_character <= 0x1FFFFF)
    {
        leadTag = 0xF0;  continuationCount = 3;
    }
    else if (ucs_character <= 0x3FFFFFF)
    {
        leadTag = 0xF8;  continuationCount = 4;
    }
    else if (ucs_character <= 0x7FFFFFFF)
    {
        leadTag = 0xFC;  continuationCount = 5;
    }
    else
    {
        return; // Invalid char; don't encode anything.
    }

    char* out = pbuffer + *pindex;

    // Lead byte: tag bits plus the most significant payload bits.
    *out++ = (char)(leadTag | (ucs_character >> (6 * continuationCount)));

    // Continuation bytes: 10xxxxxx, six payload bits each, most significant group first.
    for (int shift = 6 * (continuationCount - 1); shift >= 0; shift -= 6)
        *out++ = (char)(0x80 | ((ucs_character >> shift) & 0x3F));

    *pindex += continuationCount + 1;
}
- intptr_t GetEncodeStringSize(const wchar_t* pchar, intptr_t length)
- {
- intptr_t len = 0;
- if (length != -1)
- for (int i = 0; i < length; i++)
- {
- len += GetEncodeCharSize(pchar[i]);
- }
- else
- for (int i = 0;; i++)
- {
- if (pchar[i] == 0)
- return len;
- len += GetEncodeCharSize(pchar[i]);
- }
- return len;
- }
- void EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length)
- {
- intptr_t ofs = 0;
- if (length != -1)
- {
- for (int i = 0; i < length; i++)
- {
- EncodeChar(pbuff, &ofs, pchar[i]);
- }
- }
- else
- {
- for (int i = 0;; i++)
- {
- if (pchar[i] == 0)
- break;
- EncodeChar(pbuff, &ofs, pchar[i]);
- }
- }
- pbuff[ofs] = 0;
- }
- size_t DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen)
- {
- wchar_t *pbegin = pbuff;
- if (bytesLen == -1)
- {
- while (1)
- {
- uint32_t ch = DecodeNextChar_Advance0(&putf8str);
- if (ch == 0)
- break;
- else if (ch >= 0xFFFF)
- ch = 0xFFFD;
- *pbuff++ = wchar_t(ch);
- }
- }
- else
- {
- const char* p = putf8str;
- while ((p - putf8str) < bytesLen)
- {
- uint32_t ch = DecodeNextChar_Advance0(&p);
- if (ch >= 0xFFFF)
- ch = 0xFFFD;
- *pbuff++ = wchar_t(ch);
- }
- }
- *pbuff = 0;
- return pbuff - pbegin;
- }
- }} // namespace UTF8Util::OVR
|