7 年前 · 29b577971f
--- a/dtool/src/dtoolutil/stringDecoder.I
+++ b/dtool/src/dtoolutil/stringDecoder.I
@@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
 
															  *
														
 
															  */
														
 
															 INLINE StringUnicodeDecoder::
														
 
															-StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
														
 
															+StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
														
 
															 }
														
--- a/dtool/src/dtoolutil/stringDecoder.cxx
+++ b/dtool/src/dtoolutil/stringDecoder.cxx
@@ -26,7 +26,7 @@ StringDecoder::
 
															 /**
														
 
															  * Returns the next character in sequence.
														
 
															  */
														
 
															-int StringDecoder::
														
 
															+char32_t StringDecoder::
														
 
															 get_next_character() {
														
 
															   if (test_eof()) {
														
 
															     return -1;
														
@@ -57,19 +57,20 @@ get_notify_ptr() {
 
															 /*
														
 
															 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
														
 
															-one, two, or three 8-bit bytes, depending on the value of the
														
 
															+one, two, three or four 8-bit bytes, depending on the value of the
														
 
															 character. The following table shows the format of such UTF-8 byte
														
 
															 sequences (where the "free bits" shown by x's in the table are
														
 
															 combined in the order shown, and interpreted from most significant to
														
 
															 least significant):
														
 
															  Binary format of bytes in sequence:
														
 
															-                                        Number of    Maximum expressible
														
 
															- 1st byte     2nd byte    3rd byte      free bits:      Unicode value:
														
 
															+                                              Number of    Maximum expressible
														
 
															+ 1st byte    2nd byte   3rd byte   4th byte   free bits:     Unicode value:
														
 
															- 0xxxxxxx                                  7           007F hex   (127)
														
 
															- 110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
														
 
															- 1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
														
 
															+ 0xxxxxxx                                         7          007F hex   (127)
														
 
															+ 110xxxxx    10xxxxxx                          (5+6)=11      07FF hex  (2047)
														
 
															+ 1110xxxx    10xxxxxx   10xxxxxx              (4+6+6)=16     FFFF hex (65535)
														
 
															+ 11110xxx    10xxxxxx   10xxxxxx   10xxxxxx   (3+6*3)=21   10FFFF hex (1114111)
														
 
															 The value of each individual byte indicates its UTF-8 function, as follows:
														
@@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
 
															  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
														
 
															  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
														
 
															  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
														
 
															+ F0 to F7 hex (240 to 247):  first byte of a four-byte sequence.
														
 
															 */
														
 
															 /**
														
 
															  * Returns the next character in sequence.
														
 
															  */
														
 
															-int StringUtf8Decoder::
														
 
															+char32_t StringUtf8Decoder::
														
 
															 get_next_character() {
														
 
															   unsigned int result;
														
 
															   while (!test_eof()) {
														
@@ -125,6 +127,35 @@ get_next_character() {
 
															       unsigned int three = (unsigned char)_input[_p++];
														
 
															       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
														
 
															       return result;
														
 
															+
														
 
															+    } else if ((result & 0xf8) == 0xf0) {
														
 
															+      // First byte of four.
														
 
															+      if (test_eof()) {
														
 
															+        if (_notify_ptr != nullptr) {
														
 
															+          (*_notify_ptr)
														
 
															+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
														
 
															+        }
														
 
															+        return -1;
														
 
															+      }
														
 
															+      unsigned int two = (unsigned char)_input[_p++];
														
 
															+      if (test_eof()) {
														
 
															+        if (_notify_ptr != nullptr) {
														
 
															+          (*_notify_ptr)
														
 
															+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
														
 
															+        }
														
 
															+        return -1;
														
 
															+      }
														
 
															+      unsigned int three = (unsigned char)_input[_p++];
														
 
															+      if (test_eof()) {
														
 
															+        if (_notify_ptr != nullptr) {
														
 
															+          (*_notify_ptr)
														
 
															+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
														
 
															+        }
														
 
															+        return -1;
														
 
															+      }
														
 
															+      unsigned int four = (unsigned char)_input[_p++];
														
 
															+      result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
														
 
															+      return result;
														
 
															     }
														
 
															     // Otherwise--the high bit is set but it is not one of the introductory
														
@@ -144,7 +175,7 @@ get_next_character() {
 
															 /**
														
 
															  * Returns the next character in sequence.
														
 
															  */
														
 
															-int StringUnicodeDecoder::
														
 
															+char32_t StringUtf16Decoder::
														
 
															 get_next_character() {
														
 
															   if (test_eof()) {
														
 
															     return -1;
														
@@ -159,5 +190,33 @@ get_next_character() {
 
															     return -1;
														
 
															   }
														
 
															   unsigned int low = (unsigned char)_input[_p++];
														
 
															-  return ((high << 8) | low);
														
 
															+  int ch = ((high << 8) | low);
														
 
															+
														
 
															+  /*
														
 
															+  using std::swap;
														
 
															+
														
 
															+  if (ch == 0xfffe) {
														
 
															+    // This is a byte-swapped byte-order-marker.  That means we need to swap
														
 
															+    // the endianness of the rest of the stream.
														
 
															+    char *data = (char *)_input.data();
														
 
															+    for (size_t p = _p; p < _input.size() - 1; p += 2) {
														
 
															+      std::swap(data[p], data[p + 1]);
														
 
															+    }
														
 
															+    ch = 0xfeff;
														
 
															+  }
														
 
															+  */
														
 
															+
														
 
															+  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
														
 
															+    // This is a high surrogate.  Look for a subsequent low surrogate.
														
 
															+    unsigned int high = (unsigned char)_input[_p];
														
 
															+    unsigned int low = (unsigned char)_input[_p + 1];
														
 
															+    int ch2 = ((high << 8) | low);
														
 
															+    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
														
 
															+      // Yes, this is a low surrogate.
														
 
															+      _p += 2;
														
 
															+      return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
														
 
															+    }
														
 
															+  }
														
 
															+  // No, this is just a regular character, or an unpaired surrogate.
														
 
															+  return ch;
														
 
															 }
														
--- a/dtool/src/dtoolutil/stringDecoder.h
+++ b/dtool/src/dtoolutil/stringDecoder.h
@@ -26,7 +26,7 @@ public:
 
															   INLINE StringDecoder(const std::string &input);
														
 
															   virtual ~StringDecoder();
														
 
															-  virtual int get_next_character();
														
 
															+  virtual char32_t get_next_character();
														
 
															   INLINE bool is_eof();
														
 
															   static void set_notify_ptr(std::ostream *ptr);
														
@@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
 
															 public:
														
 
															   INLINE StringUtf8Decoder(const std::string &input);
														
 
															-  virtual int get_next_character();
														
 
															+  virtual char32_t get_next_character();
														
 
															 };
														
 
															 /**
														
 
															  * This decoder extracts characters two at a time to get a plain wide
														
 
															- * character sequence.
														
 
															+ * character sequence.  It supports surrogate pairs.
														
 
															  */
														
 
															-class StringUnicodeDecoder : public StringDecoder {
														
 
															+class StringUtf16Decoder : public StringDecoder {
														
 
															 public:
														
 
															-  INLINE StringUnicodeDecoder(const std::string &input);
														
 
															+  INLINE StringUtf16Decoder(const std::string &input);
														
 
															-  virtual int get_next_character();
														
 
															+  virtual char32_t get_next_character();
														
 
															 };
														
 
															+// Deprecated alias of StringUtf16Decoder.
														
 
															+typedef StringUtf16Decoder StringUnicodeDecoder;
														
 
															+
														
 
															 #include "stringDecoder.I"
														
 
															 #endif
														
--- a/dtool/src/dtoolutil/textEncoder.I
+++ b/dtool/src/dtoolutil/textEncoder.I
@@ -169,8 +169,23 @@ append_text(const std::string &text) {
 
															  * wide character, up to 16 bits in Unicode.
														
 
															  */
														
 
															 INLINE void TextEncoder::
														
 
															-append_unicode_char(int character) {
														
 
															+append_unicode_char(char32_t character) {
														
 
															+#if WCHAR_MAX >= 0x10FFFF
														
 
															+  // wchar_t might be UTF-32.
														
 
															   _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
														
 
															+#else
														
 
															+  if ((character & ~0xffff) == 0) {
														
 
															+    _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
														
 
															+  } else {
														
 
															+    // Encode as a surrogate pair.
														
 
															+    uint32_t v = (uint32_t)character - 0x10000u;
														
 
															+    wchar_t wstr[2] = {
														
 
															+      (wchar_t)((v >> 10u) | 0xd800u),
														
 
															+      (wchar_t)((v & 0x3ffu) | 0xdc00u),
														
 
															+    };
														
 
															+    _wtext = get_wtext() + std::wstring(wstr, 2);
														
 
															+  }
														
 
															+#endif
														
 
															   _flags = (_flags | F_got_wtext) & ~F_got_text;
														
 
															   text_changed();
														
 
															 }
														
--- a/dtool/src/dtoolutil/textEncoder.cxx
+++ b/dtool/src/dtoolutil/textEncoder.cxx
@@ -21,7 +21,7 @@ using std::ostream;
 
															 using std::string;
														
 
															 using std::wstring;
														
 
															-TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
														
 
															+TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
														
 
															 /**
														
 
															  * Adjusts the text stored within the encoder to all uppercase letters
														
@@ -109,11 +109,11 @@ is_wtext() const {
 
															 }
														
 
															 /**
														
 
															- * Encodes a single wide char into a one-, two-, or three-byte string,
														
 
															- * according to the given encoding system.
														
 
															+ * Encodes a single Unicode character into a one-, two-, three-, or four-byte
														
 
															+ * string, according to the given encoding system.
														
 
															  */
														
 
															 string TextEncoder::
														
 
															-encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
														
 
															+encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
														
 
															   switch (encoding) {
														
 
															   case E_iso8859:
														
 
															     if ((ch & ~0xff) == 0) {
														
@@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
 
															       return
														
 
															         string(1, (char)((ch >> 6) | 0xc0)) +
														
 
															         string(1, (char)((ch & 0x3f) | 0x80));
														
 
															-    } else {
														
 
															+    } else if ((ch & ~0xffff) == 0) {
														
 
															       return
														
 
															         string(1, (char)((ch >> 12) | 0xe0)) +
														
 
															         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
														
 
															         string(1, (char)((ch & 0x3f) | 0x80));
														
 
															+    } else {
														
 
															+      return
														
 
															+        string(1, (char)((ch >> 18) | 0xf0)) +
														
 
															+        string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
														
 
															+        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
														
 
															+        string(1, (char)((ch & 0x3f) | 0x80));
														
 
															     }
														
 
															-  case E_unicode:
														
 
															-    return
														
 
															-      string(1, (char)(ch >> 8)) +
														
 
															-      string(1, (char)(ch & 0xff));
														
 
															+  case E_utf16be:
														
 
															+    if ((ch & ~0xffff) == 0) {
														
 
															+      // Note that this passes through surrogates and BOMs unharmed.
														
 
															+      return
														
 
															+        string(1, (char)(ch >> 8)) +
														
 
															+        string(1, (char)(ch & 0xff));
														
 
															+    } else {
														
 
															+      // Use a surrogate pair.
														
 
															+      uint32_t v = (uint32_t)ch - 0x10000u;
														
 
															+      uint16_t hi = (v >> 10u) | 0xd800u;
														
 
															+      uint16_t lo = (v & 0x3ffu) | 0xdc00u;
														
 
															+      char encoded[4] = {
														
 
															+        (char)(hi >> 8),
														
 
															+        (char)(hi & 0xff),
														
 
															+        (char)(lo >> 8),
														
 
															+        (char)(lo & 0xff),
														
 
															+      };
														
 
															+      return string(encoded, 4);
														
 
															+    }
														
 
															   }
														
 
															   return "";
														
@@ -169,8 +190,25 @@ string TextEncoder::
 
															 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
														
 
															   string result;
														
 
															-  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
														
 
															-    result += encode_wchar(*pi, encoding);
														
 
															+  for (size_t i = 0; i < wtext.size(); ++i) {
														
 
															+    wchar_t ch = wtext[i];
														
 
															+
														
 
															+    // On some systems, wstring may be UTF-16, and contain surrogate pairs.
														
 
															+#if WCHAR_MAX < 0x10FFFF
														
 
															+    if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
														
 
															+      // This is a high surrogate.  Look for a subsequent low surrogate.
														
 
															+      wchar_t ch2 = wtext[i + 1];
														
 
															+      if (ch2 >= 0xdc00 && ch2 < 0xe000) {
														
 
															+        // Yes, this is a low surrogate.
														
 
															+        char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
														
 
															+        result += encode_wchar(code_point, encoding);
														
 
															+        i++;
														
 
															+        continue;
														
 
															+      }
														
 
															+    }
														
 
															+#endif
														
 
															+
														
 
															+    result += encode_wchar(ch, encoding);
														
 
															   }
														
 
															   return result;
														
@@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
 
															       return decode_text_impl(decoder);
														
 
															     }
														
 
															-  case E_unicode:
														
 
															+  case E_utf16be:
														
 
															     {
														
 
															-      StringUnicodeDecoder decoder(text);
														
 
															+      StringUtf16Decoder decoder(text);
														
 
															       return decode_text_impl(decoder);
														
 
															     }
														
@@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
 
															   wstring result;
														
 
															   // bool expand_amp = get_expand_amp();
														
 
															-  wchar_t character = decoder.get_next_character();
														
 
															+  char32_t character = decoder.get_next_character();
														
 
															   while (!decoder.is_eof()) {
														
 
															     /*
														
 
															     if (character == '&' && expand_amp) {
														
@@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
 
															       character = expand_amp_sequence(decoder);
														
 
															     }
														
 
															     */
														
 
															-    result += character;
														
 
															+    if (character <= WCHAR_MAX) {
														
 
															+      result += character;
														
 
															+    } else {
														
 
															+      // We need to encode this as a surrogate pair.
														
 
															+      uint32_t v = (uint32_t)character - 0x10000u;
														
 
															+      result += (wchar_t)((v >> 10u) | 0xd800u);
														
 
															+      result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
														
 
															+    }
														
 
															     character = decoder.get_next_character();
														
 
															   }
														
@@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
 
															   case TextEncoder::E_utf8:
														
 
															     return out << "utf8";
														
 
															-  case TextEncoder::E_unicode:
														
 
															-    return out << "unicode";
														
 
															+  case TextEncoder::E_utf16be:
														
 
															+    return out << "utf16be";
														
 
															   };
														
 
															   return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
														
@@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
 
															     encoding = TextEncoder::E_iso8859;
														
 
															   } else if (word == "utf8" || word == "utf-8") {
														
 
															     encoding = TextEncoder::E_utf8;
														
 
															-  } else if (word == "unicode") {
														
 
															-    encoding = TextEncoder::E_unicode;
														
 
															+  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
														
 
															+                                  word == "utf16-be" || word == "utf-16-be") {
														
 
															+    encoding = TextEncoder::E_utf16be;
														
 
															   } else {
														
 
															     ostream *notify_ptr = StringDecoder::get_notify_ptr();
														
 
															     if (notify_ptr != nullptr) {
														
--- a/dtool/src/dtoolutil/textEncoder.h
+++ b/dtool/src/dtoolutil/textEncoder.h
@@ -35,7 +35,10 @@ PUBLISHED:
 
															   enum Encoding {
														
 
															     E_iso8859,
														
 
															     E_utf8,
														
 
															-    E_unicode
														
 
															+    E_utf16be,
														
 
															+
														
 
															+    // Deprecated alias for E_utf16be
														
 
															+    E_unicode = E_utf16be,
														
 
															   };
														
 
															   INLINE TextEncoder();
														
@@ -70,7 +73,7 @@ PUBLISHED:
 
															   INLINE std::string get_text(Encoding encoding) const;
														
 
															   INLINE void append_text(const std::string &text);
														
 
															 #endif
														
 
															-  INLINE void append_unicode_char(int character);
														
 
															+  INLINE void append_unicode_char(char32_t character);
														
 
															   INLINE size_t get_num_chars() const;
														
 
															   INLINE int get_unicode_char(size_t index) const;
														
 
															   INLINE void set_unicode_char(size_t index, int character);
														
@@ -103,13 +106,13 @@ PUBLISHED:
 
															   bool is_wtext() const;
														
 
															 #ifdef CPPPARSER
														
 
															-  EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
														
 
															+  EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
														
 
															   EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
														
 
															   EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
														
 
															   EXTEND INLINE PyObject *decode_text(PyObject *text) const;
														
 
															   EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
														
 
															 #else
														
 
															-  static std::string encode_wchar(wchar_t ch, Encoding encoding);
														
 
															+  static std::string encode_wchar(char32_t ch, Encoding encoding);
														
 
															   INLINE std::string encode_wtext(const std::wstring &wtext) const;
														
 
															   static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
														
 
															   INLINE std::wstring decode_text(const std::string &text) const;