7 年之前 · 29b577971f
--- a/dtool/src/dtoolutil/stringDecoder.I
+++ b/dtool/src/dtoolutil/stringDecoder.I
@@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
 
				  *
			
 
				  */
			
 
				 INLINE StringUnicodeDecoder::
			
 
				-StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
			
 
				+StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
			
 
				 }
			
--- a/dtool/src/dtoolutil/stringDecoder.cxx
+++ b/dtool/src/dtoolutil/stringDecoder.cxx
@@ -26,7 +26,7 @@ StringDecoder::
 
				 /**
			
 
				  * Returns the next character in sequence.
			
 
				  */
			
 
				-int StringDecoder::
			
 
				+char32_t StringDecoder::
			
 
				 get_next_character() {
			
 
				   if (test_eof()) {
			
 
				     return -1;
			
@@ -57,19 +57,20 @@ get_notify_ptr() {
 
				 
			
 
				 /*
			
 
				 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
			
 
				-one, two, or three 8-bit bytes, depending on the value of the
			
 
				+one, two, three or four 8-bit bytes, depending on the value of the
			
 
				 character. The following table shows the format of such UTF-8 byte
			
 
				 sequences (where the "free bits" shown by x's in the table are
			
 
				 combined in the order shown, and interpreted from most significant to
			
 
				 least significant):
			
 
				 
			
 
				  Binary format of bytes in sequence:
			
 
				-                                        Number of    Maximum expressible
			
 
				- 1st byte     2nd byte    3rd byte      free bits:      Unicode value:
			
 
				+                                              Number of    Maximum expressible
			
 
				+ 1st byte    2nd byte   3rd byte   4th byte   free bits:     Unicode value:
			
 
				 
			
 
				- 0xxxxxxx                                  7           007F hex   (127)
			
 
				- 110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
			
 
				- 1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
			
 
				+ 0xxxxxxx                                         7          007F hex   (127)
			
 
				+ 110xxxxx    10xxxxxx                          (5+6)=11      07FF hex  (2047)
			
 
				+ 1110xxxx    10xxxxxx   10xxxxxx              (4+6+6)=16     FFFF hex (65535)
			
 
				+ 11110xxx    10xxxxxx   10xxxxxx   10xxxxxx   (3+6*3)=21   10FFFF hex (1114111)
			
 
				 
			
 
				 The value of each individual byte indicates its UTF-8 function, as follows:
			
 
				 
			
@@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
 
				  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
			
 
				  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
			
 
				  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
			
 
				+ F0 to F7 hex (240 to 247):  first byte of a four-byte sequence.
			
 
				 */
			
 
				 
			
 
				 /**
			
 
				  * Returns the next character in sequence.
			
 
				  */
			
 
				-int StringUtf8Decoder::
			
 
				+char32_t StringUtf8Decoder::
			
 
				 get_next_character() {
			
 
				   unsigned int result;
			
 
				   while (!test_eof()) {
			
@@ -125,6 +127,35 @@ get_next_character() {
 
				       unsigned int three = (unsigned char)_input[_p++];
			
 
				       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
			
 
				       return result;
			
 
				+
			
 
				+    } else if ((result & 0xf8) == 0xf0) {
			
 
				+      // First byte of four.
			
 
				+      if (test_eof()) {
			
 
				+        if (_notify_ptr != nullptr) {
			
 
				+          (*_notify_ptr)
			
 
				+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
			
 
				+        }
			
 
				+        return -1;
			
 
				+      }
			
 
				+      unsigned int two = (unsigned char)_input[_p++];
			
 
				+      if (test_eof()) {
			
 
				+        if (_notify_ptr != nullptr) {
			
 
				+          (*_notify_ptr)
			
 
				+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
			
 
				+        }
			
 
				+        return -1;
			
 
				+      }
			
 
				+      unsigned int three = (unsigned char)_input[_p++];
			
 
				+      if (test_eof()) {
			
 
				+        if (_notify_ptr != nullptr) {
			
 
				+          (*_notify_ptr)
			
 
				+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
			
 
				+        }
			
 
				+        return -1;
			
 
				+      }
			
 
				+      unsigned int four = (unsigned char)_input[_p++];
			
 
				+      result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
			
 
				+      return result;
			
 
				     }
			
 
				 
			
 
				     // Otherwise--the high bit is set but it is not one of the introductory
			
@@ -144,7 +175,7 @@ get_next_character() {
 
				 /**
			
 
				  * Returns the next character in sequence.
			
 
				  */
			
 
				-int StringUnicodeDecoder::
			
 
				+char32_t StringUtf16Decoder::
			
 
				 get_next_character() {
			
 
				   if (test_eof()) {
			
 
				     return -1;
			
@@ -159,5 +190,33 @@ get_next_character() {
 
				     return -1;
			
 
				   }
			
 
				   unsigned int low = (unsigned char)_input[_p++];
			
 
				-  return ((high << 8) | low);
			
 
				+  int ch = ((high << 8) | low);
			
 
				+
			
 
				+  /*
			
 
				+  using std::swap;
			
 
				+
			
 
				+  if (ch == 0xfffe) {
			
 
				+    // This is a byte-swapped byte-order-marker.  That means we need to swap
			
 
				+    // the endianness of the rest of the stream.
			
 
				+    char *data = (char *)_input.data();
			
 
				+    for (size_t p = _p; p < _input.size() - 1; p += 2) {
			
 
				+      std::swap(data[p], data[p + 1]);
			
 
				+    }
			
 
				+    ch = 0xfeff;
			
 
				+  }
			
 
				+  */
			
 
				+
			
 
				+  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
			
 
				+    // This is a high surrogate.  Look for a subsequent low surrogate.
			
 
				+    unsigned int high = (unsigned char)_input[_p];
			
 
				+    unsigned int low = (unsigned char)_input[_p + 1];
			
 
				+    int ch2 = ((high << 8) | low);
			
 
				+    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
			
 
				+      // Yes, this is a low surrogate.
			
 
				+      _p += 2;
			
 
				+      return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
			
 
				+    }
			
 
				+  }
			
 
				+  // No, this is just a regular character, or an unpaired surrogate.
			
 
				+  return ch;
			
 
				 }
			
--- a/dtool/src/dtoolutil/stringDecoder.h
+++ b/dtool/src/dtoolutil/stringDecoder.h
@@ -26,7 +26,7 @@ public:
 
				   INLINE StringDecoder(const std::string &input);
			
 
				   virtual ~StringDecoder();
			
 
				 
			
 
				-  virtual int get_next_character();
			
 
				+  virtual char32_t get_next_character();
			
 
				   INLINE bool is_eof();
			
 
				 
			
 
				   static void set_notify_ptr(std::ostream *ptr);
			
@@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
 
				 public:
			
 
				   INLINE StringUtf8Decoder(const std::string &input);
			
 
				 
			
 
				-  virtual int get_next_character();
			
 
				+  virtual char32_t get_next_character();
			
 
				 };
			
 
				 
			
 
				 /**
			
 
				  * This decoder extracts characters two at a time to get a plain wide
			
 
				- * character sequence.
			
 
				+ * character sequence.  It supports surrogate pairs.
			
 
				  */
			
 
				-class StringUnicodeDecoder : public StringDecoder {
			
 
				+class StringUtf16Decoder : public StringDecoder {
			
 
				 public:
			
 
				-  INLINE StringUnicodeDecoder(const std::string &input);
			
 
				+  INLINE StringUtf16Decoder(const std::string &input);
			
 
				 
			
 
				-  virtual int get_next_character();
			
 
				+  virtual char32_t get_next_character();
			
 
				 };
			
 
				 
			
 
				+// Deprecated alias of StringUtf16Decoder.
			
 
				+typedef StringUtf16Decoder StringUnicodeDecoder;
			
 
				+
			
 
				 #include "stringDecoder.I"
			
 
				 
			
 
				 #endif
			
--- a/dtool/src/dtoolutil/textEncoder.I
+++ b/dtool/src/dtoolutil/textEncoder.I
@@ -169,8 +169,23 @@ append_text(const std::string &text) {
 
				  * wide character, up to 16 bits in Unicode.
			
 
				  */
			
 
				 INLINE void TextEncoder::
			
 
				-append_unicode_char(int character) {
			
 
				+append_unicode_char(char32_t character) {
			
 
				+#if WCHAR_MAX >= 0x10FFFF
			
 
				+  // wchar_t might be UTF-32.
			
 
				   _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
			
 
				+#else
			
 
				+  if ((character & ~0xffff) == 0) {
			
 
				+    _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
			
 
				+  } else {
			
 
				+    // Encode as a surrogate pair.
			
 
				+    uint32_t v = (uint32_t)character - 0x10000u;
			
 
				+    wchar_t wstr[2] = {
			
 
				+      (wchar_t)((v >> 10u) | 0xd800u),
			
 
				+      (wchar_t)((v & 0x3ffu) | 0xdc00u),
			
 
				+    };
			
 
				+    _wtext = get_wtext() + std::wstring(wstr, 2);
			
 
				+  }
			
 
				+#endif
			
 
				   _flags = (_flags | F_got_wtext) & ~F_got_text;
			
 
				   text_changed();
			
 
				 }
			
--- a/dtool/src/dtoolutil/textEncoder.cxx
+++ b/dtool/src/dtoolutil/textEncoder.cxx
@@ -21,7 +21,7 @@ using std::ostream;
 
				 using std::string;
			
 
				 using std::wstring;
			
 
				 
			
 
				-TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
			
 
				+TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
			
 
				 
			
 
				 /**
			
 
				  * Adjusts the text stored within the encoder to all uppercase letters
			
@@ -109,11 +109,11 @@ is_wtext() const {
 
				 }
			
 
				 
			
 
				 /**
			
 
				- * Encodes a single wide char into a one-, two-, or three-byte string,
			
 
				- * according to the given encoding system.
			
 
				+ * Encodes a single Unicode character into a one-, two-, three-, or four-byte
			
 
				+ * string, according to the given encoding system.
			
 
				  */
			
 
				 string TextEncoder::
			
 
				-encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
			
 
				+encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
			
 
				   switch (encoding) {
			
 
				   case E_iso8859:
			
 
				     if ((ch & ~0xff) == 0) {
			
@@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
 
				       return
			
 
				         string(1, (char)((ch >> 6) | 0xc0)) +
			
 
				         string(1, (char)((ch & 0x3f) | 0x80));
			
 
				-    } else {
			
 
				+    } else if ((ch & ~0xffff) == 0) {
			
 
				       return
			
 
				         string(1, (char)((ch >> 12) | 0xe0)) +
			
 
				         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
			
 
				         string(1, (char)((ch & 0x3f) | 0x80));
			
 
				+    } else {
			
 
				+      return
			
 
				+        string(1, (char)((ch >> 18) | 0xf0)) +
			
 
				+        string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
			
 
				+        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
			
 
				+        string(1, (char)((ch & 0x3f) | 0x80));
			
 
				     }
			
 
				 
			
 
				-  case E_unicode:
			
 
				-    return
			
 
				-      string(1, (char)(ch >> 8)) +
			
 
				-      string(1, (char)(ch & 0xff));
			
 
				+  case E_utf16be:
			
 
				+    if ((ch & ~0xffff) == 0) {
			
 
				+      // Note that this passes through surrogates and BOMs unharmed.
			
 
				+      return
			
 
				+        string(1, (char)(ch >> 8)) +
			
 
				+        string(1, (char)(ch & 0xff));
			
 
				+    } else {
			
 
				+      // Use a surrogate pair.
			
 
				+      uint32_t v = (uint32_t)ch - 0x10000u;
			
 
				+      uint16_t hi = (v >> 10u) | 0xd800u;
			
 
				+      uint16_t lo = (v & 0x3ffu) | 0xdc00u;
			
 
				+      char encoded[4] = {
			
 
				+        (char)(hi >> 8),
			
 
				+        (char)(hi & 0xff),
			
 
				+        (char)(lo >> 8),
			
 
				+        (char)(lo & 0xff),
			
 
				+      };
			
 
				+      return string(encoded, 4);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   return "";
			
@@ -169,8 +190,25 @@ string TextEncoder::
 
				 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
			
 
				   string result;
			
 
				 
			
 
				-  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
			
 
				-    result += encode_wchar(*pi, encoding);
			
 
				+  for (size_t i = 0; i < wtext.size(); ++i) {
			
 
				+    wchar_t ch = wtext[i];
			
 
				+
			
 
				+    // On some systems, wstring may be UTF-16, and contain surrogate pairs.
			
 
				+#if WCHAR_MAX < 0x10FFFF
			
 
				+    if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
			
 
				+      // This is a high surrogate.  Look for a subsequent low surrogate.
			
 
				+      wchar_t ch2 = wtext[i + 1];
			
 
				+      if (ch2 >= 0xdc00 && ch2 < 0xe000) {
			
 
				+        // Yes, this is a low surrogate.
			
 
				+        char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
			
 
				+        result += encode_wchar(code_point, encoding);
			
 
				+        i++;
			
 
				+        continue;
			
 
				+      }
			
 
				+    }
			
 
				+#endif
			
 
				+
			
 
				+    result += encode_wchar(ch, encoding);
			
 
				   }
			
 
				 
			
 
				   return result;
			
@@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
 
				       return decode_text_impl(decoder);
			
 
				     }
			
 
				 
			
 
				-  case E_unicode:
			
 
				+  case E_utf16be:
			
 
				     {
			
 
				-      StringUnicodeDecoder decoder(text);
			
 
				+      StringUtf16Decoder decoder(text);
			
 
				       return decode_text_impl(decoder);
			
 
				     }
			
 
				 
			
@@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
 
				   wstring result;
			
 
				   // bool expand_amp = get_expand_amp();
			
 
				 
			
 
				-  wchar_t character = decoder.get_next_character();
			
 
				+  char32_t character = decoder.get_next_character();
			
 
				   while (!decoder.is_eof()) {
			
 
				     /*
			
 
				     if (character == '&' && expand_amp) {
			
@@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
 
				       character = expand_amp_sequence(decoder);
			
 
				     }
			
 
				     */
			
 
				-    result += character;
			
 
				+    if (character <= WCHAR_MAX) {
			
 
				+      result += character;
			
 
				+    } else {
			
 
				+      // We need to encode this as a surrogate pair.
			
 
				+      uint32_t v = (uint32_t)character - 0x10000u;
			
 
				+      result += (wchar_t)((v >> 10u) | 0xd800u);
			
 
				+      result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
			
 
				+    }
			
 
				     character = decoder.get_next_character();
			
 
				   }
			
 
				 
			
@@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
 
				   case TextEncoder::E_utf8:
			
 
				     return out << "utf8";
			
 
				 
			
 
				-  case TextEncoder::E_unicode:
			
 
				-    return out << "unicode";
			
 
				+  case TextEncoder::E_utf16be:
			
 
				+    return out << "utf16be";
			
 
				   };
			
 
				 
			
 
				   return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
			
@@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
 
				     encoding = TextEncoder::E_iso8859;
			
 
				   } else if (word == "utf8" || word == "utf-8") {
			
 
				     encoding = TextEncoder::E_utf8;
			
 
				-  } else if (word == "unicode") {
			
 
				-    encoding = TextEncoder::E_unicode;
			
 
				+  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
			
 
				+                                  word == "utf16-be" || word == "utf-16-be") {
			
 
				+    encoding = TextEncoder::E_utf16be;
			
 
				   } else {
			
 
				     ostream *notify_ptr = StringDecoder::get_notify_ptr();
			
 
				     if (notify_ptr != nullptr) {
			
--- a/dtool/src/dtoolutil/textEncoder.h
+++ b/dtool/src/dtoolutil/textEncoder.h
@@ -35,7 +35,10 @@ PUBLISHED:
 
				   enum Encoding {
			
 
				     E_iso8859,
			
 
				     E_utf8,
			
 
				-    E_unicode
			
 
				+    E_utf16be,
			
 
				+
			
 
				+    // Deprecated alias for E_utf16be
			
 
				+    E_unicode = E_utf16be,
			
 
				   };
			
 
				 
			
 
				   INLINE TextEncoder();
			
@@ -70,7 +73,7 @@ PUBLISHED:
 
				   INLINE std::string get_text(Encoding encoding) const;
			
 
				   INLINE void append_text(const std::string &text);
			
 
				 #endif
			
 
				-  INLINE void append_unicode_char(int character);
			
 
				+  INLINE void append_unicode_char(char32_t character);
			
 
				   INLINE size_t get_num_chars() const;
			
 
				   INLINE int get_unicode_char(size_t index) const;
			
 
				   INLINE void set_unicode_char(size_t index, int character);
			
@@ -103,13 +106,13 @@ PUBLISHED:
 
				   bool is_wtext() const;
			
 
				 
			
 
				 #ifdef CPPPARSER
			
 
				-  EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
			
 
				+  EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
			
 
				   EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
			
 
				   EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
			
 
				   EXTEND INLINE PyObject *decode_text(PyObject *text) const;
			
 
				   EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
			
 
				 #else
			
 
				-  static std::string encode_wchar(wchar_t ch, Encoding encoding);
			
 
				+  static std::string encode_wchar(char32_t ch, Encoding encoding);
			
 
				   INLINE std::string encode_wtext(const std::wstring &wtext) const;
			
 
				   static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
			
 
				   INLINE std::wstring decode_text(const std::string &text) const;