ソースを参照

dtoolutil: improve Unicode encoding/decoding, support non-BMP chars

- Support encoding and decoding four-byte UTF-8 sequences
- E_unicode supports surrogate pairs, renamed to E_utf16be for clarity
- char32_t should be used for storing a Unicode code point
rdb 7 年 前
コミット
29b577971f

+ 1 - 1
dtool/src/dtoolutil/stringDecoder.I

@@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
  *
  *
  */
  */
 INLINE StringUnicodeDecoder::
 INLINE StringUnicodeDecoder::
-StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
+StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
 }
 }

+ 69 - 10
dtool/src/dtoolutil/stringDecoder.cxx

@@ -26,7 +26,7 @@ StringDecoder::
 /**
 /**
  * Returns the next character in sequence.
  * Returns the next character in sequence.
  */
  */
-int StringDecoder::
+char32_t StringDecoder::
 get_next_character() {
 get_next_character() {
   if (test_eof()) {
   if (test_eof()) {
     return -1;
     return -1;
@@ -57,19 +57,20 @@ get_notify_ptr() {
 
 
 /*
 /*
 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
-one, two, or three 8-bit bytes, depending on the value of the
+one, two, three or four 8-bit bytes, depending on the value of the
 character. The following table shows the format of such UTF-8 byte
 character. The following table shows the format of such UTF-8 byte
 sequences (where the "free bits" shown by x's in the table are
 sequences (where the "free bits" shown by x's in the table are
 combined in the order shown, and interpreted from most significant to
 combined in the order shown, and interpreted from most significant to
 least significant):
 least significant):
 
 
  Binary format of bytes in sequence:
  Binary format of bytes in sequence:
-                                        Number of    Maximum expressible
- 1st byte     2nd byte    3rd byte      free bits:      Unicode value:
+                                              Number of    Maximum expressible
+ 1st byte    2nd byte   3rd byte   4th byte   free bits:     Unicode value:
 
 
- 0xxxxxxx                                  7           007F hex   (127)
- 110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
- 1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
+ 0xxxxxxx                                         7          007F hex   (127)
+ 110xxxxx    10xxxxxx                          (5+6)=11      07FF hex  (2047)
+ 1110xxxx    10xxxxxx   10xxxxxx              (4+6+6)=16     FFFF hex (65535)
+ 11110xxx    10xxxxxx   10xxxxxx   10xxxxxx   (4+6*3)=21   10FFFF hex (1114111)
 
 
 The value of each individual byte indicates its UTF-8 function, as follows:
 The value of each individual byte indicates its UTF-8 function, as follows:
 
 
@@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
+ F0 to F7 hex (240 to 247):  first byte of a four-byte sequence.
 */
 */
 
 
 /**
 /**
  * Returns the next character in sequence.
  * Returns the next character in sequence.
  */
  */
-int StringUtf8Decoder::
+char32_t StringUtf8Decoder::
 get_next_character() {
 get_next_character() {
   unsigned int result;
   unsigned int result;
   while (!test_eof()) {
   while (!test_eof()) {
@@ -125,6 +127,35 @@ get_next_character() {
       unsigned int three = (unsigned char)_input[_p++];
       unsigned int three = (unsigned char)_input[_p++];
       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
       return result;
       return result;
+
+    } else if ((result & 0xf8) == 0xf0) {
+      // First byte of four.
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int two = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int three = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int four = (unsigned char)_input[_p++];
+      result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
+      return result;
     }
     }
 
 
     // Otherwise--the high bit is set but it is not one of the introductory
     // Otherwise--the high bit is set but it is not one of the introductory
@@ -144,7 +175,7 @@ get_next_character() {
 /**
 /**
  * Returns the next character in sequence.
  * Returns the next character in sequence.
  */
  */
-int StringUnicodeDecoder::
+char32_t StringUtf16Decoder::
 get_next_character() {
 get_next_character() {
   if (test_eof()) {
   if (test_eof()) {
     return -1;
     return -1;
@@ -159,5 +190,33 @@ get_next_character() {
     return -1;
     return -1;
   }
   }
   unsigned int low = (unsigned char)_input[_p++];
   unsigned int low = (unsigned char)_input[_p++];
-  return ((high << 8) | low);
+  int ch = ((high << 8) | low);
+
+  /*
+  using std::swap;
+
+  if (ch == 0xfffe) {
+    // This is a byte-swapped byte-order-marker.  That means we need to swap
+    // the endianness of the rest of the stream.
+    char *data = (char *)_input.data();
+    for (size_t p = _p; p < _input.size() - 1; p += 2) {
+      std::swap(data[p], data[p + 1]);
+    }
+    ch = 0xfeff;
+  }
+  */
+
+  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
+    // This is a high surrogate.  Look for a subsequent low surrogate.
+    unsigned int high = (unsigned char)_input[_p];
+    unsigned int low = (unsigned char)_input[_p + 1];
+    int ch2 = ((high << 8) | low);
+    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+      // Yes, this is a low surrogate.
+      _p += 2;
+      return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
+    }
+  }
+  // No, this is just a regular character, or an unpaired surrogate.
+  return ch;
 }
 }

+ 9 - 6
dtool/src/dtoolutil/stringDecoder.h

@@ -26,7 +26,7 @@ public:
   INLINE StringDecoder(const std::string &input);
   INLINE StringDecoder(const std::string &input);
   virtual ~StringDecoder();
   virtual ~StringDecoder();
 
 
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
   INLINE bool is_eof();
   INLINE bool is_eof();
 
 
   static void set_notify_ptr(std::ostream *ptr);
   static void set_notify_ptr(std::ostream *ptr);
@@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
 public:
 public:
   INLINE StringUtf8Decoder(const std::string &input);
   INLINE StringUtf8Decoder(const std::string &input);
 
 
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };
 };
 
 
 /**
 /**
  * This decoder extracts characters two at a time to get a plain wide
  * This decoder extracts characters two at a time to get a plain wide
- * character sequence.
+ * character sequence.  It supports surrogate pairs.
  */
  */
-class StringUnicodeDecoder : public StringDecoder {
+class StringUtf16Decoder : public StringDecoder {
 public:
 public:
-  INLINE StringUnicodeDecoder(const std::string &input);
+  INLINE StringUtf16Decoder(const std::string &input);
 
 
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };
 };
 
 
+// Deprecated alias of StringUtf16Encoder.
+typedef StringUtf16Decoder StringUnicodeDecoder;
+
 #include "stringDecoder.I"
 #include "stringDecoder.I"
 
 
 #endif
 #endif

+ 16 - 1
dtool/src/dtoolutil/textEncoder.I

@@ -169,8 +169,23 @@ append_text(const std::string &text) {
  * wide character, up to 16 bits in Unicode.
  * wide character, up to 16 bits in Unicode.
  */
  */
 INLINE void TextEncoder::
 INLINE void TextEncoder::
-append_unicode_char(int character) {
+append_unicode_char(char32_t character) {
+#if WCHAR_MAX >= 0x10FFFF
+  // wchar_t might be UTF-32.
   _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
   _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
+#else
+  if ((character & ~0xffff) == 0) {
+    _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
+  } else {
+    // Encode as a surrogate pair.
+    uint32_t v = (uint32_t)character - 0x10000u;
+    wchar_t wstr[2] = {
+      (wchar_t)((v >> 10u) | 0xd800u),
+      (wchar_t)((v & 0x3ffu) | 0xdc00u),
+    };
+    _wtext = get_wtext() + std::wstring(wstr, 2);
+  }
+#endif
   _flags = (_flags | F_got_wtext) & ~F_got_text;
   _flags = (_flags | F_got_wtext) & ~F_got_text;
   text_changed();
   text_changed();
 }
 }

+ 65 - 19
dtool/src/dtoolutil/textEncoder.cxx

@@ -21,7 +21,7 @@ using std::ostream;
 using std::string;
 using std::string;
 using std::wstring;
 using std::wstring;
 
 
-TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
+TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
 
 
 /**
 /**
  * Adjusts the text stored within the encoder to all uppercase letters
  * Adjusts the text stored within the encoder to all uppercase letters
@@ -109,11 +109,11 @@ is_wtext() const {
 }
 }
 
 
 /**
 /**
- * Encodes a single wide char into a one-, two-, or three-byte string,
- * according to the given encoding system.
+ * Encodes a single Unicode character into a one-, two-, three-, or four-byte
+ * string, according to the given encoding system.
  */
  */
 string TextEncoder::
 string TextEncoder::
-encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
+encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
   switch (encoding) {
   switch (encoding) {
   case E_iso8859:
   case E_iso8859:
     if ((ch & ~0xff) == 0) {
     if ((ch & ~0xff) == 0) {
@@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
       return
       return
         string(1, (char)((ch >> 6) | 0xc0)) +
         string(1, (char)((ch >> 6) | 0xc0)) +
         string(1, (char)((ch & 0x3f) | 0x80));
         string(1, (char)((ch & 0x3f) | 0x80));
-    } else {
+    } else if ((ch & ~0xffff) == 0) {
       return
       return
         string(1, (char)((ch >> 12) | 0xe0)) +
         string(1, (char)((ch >> 12) | 0xe0)) +
         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
         string(1, (char)((ch & 0x3f) | 0x80));
         string(1, (char)((ch & 0x3f) | 0x80));
+    } else {
+      return
+        string(1, (char)((ch >> 18) | 0xf0)) +
+        string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
+        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
+        string(1, (char)((ch & 0x3f) | 0x80));
     }
     }
 
 
-  case E_unicode:
-    return
-      string(1, (char)(ch >> 8)) +
-      string(1, (char)(ch & 0xff));
+  case E_utf16be:
+    if ((ch & ~0xffff) == 0) {
+      // Note that this passes through surrogates and BOMs unharmed.
+      return
+        string(1, (char)(ch >> 8)) +
+        string(1, (char)(ch & 0xff));
+    } else {
+      // Use a surrogate pair.
+      uint32_t v = (uint32_t)ch - 0x10000u;
+      uint16_t hi = (v >> 10u) | 0xd800u;
+      uint16_t lo = (v & 0x3ffu) | 0xdc00u;
+      char encoded[4] = {
+        (char)(hi >> 8),
+        (char)(hi & 0xff),
+        (char)(lo >> 8),
+        (char)(lo & 0xff),
+      };
+      return string(encoded, 4);
+    }
   }
   }
 
 
   return "";
   return "";
@@ -169,8 +190,25 @@ string TextEncoder::
 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
   string result;
   string result;
 
 
-  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
-    result += encode_wchar(*pi, encoding);
+  for (size_t i = 0; i < wtext.size(); ++i) {
+    wchar_t ch = wtext[i];
+
+    // On some systems, wstring may be UTF-16, and contain surrogate pairs.
+#if WCHAR_MAX < 0x10FFFF
+    if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
+      // This is a high surrogate.  Look for a subsequent low surrogate.
+      wchar_t ch2 = wtext[i + 1];
+      if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+        // Yes, this is a low surrogate.
+        char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
+        result += encode_wchar(code_point, encoding);
+        i++;
+        continue;
+      }
+    }
+#endif
+
+    result += encode_wchar(ch, encoding);
   }
   }
 
 
   return result;
   return result;
@@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
       return decode_text_impl(decoder);
       return decode_text_impl(decoder);
     }
     }
 
 
-  case E_unicode:
+  case E_utf16be:
     {
     {
-      StringUnicodeDecoder decoder(text);
+      StringUtf16Decoder decoder(text);
       return decode_text_impl(decoder);
       return decode_text_impl(decoder);
     }
     }
 
 
@@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
   wstring result;
   wstring result;
   // bool expand_amp = get_expand_amp();
   // bool expand_amp = get_expand_amp();
 
 
-  wchar_t character = decoder.get_next_character();
+  char32_t character = decoder.get_next_character();
   while (!decoder.is_eof()) {
   while (!decoder.is_eof()) {
     /*
     /*
     if (character == '&' && expand_amp) {
     if (character == '&' && expand_amp) {
@@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
       character = expand_amp_sequence(decoder);
       character = expand_amp_sequence(decoder);
     }
     }
     */
     */
-    result += character;
+    if (character <= WCHAR_MAX) {
+      result += character;
+    } else {
+      // We need to encode this as a surrogate pair.
+      uint32_t v = (uint32_t)character - 0x10000u;
+      result += (wchar_t)((v >> 10u) | 0xd800u);
+      result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
+    }
     character = decoder.get_next_character();
     character = decoder.get_next_character();
   }
   }
 
 
@@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
   case TextEncoder::E_utf8:
   case TextEncoder::E_utf8:
     return out << "utf8";
     return out << "utf8";
 
 
-  case TextEncoder::E_unicode:
-    return out << "unicode";
+  case TextEncoder::E_utf16be:
+    return out << "utf16be";
   };
   };
 
 
   return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
   return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
@@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
     encoding = TextEncoder::E_iso8859;
     encoding = TextEncoder::E_iso8859;
   } else if (word == "utf8" || word == "utf-8") {
   } else if (word == "utf8" || word == "utf-8") {
     encoding = TextEncoder::E_utf8;
     encoding = TextEncoder::E_utf8;
-  } else if (word == "unicode") {
-    encoding = TextEncoder::E_unicode;
+  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
+                                  word == "utf16-be" || word == "utf-16-be") {
+    encoding = TextEncoder::E_utf16be;
   } else {
   } else {
     ostream *notify_ptr = StringDecoder::get_notify_ptr();
     ostream *notify_ptr = StringDecoder::get_notify_ptr();
     if (notify_ptr != nullptr) {
     if (notify_ptr != nullptr) {

+ 7 - 4
dtool/src/dtoolutil/textEncoder.h

@@ -35,7 +35,10 @@ PUBLISHED:
   enum Encoding {
   enum Encoding {
     E_iso8859,
     E_iso8859,
     E_utf8,
     E_utf8,
-    E_unicode
+    E_utf16be,
+
+    // Deprecated alias for E_utf16be
+    E_unicode = E_utf16be,
   };
   };
 
 
   INLINE TextEncoder();
   INLINE TextEncoder();
@@ -70,7 +73,7 @@ PUBLISHED:
   INLINE std::string get_text(Encoding encoding) const;
   INLINE std::string get_text(Encoding encoding) const;
   INLINE void append_text(const std::string &text);
   INLINE void append_text(const std::string &text);
 #endif
 #endif
-  INLINE void append_unicode_char(int character);
+  INLINE void append_unicode_char(char32_t character);
   INLINE size_t get_num_chars() const;
   INLINE size_t get_num_chars() const;
   INLINE int get_unicode_char(size_t index) const;
   INLINE int get_unicode_char(size_t index) const;
   INLINE void set_unicode_char(size_t index, int character);
   INLINE void set_unicode_char(size_t index, int character);
@@ -103,13 +106,13 @@ PUBLISHED:
   bool is_wtext() const;
   bool is_wtext() const;
 
 
 #ifdef CPPPARSER
 #ifdef CPPPARSER
-  EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
+  EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
   EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
   EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
   EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
   EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
   EXTEND INLINE PyObject *decode_text(PyObject *text) const;
   EXTEND INLINE PyObject *decode_text(PyObject *text) const;
   EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
   EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
 #else
 #else
-  static std::string encode_wchar(wchar_t ch, Encoding encoding);
+  static std::string encode_wchar(char32_t ch, Encoding encoding);
   INLINE std::string encode_wtext(const std::wstring &wtext) const;
   INLINE std::string encode_wtext(const std::wstring &wtext) const;
   static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
   static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
   INLINE std::wstring decode_text(const std::string &text) const;
   INLINE std::wstring decode_text(const std::string &text) const;