Browse Source

display warnings for unexpected characters in utf-8 string

David Rose 23 years ago
parent
commit
50682a4409
1 changed file with 46 additions and 25 deletions
  1. 46 25
      panda/src/text/stringDecoder.cxx

+ 46 - 25
panda/src/text/stringDecoder.cxx

@@ -17,6 +17,7 @@
 ////////////////////////////////////////////////////////////////////
 
 #include "stringDecoder.h"
+#include "config_text.h"
 
 ////////////////////////////////////////////////////////////////////
 //     Function: StringDecoder::Destructor
@@ -71,33 +72,51 @@ The value of each individual byte indicates its UTF-8 function, as follows:
 ////////////////////////////////////////////////////////////////////
 int StringUtf8Decoder::
 get_next_character() {
-  if (test_eof()) {
-    return -1;
-  }
+  unsigned int result;
+  while (!test_eof()) {
+    result = (unsigned char)_input[_p++];
+    if ((result & 0x80) == 0) {
+      // A 7-bit ascii value in one byte.
+      return result;
 
-  unsigned int result = (unsigned char)_input[_p++];
-  if ((result & 0xe0) == 0xc0) {
-    // First byte of two.
-    unsigned int two = 0;
-    if (!test_eof()) {
+    } if ((result & 0xe0) == 0xc0) {
+      // First byte of two.
+      unsigned int two = 0;
+      if (test_eof()) {
+        text_cat.warning()
+          << "utf-8 encoded string ends abruptly.\n";
+        return -1;
+      }
       two = (unsigned char)_input[_p++];
+      result = ((result & 0x1f) << 6) | (two & 0x3f);
+      return result;
+      
+    } else if ((result & 0xf0) == 0xe0) {
+      // First byte of three.
+      if (test_eof()) {
+        text_cat.warning()
+          << "utf-8 encoded string ends abruptly.\n";
+        return -1;
+      }
+      unsigned int two = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        text_cat.warning()
+          << "utf-8 encoded string ends abruptly.\n";
+        return -1;
+      }
+      unsigned int three = (unsigned char)_input[_p++];
+      result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+      return result;
     }
-    result = ((result & 0x1f) << 6) | (two & 0x3f);
 
-  } else if ((result & 0xf0) == 0xe0) {
-    // First byte of three.
-    unsigned int two = 0;
-    unsigned int three = 0;
-    if (!test_eof()) {
-      two = (unsigned char)_input[_p++];
-    }
-    if (!test_eof()) {
-      three = (unsigned char)_input[_p++];
-    }
-    result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
-  } 
+    // Otherwise--the high bit is set but it is not one of the
+    // introductory utf-8 bytes--we have an error.
+    text_cat.warning()
+      << "Non utf-8 byte in string: 0x" << hex << result << dec << "\n";
+  }
 
-  return result;
+  // End of string reached.
+  return -1;
 }
 
 ////////////////////////////////////////////////////////////////////
@@ -112,9 +131,11 @@ get_next_character() {
   }
 
   unsigned int high = (unsigned char)_input[_p++];
-  unsigned int low = 0;
-  if (!test_eof()) {
-    low = (unsigned char)_input[_p++];
+  if (test_eof()) {
+    text_cat.warning()
+      << "Unicode-encoded string has odd number of bytes.\n";
+    return -1;
   }
+  unsigned int low = (unsigned char)_input[_p++];
   return ((high << 8) | low);
 }