Browse Source

Add new utf8 test-vector, update comments explaining utf8 decoding

Description:
Minor changes to help test and clarify the way utf8 strings are
decoded.  This originated from my misunderstanding of the fix for
issue #507.  The new test-vector uses two bytes to encode each
wide-char.

The UTF-8 encoding is specified in Section 3 of RFC 3629:

  https://tools.ietf.org/html/rfc3629#section-3

Testing:

  $ make clean
  $ make CFLAGS="-DUSE_LTM -DLTM_DESC -I../libtommath" EXTRALIBS="../libtommath/libtommath.a" test
  $ ./test

You can confirm that the new utf8 test data is correct using Python 2
(the snippet relies on str literals being byte strings, so it does not
run as-is under Python 3):

  >>> s="\xD7\xA9\xD7\x9C\xD7\x95\xD7\x9D"
  >>> s.decode("utf-8")
  u'\u05e9\u05dc\u05d5\u05dd'
James Muir 4 years ago
parent
commit
2092250088
2 changed files with 54 additions and 10 deletions
  1. 34 10
      src/pk/asn1/der/utf8/der_decode_utf8_string.c
  2. 20 0
      tests/der_test.c

+ 34 - 10
src/pk/asn1/der/utf8/der_decode_utf8_string.c

@@ -11,11 +11,11 @@
 #ifdef LTC_DER
 #ifdef LTC_DER
 
 
 /**
 /**
-  Store a UTF8 STRING
+  Decode a UTF8 STRING and recover an array of unicode characters.
   @param in      The DER encoded UTF8 STRING
   @param in      The DER encoded UTF8 STRING
   @param inlen   The size of the DER UTF8 STRING
   @param inlen   The size of the DER UTF8 STRING
-  @param out     [out] The array of utf8s stored (one per char)
-  @param outlen  [in/out] The number of utf8s stored
+  @param out     [out] The array of unicode characters (wchar_t*)
+  @param outlen  [in/out] The number of unicode characters in the array
   @return CRYPT_OK if successful
   @return CRYPT_OK if successful
 */
 */
 int der_decode_utf8_string(const unsigned char *in,  unsigned long inlen,
 int der_decode_utf8_string(const unsigned char *in,  unsigned long inlen,
@@ -51,23 +51,47 @@ int der_decode_utf8_string(const unsigned char *in,  unsigned long inlen,
       return CRYPT_INVALID_PACKET;
       return CRYPT_INVALID_PACKET;
    }
    }
 
 
-   /* proceed to decode */
+   /* proceed to recover unicode characters from utf8 data.
+      for reference see Section 3 of RFC 3629:
+
+        https://tools.ietf.org/html/rfc3629#section-3
+    */
    for (y = 0; x < inlen; ) {
    for (y = 0; x < inlen; ) {
-      /* get first byte */
+      /* read first byte */
       tmp = in[x++];
       tmp = in[x++];
 
 
-      /* count number of bytes */
+      /* a unicode character is recovered from a sequence of 1 to 4 utf8 bytes.
+         the form of those bytes must match a row in the following table:
+
+           0xxxxxxx
+           110xxxxx 10xxxxxx
+           1110xxxx 10xxxxxx 10xxxxxx
+           11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+         the number of leading ones in the first byte (0,2,3,4) determines the
+         number of remaining bytes to read (0,1,2,3)
+       */
+
+      /* determine z, the number of leading ones.
+         this is done by left-shifting tmp, which clears the ms-bits */
       for (z = 0; (tmp & 0x80) && (z <= 4); z++, tmp = (tmp << 1) & 0xFF);
       for (z = 0; (tmp & 0x80) && (z <= 4); z++, tmp = (tmp << 1) & 0xFF);
 
 
-      if (z == 1 || z > 4 || (x + (z - 1) > inlen)) {
+      /* z should be in {0,2,3,4} */
+      if (z == 1 || z > 4) {
          return CRYPT_INVALID_PACKET;
          return CRYPT_INVALID_PACKET;
       }
       }
 
 
-      /* decode, grab upper bits */
+      /* right-shift tmp to restore least-sig bits */
       tmp >>= z;
       tmp >>= z;
 
 
-      /* grab remaining bytes */
-      if (z > 1) { --z; }
+      /* now update z so it equals the number of additional bytes to read */
+      if (z > 0) { --z; }
+
+      if (x + z > inlen) {
+         return CRYPT_INVALID_PACKET;
+      }
+
+      /* read remaining bytes */
       while (z-- != 0) {
       while (z-- != 0) {
          if ((in[x] & 0xC0) != 0x80) {
          if ((in[x] & 0xC0) != 0x80) {
             return CRYPT_INVALID_PACKET;
             return CRYPT_INVALID_PACKET;

+ 20 - 0
tests/der_test.c

@@ -1603,6 +1603,8 @@ int der_test(void)
    static const unsigned char utf8_1_der[] = { 0x0C, 0x07, 0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E };
    static const unsigned char utf8_1_der[] = { 0x0C, 0x07, 0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E };
    static const wchar_t utf8_2[]           = { 0xD55C, 0xAD6D, 0xC5B4 };
    static const wchar_t utf8_2[]           = { 0xD55C, 0xAD6D, 0xC5B4 };
    static const unsigned char utf8_2_der[] = { 0x0C, 0x09, 0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4 };
    static const unsigned char utf8_2_der[] = { 0x0C, 0x09, 0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4 };
+   static const wchar_t utf8_3[]           = { 0x05E9, 0x05DC, 0x05D5, 0x05DD };
+   static const unsigned char utf8_3_der[] = { 0x0C, 0x08, 0xD7, 0xA9, 0xD7, 0x9C, 0xD7, 0x95, 0xD7, 0x9D };
 
 
    unsigned char utf8_buf[32];
    unsigned char utf8_buf[32];
    wchar_t utf8_out[32];
    wchar_t utf8_out[32];
@@ -1961,6 +1963,24 @@ tmp_time.off_hh);
         return 1;
         return 1;
      }
      }
 
 
+     /* encode it */
+     x = sizeof(utf8_buf);
+     DO(der_encode_utf8_string(utf8_3, sizeof(utf8_3) / sizeof(utf8_3[0]), utf8_buf, &x));
+     if (x != sizeof(utf8_3_der) || memcmp(utf8_buf, utf8_3_der, x)) {
+        fprintf(stderr, "DER UTF8_3 encoded to %lu bytes\n", x);
+        for (y = 0; y < x; y++) fprintf(stderr, "%02x ", (unsigned)utf8_buf[y]);
+        fprintf(stderr, "\n");
+        return 1;
+     }
+     /* decode it */
+     y = sizeof(utf8_out) / sizeof(utf8_out[0]);
+     DO(der_decode_utf8_string(utf8_buf, x, utf8_out, &y));
+     if (y != (sizeof(utf8_3) / sizeof(utf8_3[0])) || memcmp(utf8_3, utf8_out, y * sizeof(wchar_t))) {
+        fprintf(stderr, "DER UTF8_3 decoded to %lu wchar_t\n", y);
+        for (x = 0; x < y; x++) fprintf(stderr, "%04lx ", (unsigned long)utf8_out[x]);
+        fprintf(stderr, "\n");
+        return 1;
+     }
 
 
    der_set_test();
    der_set_test();
    der_flexi_test();
    der_flexi_test();