Browse Source

Changes in the validation of UTF-8

All UTF-8 encoding functionality (including the escape
sequence '\u') accepts all values from the original UTF-8
specification (with sequences of up to six bytes).

By default, the decoding functions in the UTF-8 library do not
accept invalid Unicode code points, such as surrogates. A new
parameter 'nonstrict' makes them accept all code points up to
(2^31)-1, as in the original UTF-8 specification.
Roberto Ierusalimschy 6 years ago
parent
commit
1e0c73d5b6
6 changed files with 164 additions and 72 deletions
  1. 1 1
      llex.c
  2. 3 3
      lobject.c
  3. 49 27
      lutf8lib.c
  4. 39 4
      manual/manual.of
  5. 12 5
      testes/literals.lua
  6. 60 32
      testes/utf8.lua

+ 1 - 1
llex.c

@@ -335,7 +335,7 @@ static unsigned long readutf8esc (LexState *ls) {
   while ((save_and_next(ls), lisxdigit(ls->current))) {
   while ((save_and_next(ls), lisxdigit(ls->current))) {
     i++;
     i++;
     r = (r << 4) + luaO_hexavalue(ls->current);
     r = (r << 4) + luaO_hexavalue(ls->current);
-    esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
+    esccheck(ls, r <= 0x7FFFFFFFu, "UTF-8 value too large");
   }
   }
   esccheck(ls, ls->current == '}', "missing '}'");
   esccheck(ls, ls->current == '}', "missing '}'");
   next(ls);  /* skip '}' */
   next(ls);  /* skip '}' */

+ 3 - 3
lobject.c

@@ -343,7 +343,7 @@ size_t luaO_str2num (const char *s, TValue *o) {
 
 
 int luaO_utf8esc (char *buff, unsigned long x) {
 int luaO_utf8esc (char *buff, unsigned long x) {
   int n = 1;  /* number of bytes put in buffer (backwards) */
   int n = 1;  /* number of bytes put in buffer (backwards) */
-  lua_assert(x <= 0x10FFFF);
+  lua_assert(x <= 0x7FFFFFFFu);
   if (x < 0x80)  /* ascii? */
   if (x < 0x80)  /* ascii? */
     buff[UTF8BUFFSZ - 1] = cast_char(x);
     buff[UTF8BUFFSZ - 1] = cast_char(x);
   else {  /* need continuation bytes */
   else {  /* need continuation bytes */
@@ -435,9 +435,9 @@ const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
         pushstr(L, buff, l);
         pushstr(L, buff, l);
         break;
         break;
       }
       }
-      case 'U': {  /* an 'int' as a UTF-8 sequence */
+      case 'U': {  /* a 'long' as a UTF-8 sequence */
         char buff[UTF8BUFFSZ];
         char buff[UTF8BUFFSZ];
-        int l = luaO_utf8esc(buff, cast(long, va_arg(argp, long)));
+        int l = luaO_utf8esc(buff, va_arg(argp, long));
         pushstr(L, buff + UTF8BUFFSZ - l, l);
         pushstr(L, buff + UTF8BUFFSZ - l, l);
         break;
         break;
       }
       }

+ 49 - 27
lutf8lib.c

@@ -21,12 +21,14 @@
 #include "lualib.h"
 #include "lualib.h"
 
 
 
 
-#define MAXUNICODE	0x10FFFF
+#define MAXUNICODE	0x10FFFFu
+
+#define MAXUTF		0x7FFFFFFFu
 
 
 /*
 /*
-** Integer type for decoded UTF-8 values; MAXUNICODE needs 21 bits.
+** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
 */
 */
-#if LUAI_BITSINT >= 21
+#if LUAI_BITSINT >= 31
 typedef	unsigned int utfint;
 typedef	unsigned int utfint;
 #else
 #else
 typedef unsigned long utfint;
 typedef unsigned long utfint;
@@ -46,38 +48,46 @@ static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
 
 
 
 
 /*
 /*
-** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
+** Decode one UTF-8 sequence, returning NULL if byte sequence is
+** invalid.  The array 'limits' stores the minimum value for each
+** sequence length, to check for overlong representations. Its first
+** entry forces an error for non-ascii bytes with no continuation
+** bytes (count == 0).
 */
 */
-static const char *utf8_decode (const char *o, utfint *val) {
-  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
-  const unsigned char *s = (const unsigned char *)o;
-  unsigned int c = s[0];
+static const char *utf8_decode (const char *s, utfint *val, int strict) {
+  static const utfint limits[] =
+        {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
+  unsigned int c = (unsigned char)s[0];
   utfint res = 0;  /* final result */
   utfint res = 0;  /* final result */
   if (c < 0x80)  /* ascii? */
   if (c < 0x80)  /* ascii? */
     res = c;
     res = c;
   else {
   else {
     int count = 0;  /* to count number of continuation bytes */
     int count = 0;  /* to count number of continuation bytes */
-    while (c & 0x40) {  /* still have continuation bytes? */
-      int cc = s[++count];  /* read next byte */
+    for (; c & 0x40; c <<= 1) {  /* while it needs continuation bytes... */
+      unsigned int cc = (unsigned char)s[++count];  /* read next byte */
       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
         return NULL;  /* invalid byte sequence */
         return NULL;  /* invalid byte sequence */
       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
-      c <<= 1;  /* to test next bit */
     }
     }
     res |= ((utfint)(c & 0x7F) << (count * 5));  /* add first byte */
     res |= ((utfint)(c & 0x7F) << (count * 5));  /* add first byte */
-    if (count > 3 || res > MAXUNICODE || res <= limits[count])
+    if (count > 5 || res > MAXUTF || res < limits[count])
       return NULL;  /* invalid byte sequence */
       return NULL;  /* invalid byte sequence */
     s += count;  /* skip continuation bytes read */
     s += count;  /* skip continuation bytes read */
   }
   }
+  if (strict) {
+    /* check for invalid code points; too large or surrogates */
+    if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
+      return NULL;
+  }
   if (val) *val = res;
   if (val) *val = res;
-  return (const char *)s + 1;  /* +1 to include first byte */
+  return s + 1;  /* +1 to include first byte */
 }
 }
 
 
 
 
 /*
 /*
-** utf8len(s [, i [, j]]) --> number of characters that start in the
-** range [i,j], or nil + current position if 's' is not well formed in
-** that interval
+** utf8len(s [, i [, j [, nonstrict]]]) --> number of characters that
+** start in the range [i,j], or nil + current position if 's' is not
+** well formed in that interval
 */
 */
 static int utflen (lua_State *L) {
 static int utflen (lua_State *L) {
   lua_Integer n = 0;  /* counter for the number of characters */
   lua_Integer n = 0;  /* counter for the number of characters */
@@ -85,12 +95,13 @@ static int utflen (lua_State *L) {
   const char *s = luaL_checklstring(L, 1, &len);
   const char *s = luaL_checklstring(L, 1, &len);
   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
+  int nonstrict = lua_toboolean(L, 4);
   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
                    "initial position out of string");
                    "initial position out of string");
   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
                    "final position out of string");
                    "final position out of string");
   while (posi <= posj) {
   while (posi <= posj) {
-    const char *s1 = utf8_decode(s + posi, NULL);
+    const char *s1 = utf8_decode(s + posi, NULL, !nonstrict);
     if (s1 == NULL) {  /* conversion error? */
     if (s1 == NULL) {  /* conversion error? */
       lua_pushnil(L);  /* return nil ... */
       lua_pushnil(L);  /* return nil ... */
       lua_pushinteger(L, posi + 1);  /* ... and current position */
       lua_pushinteger(L, posi + 1);  /* ... and current position */
@@ -105,14 +116,15 @@ static int utflen (lua_State *L) {
 
 
 
 
 /*
 /*
-** codepoint(s, [i, [j]])  -> returns codepoints for all characters
-** that start in the range [i,j]
+** codepoint(s, [i, [j [, nonstrict]]]) -> returns codepoints for all
+** characters that start in the range [i,j]
 */
 */
 static int codepoint (lua_State *L) {
 static int codepoint (lua_State *L) {
   size_t len;
   size_t len;
   const char *s = luaL_checklstring(L, 1, &len);
   const char *s = luaL_checklstring(L, 1, &len);
   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
+  int nonstrict = lua_toboolean(L, 4);
   int n;
   int n;
   const char *se;
   const char *se;
   luaL_argcheck(L, posi >= 1, 2, "out of range");
   luaL_argcheck(L, posi >= 1, 2, "out of range");
@@ -126,7 +138,7 @@ static int codepoint (lua_State *L) {
   se = s + pose;  /* string end */
   se = s + pose;  /* string end */
   for (s += posi - 1; s < se;) {
   for (s += posi - 1; s < se;) {
     utfint code;
     utfint code;
-    s = utf8_decode(s, &code);
+    s = utf8_decode(s, &code, !nonstrict);
     if (s == NULL)
     if (s == NULL)
       return luaL_error(L, "invalid UTF-8 code");
       return luaL_error(L, "invalid UTF-8 code");
     lua_pushinteger(L, code);
     lua_pushinteger(L, code);
@@ -137,8 +149,8 @@ static int codepoint (lua_State *L) {
 
 
 
 
 static void pushutfchar (lua_State *L, int arg) {
 static void pushutfchar (lua_State *L, int arg) {
-  lua_Integer code = luaL_checkinteger(L, arg);
-  luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
+  lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
+  luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
   lua_pushfstring(L, "%U", (long)code);
   lua_pushfstring(L, "%U", (long)code);
 }
 }
 
 
@@ -209,7 +221,7 @@ static int byteoffset (lua_State *L) {
 }
 }
 
 
 
 
-static int iter_aux (lua_State *L) {
+static int iter_aux (lua_State *L, int strict) {
   size_t len;
   size_t len;
   const char *s = luaL_checklstring(L, 1, &len);
   const char *s = luaL_checklstring(L, 1, &len);
   lua_Integer n = lua_tointeger(L, 2) - 1;
   lua_Integer n = lua_tointeger(L, 2) - 1;
@@ -223,8 +235,8 @@ static int iter_aux (lua_State *L) {
     return 0;  /* no more codepoints */
     return 0;  /* no more codepoints */
   else {
   else {
     utfint code;
     utfint code;
-    const char *next = utf8_decode(s + n, &code);
-    if (next == NULL || iscont(next))
+    const char *next = utf8_decode(s + n, &code, strict);
+    if (next == NULL)
       return luaL_error(L, "invalid UTF-8 code");
       return luaL_error(L, "invalid UTF-8 code");
     lua_pushinteger(L, n + 1);
     lua_pushinteger(L, n + 1);
     lua_pushinteger(L, code);
     lua_pushinteger(L, code);
@@ -233,9 +245,19 @@ static int iter_aux (lua_State *L) {
 }
 }
 
 
 
 
+static int iter_auxstrict (lua_State *L) {
+  return iter_aux(L, 1);
+}
+
+static int iter_auxnostrict (lua_State *L) {
+  return iter_aux(L, 0);
+}
+
+
 static int iter_codes (lua_State *L) {
 static int iter_codes (lua_State *L) {
+  int nonstrict = lua_toboolean(L, 2);
   luaL_checkstring(L, 1);
   luaL_checkstring(L, 1);
-  lua_pushcfunction(L, iter_aux);
+  lua_pushcfunction(L, nonstrict ? iter_auxnostrict : iter_auxstrict);
   lua_pushvalue(L, 1);
   lua_pushvalue(L, 1);
   lua_pushinteger(L, 0);
   lua_pushinteger(L, 0);
   return 3;
   return 3;
@@ -243,7 +265,7 @@ static int iter_codes (lua_State *L) {
 
 
 
 
 /* pattern to match a single UTF-8 character */
 /* pattern to match a single UTF-8 character */
-#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
+#define UTF8PATT	"[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
 
 
 
 
 static const luaL_Reg funcs[] = {
 static const luaL_Reg funcs[] = {

+ 39 - 4
manual/manual.of

@@ -1004,6 +1004,8 @@ the escape sequence @T{\u{@rep{XXX}}}
 (note the mandatory enclosing brackets),
 (note the mandatory enclosing brackets),
 where @rep{XXX} is a sequence of one or more hexadecimal digits
 where @rep{XXX} is a sequence of one or more hexadecimal digits
 representing the character code point.
 representing the character code point.
+This code point can be any value smaller than @M{2@sp{31}}.
+(Lua uses the original UTF-8 specification here.)
 
 
 Literal strings can also be defined using a long format
 Literal strings can also be defined using a long format
 enclosed by @def{long brackets}.
 enclosed by @def{long brackets}.
@@ -6899,6 +6901,7 @@ x = string.gsub("$name-$version.tar.gz", "%$(%w+)", t)
 }
 }
 
 
 @LibEntry{string.len (s)|
 @LibEntry{string.len (s)|
+
 Receives a string and returns its length.
 Receives a string and returns its length.
 The empty string @T{""} has length 0.
 The empty string @T{""} has length 0.
 Embedded zeros are counted,
 Embedded zeros are counted,
@@ -6907,6 +6910,7 @@ so @T{"a\000bc\000"} has length 5.
 }
 }
 
 
 @LibEntry{string.lower (s)|
 @LibEntry{string.lower (s)|
+
 Receives a string and returns a copy of this string with all
 Receives a string and returns a copy of this string with all
 uppercase letters changed to lowercase.
 uppercase letters changed to lowercase.
 All other characters are left unchanged.
 All other characters are left unchanged.
@@ -6915,6 +6919,7 @@ The definition of what an uppercase letter is depends on the current locale.
 }
 }
 
 
 @LibEntry{string.match (s, pattern [, init])|
 @LibEntry{string.match (s, pattern [, init])|
+
 Looks for the first @emph{match} of
 Looks for the first @emph{match} of
 @id{pattern} @see{pm} in the string @id{s}.
 @id{pattern} @see{pm} in the string @id{s}.
 If it finds one, then @id{match} returns
 If it finds one, then @id{match} returns
@@ -6946,6 +6951,7 @@ The format string cannot have the variable-length options
 }
 }
 
 
 @LibEntry{string.rep (s, n [, sep])|
 @LibEntry{string.rep (s, n [, sep])|
+
 Returns a string that is the concatenation of @id{n} copies of
 Returns a string that is the concatenation of @id{n} copies of
 the string @id{s} separated by the string @id{sep}.
 the string @id{s} separated by the string @id{sep}.
 The default value for @id{sep} is the empty string
 The default value for @id{sep} is the empty string
@@ -6958,11 +6964,13 @@ with a single call to this function.)
 }
 }
 
 
 @LibEntry{string.reverse (s)|
 @LibEntry{string.reverse (s)|
+
 Returns a string that is the string @id{s} reversed.
 Returns a string that is the string @id{s} reversed.
 
 
 }
 }
 
 
 @LibEntry{string.sub (s, i [, j])|
 @LibEntry{string.sub (s, i [, j])|
+
 Returns the substring of @id{s} that
 Returns the substring of @id{s} that
 starts at @id{i}  and continues until @id{j};
 starts at @id{i}  and continues until @id{j};
 @id{i} and @id{j} can be negative.
 @id{i} and @id{j} can be negative.
@@ -6998,6 +7006,7 @@ this function also returns the index of the first unread byte in @id{s}.
 }
 }
 
 
 @LibEntry{string.upper (s)|
 @LibEntry{string.upper (s)|
+
 Receives a string and returns a copy of this string with all
 Receives a string and returns a copy of this string with all
 lowercase letters changed to uppercase.
 lowercase letters changed to uppercase.
 All other characters are left unchanged.
 All other characters are left unchanged.
@@ -7318,8 +7327,24 @@ or one plus the length of the subject string.
 As in the string library,
 As in the string library,
 negative indices count from the end of the string.
 negative indices count from the end of the string.
 
 
+Functions that create byte sequences
+accept all values up to @T{0x7FFFFFFF},
+as defined in the original UTF-8 specification;
+that implies byte sequences of up to six bytes.
+
+Functions that interpret byte sequences only accept
+valid sequences (well formed and not overlong).
+By default, they only accept byte sequences
+that result in valid Unicode code points,
+rejecting values larger than @T{0x10FFFF} and surrogates.
+A boolean argument @id{nonstrict}, when available,
+lifts these checks,
+so that all values up to @T{0x7FFFFFFF} are accepted.
+(Sequences that are not well formed or that are overlong are still rejected.)
+
 
 
 @LibEntry{utf8.char (@Cdots)|
 @LibEntry{utf8.char (@Cdots)|
+
 Receives zero or more integers,
 Receives zero or more integers,
 converts each one to its corresponding UTF-8 byte sequence
 converts each one to its corresponding UTF-8 byte sequence
 and returns a string with the concatenation of all these sequences.
 and returns a string with the concatenation of all these sequences.
@@ -7327,14 +7352,15 @@ and returns a string with the concatenation of all these sequences.
 }
 }
 
 
 @LibEntry{utf8.charpattern|
 @LibEntry{utf8.charpattern|
-The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xF4][\x80-\xBF]*}
+
+The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xFD][\x80-\xBF]*}
 @see{pm},
 @see{pm},
 which matches exactly one UTF-8 byte sequence,
 which matches exactly one UTF-8 byte sequence,
 assuming that the subject is a valid UTF-8 string.
 assuming that the subject is a valid UTF-8 string.
 
 
 }
 }
 
 
-@LibEntry{utf8.codes (s)|
+@LibEntry{utf8.codes (s [, nonstrict])|
 
 
 Returns values so that the construction
 Returns values so that the construction
 @verbatim{
 @verbatim{
@@ -7347,7 +7373,8 @@ It raises an error if it meets any invalid byte sequence.
 
 
 }
 }
 
 
-@LibEntry{utf8.codepoint (s [, i [, j]])|
+@LibEntry{utf8.codepoint (s [, i [, j [, nonstrict]]])|
+
 Returns the codepoints (as integers) from all characters in @id{s}
 Returns the codepoints (as integers) from all characters in @id{s}
 that start between byte position @id{i} and @id{j} (both included).
 that start between byte position @id{i} and @id{j} (both included).
 The default for @id{i} is 1 and for @id{j} is @id{i}.
 The default for @id{i} is 1 and for @id{j} is @id{i}.
@@ -7355,7 +7382,8 @@ It raises an error if it meets any invalid byte sequence.
 
 
 }
 }
 
 
-@LibEntry{utf8.len (s [, i [, j]])|
+@LibEntry{utf8.len (s [, i [, j [, nonstrict]]])|
+
 Returns the number of UTF-8 characters in string @id{s}
 Returns the number of UTF-8 characters in string @id{s}
 that start between positions @id{i} and @id{j} (both inclusive).
 that start between positions @id{i} and @id{j} (both inclusive).
 The default for @id{i} is @num{1} and for @id{j} is @num{-1}.
 The default for @id{i} is @num{1} and for @id{j} is @num{-1}.
@@ -7365,6 +7393,7 @@ returns a false value plus the position of the first invalid byte.
 }
 }
 
 
 @LibEntry{utf8.offset (s, n [, i])|
 @LibEntry{utf8.offset (s, n [, i])|
+
 Returns the position (in bytes) where the encoding of the
 Returns the position (in bytes) where the encoding of the
 @id{n}-th character of @id{s}
 @id{n}-th character of @id{s}
 (counting from position @id{i}) starts.
 (counting from position @id{i}) starts.
@@ -8755,6 +8784,12 @@ You can enclose the call in parentheses if you need to
 discard these extra results.
 discard these extra results.
 }
 }
 
 
+@item{
+By default, the decoding functions in the @Lid{utf8} library
+do not accept surrogates as valid code points.
+An extra parameter in these functions makes them more permissive.
+}
+
 }
 }
 
 
 }
 }

+ 12 - 5
testes/literals.lua

@@ -56,16 +56,23 @@ assert("abc\z
 assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
 assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
 
 
 -- limits for 1-byte sequences
 -- limits for 1-byte sequences
-assert("\u{0}\u{7F}" == "\x00\z\x7F")
+assert("\u{0}\u{7F}" == "\x00\x7F")
 
 
 -- limits for 2-byte sequences
 -- limits for 2-byte sequences
-assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF")
+assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
 
 
 -- limits for 3-byte sequences
 -- limits for 3-byte sequences
-assert("\u{800}\u{FFFF}" ==   "\xE0\xA0\x80\z\xEF\xBF\xBF")
+assert("\u{800}\u{FFFF}" ==   "\xE0\xA0\x80\xEF\xBF\xBF")
 
 
 -- limits for 4-byte sequences
 -- limits for 4-byte sequences
-assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF")
+assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
+
+-- limits for 5-byte sequences
+assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
+
+-- limits for 6-byte sequences
+assert("\u{4000000}\u{7FFFFFFF}" ==
+       "\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
 
 
 
 
 -- Error in escape sequences
 -- Error in escape sequences
@@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
 lexerror([["   \256"]], [[\256"]])
 lexerror([["   \256"]], [[\256"]])
 
 
 -- errors in UTF-8 sequences
 -- errors in UTF-8 sequences
-lexerror([["abc\u{110000}"]], [[abc\u{110000]])   -- too large
+lexerror([["abc\u{100000000}"]], [[abc\u{100000000]])   -- too large
 lexerror([["abc\u11r"]], [[abc\u1]])    -- missing '{'
 lexerror([["abc\u11r"]], [[abc\u1]])    -- missing '{'
 lexerror([["abc\u"]], [[abc\u"]])    -- missing '{'
 lexerror([["abc\u"]], [[abc\u"]])    -- missing '{'
 lexerror([["abc\u{11r"]], [[abc\u{11r]])    -- missing '}'
 lexerror([["abc\u{11r"]], [[abc\u{11r]])    -- missing '}'

+ 60 - 32
testes/utf8.lua

@@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$"
 
 
 -- 't' is the list of codepoints of 's'
 -- 't' is the list of codepoints of 's'
 local function checksyntax (s, t)
 local function checksyntax (s, t)
+  -- creates a string "return '\u{t[1]}...\u{t[n]}'"
   local ts = {"return '"}
   local ts = {"return '"}
   for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
   for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
   ts[#t + 2] = "'"
   ts[#t + 2] = "'"
   ts = table.concat(ts)
   ts = table.concat(ts)
+  -- its execution should result in 's'
   assert(assert(load(ts))() == s)
   assert(assert(load(ts))() == s)
 end
 end
 
 
 assert(utf8.offset("alo", 5) == nil)
 assert(utf8.offset("alo", 5) == nil)
 assert(utf8.offset("alo", -4) == nil)
 assert(utf8.offset("alo", -4) == nil)
 
 
--- 't' is the list of codepoints of 's'
-local function check (s, t)
-  local l = utf8.len(s) 
+-- 'check' makes several tests over the validity of string 's'.
+-- 't' is the list of codepoints of 's'.
+local function check (s, t, nonstrict)
+  local l = utf8.len(s, 1, -1, nonstrict)
   assert(#t == l and len(s) == l)
   assert(#t == l and len(s) == l)
-  assert(utf8.char(table.unpack(t)) == s)
+  assert(utf8.char(table.unpack(t)) == s)   -- 't' and 's' are equivalent
 
 
   assert(utf8.offset(s, 0) == 1)
   assert(utf8.offset(s, 0) == 1)
 
 
   checksyntax(s, t)
   checksyntax(s, t)
 
 
-  local t1 = {utf8.codepoint(s, 1, -1)}
+  -- creates new table with all codepoints of 's'
+  local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
   assert(#t == #t1)
   assert(#t == #t1)
-  for i = 1, #t do assert(t[i] == t1[i]) end
+  for i = 1, #t do assert(t[i] == t1[i]) end   -- 't' is equal to 't1'
 
 
-  for i = 1, l do
+  for i = 1, l do   -- for all codepoints
     local pi = utf8.offset(s, i)        -- position of i-th char
     local pi = utf8.offset(s, i)        -- position of i-th char
     local pi1 = utf8.offset(s, 2, pi)   -- position of next char
     local pi1 = utf8.offset(s, 2, pi)   -- position of next char
     assert(string.find(string.sub(s, pi, pi1 - 1), justone))
     assert(string.find(string.sub(s, pi, pi1 - 1), justone))
     assert(utf8.offset(s, -1, pi1) == pi)
     assert(utf8.offset(s, -1, pi1) == pi)
     assert(utf8.offset(s, i - l - 1) == pi)
     assert(utf8.offset(s, i - l - 1) == pi)
-    assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi)))
-    for j = pi, pi1 - 1 do 
+    assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
+    for j = pi, pi1 - 1 do
       assert(utf8.offset(s, 0, j) == pi)
       assert(utf8.offset(s, 0, j) == pi)
     end
     end
     for j = pi + 1, pi1 - 1 do
     for j = pi + 1, pi1 - 1 do
       assert(not utf8.len(s, j))
       assert(not utf8.len(s, j))
     end
     end
-   assert(utf8.len(s, pi, pi) == 1)
-   assert(utf8.len(s, pi, pi1 - 1) == 1)
-   assert(utf8.len(s, pi) == l - i + 1)
-   assert(utf8.len(s, pi1) == l - i)
-   assert(utf8.len(s, 1, pi) == i)
+   assert(utf8.len(s, pi, pi, nonstrict) == 1)
+   assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
+   assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
+   assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
+   assert(utf8.len(s, 1, pi, -1, nonstrict) == i)
   end
   end
 
 
   local i = 0
   local i = 0
-  for p, c in utf8.codes(s) do
+  for p, c in utf8.codes(s, nonstrict) do
     i = i + 1
     i = i + 1
     assert(c == t[i] and p == utf8.offset(s, i))
     assert(c == t[i] and p == utf8.offset(s, i))
-    assert(utf8.codepoint(s, p) == c)
-  end
-  assert(i == #t)
-
-  i = 0
-  for p, c in utf8.codes(s) do
-    i = i + 1
-    assert(c == t[i] and p == utf8.offset(s, i)) 
+    assert(utf8.codepoint(s, p, p, nonstrict) == c)
   end
   end
   assert(i == #t)
   assert(i == #t)
 
 
@@ -105,13 +102,17 @@ do    -- error indication in utf8.len
   check("\xF4\x9F\xBF\xBF", 1)
   check("\xF4\x9F\xBF\xBF", 1)
 end
 end
 
 
--- error in utf8.codes
-checkerror("invalid UTF%-8 code",
-  function ()
-    local s = "ab\xff"
-    for c in utf8.codes(s) do assert(c) end
-  end)
-
+-- errors in utf8.codes
+do
+  local function errorcodes (s)
+    checkerror("invalid UTF%-8 code",
+      function ()
+        for c in utf8.codes(s) do assert(c) end
+      end)
+  end
+  errorcodes("ab\xff")
+  errorcodes("\u{110000}")
+end
 
 
 -- error in initial position for offset
 -- error in initial position for offset
 checkerror("position out of range", utf8.offset, "abc", 1, 5)
 checkerror("position out of range", utf8.offset, "abc", 1, 5)
@@ -141,14 +142,22 @@ do
   assert(#t == 0)
   assert(#t == 0)
   checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
   checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
   checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
   checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
+  -- surrogates
+  assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
+  assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
+  assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
+  assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
+  assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF)
 end
 end
 
 
 assert(utf8.char() == "")
 assert(utf8.char() == "")
-assert(utf8.char(97, 98, 99) == "abc")
+assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
 
 
 assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
 assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
+assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1)
 
 
-checkerror("value out of range", utf8.char, 0x10FFFF + 1)
+checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
+checkerror("value out of range", utf8.char, -1)
 
 
 local function invalid (s)
 local function invalid (s)
   checkerror("invalid UTF%-8 code", utf8.codepoint, s)
   checkerror("invalid UTF%-8 code", utf8.codepoint, s)
@@ -158,6 +167,10 @@ end
 -- UTF-8 representation for 0x11ffff (value out of valid range)
 -- UTF-8 representation for 0x11ffff (value out of valid range)
 invalid("\xF4\x9F\xBF\xBF")
 invalid("\xF4\x9F\xBF\xBF")
 
 
+-- surrogates
+invalid("\u{D800}")
+invalid("\u{DFFF}")
+
 -- overlong sequences
 -- overlong sequences
 invalid("\xC0\x80")          -- zero
 invalid("\xC0\x80")          -- zero
 invalid("\xC1\xBF")          -- 0x7F (should be coded in 1 byte)
 invalid("\xC1\xBF")          -- 0x7F (should be coded in 1 byte)
@@ -183,6 +196,21 @@ s = "\0 \x7F\z
 s = string.gsub(s, " ", "")
 s = string.gsub(s, " ", "")
 check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
 check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
 
 
+do
+  -- original UTF-8 values
+  local s = "\u{4000000}\u{7FFFFFFF}"
+  assert(#s == 12)
+  check(s, {0x4000000, 0x7FFFFFFF}, true)
+
+  s = "\u{200000}\u{3FFFFFF}"
+  assert(#s == 10)
+  check(s, {0x200000, 0x3FFFFFF}, true)
+
+  s = "\u{10000}\u{1fffff}"
+  assert(#s == 8)
+  check(s, {0x10000, 0x1FFFFF}, true)
+end
+
 x = "日本語a-4\0éó"
 x = "日本語a-4\0éó"
 check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})
 check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})