Переглянути джерело

Correction in utf8.offset

Wrong utf-8 character may have no continuation bytes.
Roberto Ierusalimschy 1 тиждень тому
батько
коміт
ccb8b307f1
2 змінені файли: 13 додано та 3 видалено
  1. lutf8lib.c (+4 −3)
  2. testes/utf8.lua (+9 −0)

+ 4 - 3
lutf8lib.c

@@ -215,9 +215,10 @@ static int byteoffset (lua_State *L) {
   }
   lua_pushinteger(L, posi + 1);  /* initial position */
   if ((s[posi] & 0x80) != 0) {  /* multi-byte character? */
-    do {
-      posi++;
-    } while (iscontp(s + posi + 1));  /* skip to final byte */
+    if (iscont(s[posi]))
+      return luaL_error(L, "initial position is a continuation byte");
+    while (iscontp(s + posi + 1))
+      posi++;  /* skip to last continuation byte */
   }
   /* else one-byte character: final position is the initial one */
   lua_pushinteger(L, posi + 1);  /* 'posi' now is the final position */

+ 9 - 0
testes/utf8.lua

@@ -152,11 +152,20 @@ checkerror("position out of bounds", utf8.offset, "", 1, -1)
 checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
 checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
 checkerror("continuation byte", utf8.offset, "\x80", 1)
+checkerror("continuation byte", utf8.offset, "\x9c", -1)
 
 -- error in indices for len
 checkerror("out of bounds", utf8.len, "abc", 0, 2)
 checkerror("out of bounds", utf8.len, "abc", 1, 4)
 
+do  -- missing continuation bytes
+  -- get what is available
+  local p, e = utf8.offset("\xE0", 1)
+  assert(p == 1 and e == 1)
+  local p, e = utf8.offset("\xE0\x9e", -1)
+  assert(p == 1 and e == 2)
+end
+
 
 local s = "hello World"
 local t = {string.byte(s, 1, -1)}