浏览代码

Correction in utf8.offset

A malformed UTF-8 character may have no continuation bytes.
Roberto Ierusalimschy 1 周之前
父节点
当前提交
ccb8b307f1
共有 2 个文件被更改，包括 13 次插入和 3 次删除
  1. 4 3
      lutf8lib.c
  2. 9 0
      testes/utf8.lua

+ 4 - 3
lutf8lib.c

@@ -215,9 +215,10 @@ static int byteoffset (lua_State *L) {
   }
   }
   lua_pushinteger(L, posi + 1);  /* initial position */
   lua_pushinteger(L, posi + 1);  /* initial position */
   if ((s[posi] & 0x80) != 0) {  /* multi-byte character? */
   if ((s[posi] & 0x80) != 0) {  /* multi-byte character? */
-    do {
-      posi++;
-    } while (iscontp(s + posi + 1));  /* skip to final byte */
+    if (iscont(s[posi]))
+      return luaL_error(L, "initial position is a continuation byte");
+    while (iscontp(s + posi + 1))
+      posi++;  /* skip to last continuation byte */
   }
   }
   /* else one-byte character: final position is the initial one */
   /* else one-byte character: final position is the initial one */
   lua_pushinteger(L, posi + 1);  /* 'posi' now is the final position */
   lua_pushinteger(L, posi + 1);  /* 'posi' now is the final position */

+ 9 - 0
testes/utf8.lua

@@ -152,11 +152,20 @@ checkerror("position out of bounds", utf8.offset, "", 1, -1)
 checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
 checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
 checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
 checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
 checkerror("continuation byte", utf8.offset, "\x80", 1)
 checkerror("continuation byte", utf8.offset, "\x80", 1)
+checkerror("continuation byte", utf8.offset, "\x9c", -1)
 
 
 -- error in indices for len
 -- error in indices for len
 checkerror("out of bounds", utf8.len, "abc", 0, 2)
 checkerror("out of bounds", utf8.len, "abc", 0, 2)
 checkerror("out of bounds", utf8.len, "abc", 1, 4)
 checkerror("out of bounds", utf8.len, "abc", 1, 4)
 
 
+do  -- missing continuation bytes
+  -- get what is available
+  local p, e = utf8.offset("\xE0", 1)
+  assert(p == 1 and e == 1)
+  local p, e = utf8.offset("\xE0\x9e", -1)
+  assert(p == 1 and e == 2)
+end
+
 
 
 local s = "hello World"
 local s = "hello World"
 local t = {string.byte(s, 1, -1)}
 local t = {string.byte(s, 1, -1)}