1 年之前 · 814213b65f
--- a/lutf8lib.c
+++ b/lutf8lib.c
@@ -181,8 +181,8 @@ static int utfchar (lua_State *L) {
 
				 
			
 
				 
			
 
				 /*
			
 
				-** offset(s, n, [i])  -> index where n-th character counting from
			
 
				-**   position 'i' starts; 0 means character at 'i'.
			
 
				+** offset(s, n, [i])  -> indices where n-th character counting from
			
 
				+**   position 'i' starts and ends; 0 means character at 'i'.
			
 
				 */
			
 
				 static int byteoffset (lua_State *L) {
			
 
				   size_t len;
			
@@ -217,11 +217,19 @@ static int byteoffset (lua_State *L) {
 
				        }
			
 
				      }
			
 
				   }
			
 
				-  if (n == 0)  /* did it find given character? */
			
 
				-    lua_pushinteger(L, posi + 1);
			
 
				-  else  /* no such character */
			
 
				+  if (n != 0) {  /* did not find given character? */
			
 
				     luaL_pushfail(L);
			
 
				-  return 1;
			
 
				+    return 1;
			
 
				+  }
			
 
				+  lua_pushinteger(L, posi + 1);  /* initial position */
			
 
				+  if ((s[posi] & 0x80) != 0) {  /* multi-byte character? */
			
 
				+    do {
			
 
				+      posi++;
			
 
				+    } while (iscontp(s + posi + 1));  /* skip to final byte */
			
 
				+  }
			
 
				+  /* else one-byte character: final position is the initial one */
			
 
				+  lua_pushinteger(L, posi + 1);  /* 'posi' now is the final position */
			
 
				+  return 2;
			
 
				 }
			
 
				 
			
 
				 
			
--- a/manual/manual.of
+++ b/manual/manual.of
@@ -7958,21 +7958,27 @@ returns @fail plus the position of the first invalid byte.
 
				 
			
 
				 @LibEntry{utf8.offset (s, n [, i])|
			
 
				 
			
 
				-Returns the position (in bytes) where the encoding of the
			
 
				-@id{n}-th character of @id{s}
			
 
				-(counting from position @id{i}) starts.
			
 
				+Returns the position of the @id{n}-th character of @id{s}
			
 
				+(counting from byte position @id{i}) as two integers:
			
 
				+The index (in bytes) where its encoding starts and the
			
 
				+index (in bytes) where it ends.
			
 
				+
			
 
				+If the specified character is right after the end of @id{s},
			
 
				+the function behaves as if there were a @Char{\0} there.
			
 
				+If the specified character is neither in the subject
			
 
				+nor right after its end,
			
 
				+the function returns @fail.
			
 
				+
			
 
				 A negative @id{n} gets characters before position @id{i}.
			
 
				 The default for @id{i} is 1 when @id{n} is non-negative
			
 
				 and @T{#s + 1} otherwise,
			
 
				 so that @T{utf8.offset(s, -n)} gets the offset of the
			
 
				 @id{n}-th character from the end of the string.
			
 
				-If the specified character is neither in the subject
			
 
				-nor right after its end,
			
 
				-the function returns @fail.
			
 
				 
			
 
				 As a special case,
			
 
				-when @id{n} is 0 the function returns the start of the encoding
			
 
				-of the character that contains the @id{i}-th byte of @id{s}.
			
 
				+when @id{n} is 0 the function returns the start and end
			
 
				+of the encoding of the character that contains the
			
 
				+@id{i}-th byte of @id{s}.
			
 
				 
			
 
				 This function assumes that @id{s} is a valid UTF-8 string.
			
 
				 
			
--- a/testes/utf8.lua
+++ b/testes/utf8.lua
@@ -52,25 +52,35 @@ local function check (s, t, nonstrict)
 
				   for i = 1, #t do assert(t[i] == t1[i]) end   -- 't' is equal to 't1'
			
 
				 
			
 
				   for i = 1, l do   -- for all codepoints
			
 
				-    local pi = utf8.offset(s, i)        -- position of i-th char
			
 
				+    local pi, pie = utf8.offset(s, i)        -- position of i-th char
			
 
				     local pi1 = utf8.offset(s, 2, pi)   -- position of next char
			
 
				+    assert(pi1 == pie + 1)
			
 
				     assert(string.find(string.sub(s, pi, pi1 - 1), justone))
			
 
				     assert(utf8.offset(s, -1, pi1) == pi)
			
 
				     assert(utf8.offset(s, i - l - 1) == pi)
			
 
				     assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
			
 
				     for j = pi, pi1 - 1 do
			
 
				-      assert(utf8.offset(s, 0, j) == pi)
			
 
				+      local off1, off2 = utf8.offset(s, 0, j)
			
 
				+      assert(off1 == pi and off2 == pi1 - 1)
			
 
				     end
			
 
				     for j = pi + 1, pi1 - 1 do
			
 
				       assert(not utf8.len(s, j))
			
 
				     end
			
 
				-   assert(utf8.len(s, pi, pi, nonstrict) == 1)
			
 
				-   assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
			
 
				-   assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
			
 
				-   assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
			
 
				-   assert(utf8.len(s, 1, pi, nonstrict) == i)
			
 
				+    assert(utf8.len(s, pi, pi, nonstrict) == 1)
			
 
				+    assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
			
 
				+    assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
			
 
				+    assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
			
 
				+    assert(utf8.len(s, 1, pi, nonstrict) == i)
			
 
				   end
			
 
				 
			
 
				+  local expected = 1    -- expected position of "current" character
			
 
				+  for i = 1, l + 1 do
			
 
				+    local p, e = utf8.offset(s, i)
			
 
				+    assert(p == expected)
			
 
				+    expected = e + 1
			
 
				+  end
			
 
				+  assert(expected - 1 == #s + 1)
			
 
				+
			
 
				   local i = 0
			
 
				   for p, c in utf8.codes(s, nonstrict) do
			
 
				     i = i + 1
			
@@ -94,20 +104,20 @@ end
 
				 
			
 
				 
			
 
				 do    -- error indication in utf8.len
			
 
				-  local function check (s, p)
			
 
				+  local function checklen (s, p)
			
 
				     local a, b = utf8.len(s)
			
 
				     assert(not a and b == p)
			
 
				   end
			
 
				-  check("abc\xE3def", 4)
			
 
				-  check("\xF4\x9F\xBF", 1)
			
 
				-  check("\xF4\x9F\xBF\xBF", 1)
			
 
				+  checklen("abc\xE3def", 4)
			
 
				+  checklen("\xF4\x9F\xBF", 1)
			
 
				+  checklen("\xF4\x9F\xBF\xBF", 1)
			
 
				   -- spurious continuation bytes
			
 
				-  check("汉字\x80", #("汉字") + 1)
			
 
				-  check("\x80hello", 1)
			
 
				-  check("hel\x80lo", 4)
			
 
				-  check("汉字\xBF", #("汉字") + 1)
			
 
				-  check("\xBFhello", 1)
			
 
				-  check("hel\xBFlo", 4)
			
 
				+  checklen("汉字\x80", #("汉字") + 1)
			
 
				+  checklen("\x80hello", 1)
			
 
				+  checklen("hel\x80lo", 4)
			
 
				+  checklen("汉字\xBF", #("汉字") + 1)
			
 
				+  checklen("\xBFhello", 1)
			
 
				+  checklen("hel\xBFlo", 4)
			
 
				 end
			
 
				 
			
 
				 -- errors in utf8.codes