lutf8lib.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. /*
  2. ** $Id: lutf8lib.c $
  3. ** Standard library for UTF-8 manipulation
  4. ** See Copyright Notice in lua.h
  5. */
  6. #define lutf8lib_c
  7. #define LUA_LIB
  8. #include "lprefix.h"
  9. #include <limits.h>
  10. #include <stdlib.h>
  11. #include <string.h>
  12. #include "lua.h"
  13. #include "lauxlib.h"
  14. #include "lualib.h"
  15. #include "llimits.h"
  16. #define MAXUNICODE 0x10FFFFu
  17. #define MAXUTF 0x7FFFFFFFu
  18. #define MSGInvalid "invalid UTF-8 code"
  19. #define iscont(c) (((c) & 0xC0) == 0x80)
  20. #define iscontp(p) iscont(*(p))
  21. /* from strlib */
  22. /* translate a relative string position: negative means back from end */
  23. static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
  24. if (pos >= 0) return pos;
  25. else if (0u - (size_t)pos > len) return 0;
  26. else return (lua_Integer)len + pos + 1;
  27. }
  28. /*
  29. ** Decode one UTF-8 sequence, returning NULL if byte sequence is
  30. ** invalid. The array 'limits' stores the minimum value for each
  31. ** sequence length, to check for overlong representations. Its first
  32. ** entry forces an error for non-ASCII bytes with no continuation
  33. ** bytes (count == 0).
  34. */
  35. static const char *utf8_decode (const char *s, l_uint32 *val, int strict) {
  36. static const l_uint32 limits[] =
  37. {~(l_uint32)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
  38. unsigned int c = (unsigned char)s[0];
  39. l_uint32 res = 0; /* final result */
  40. if (c < 0x80) /* ASCII? */
  41. res = c;
  42. else {
  43. int count = 0; /* to count number of continuation bytes */
  44. for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
  45. unsigned int cc = (unsigned char)s[++count]; /* read next byte */
  46. if (!iscont(cc)) /* not a continuation byte? */
  47. return NULL; /* invalid byte sequence */
  48. res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
  49. }
  50. res |= ((l_uint32)(c & 0x7F) << (count * 5)); /* add first byte */
  51. if (count > 5 || res > MAXUTF || res < limits[count])
  52. return NULL; /* invalid byte sequence */
  53. s += count; /* skip continuation bytes read */
  54. }
  55. if (strict) {
  56. /* check for invalid code points; too large or surrogates */
  57. if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
  58. return NULL;
  59. }
  60. if (val) *val = res;
  61. return s + 1; /* +1 to include first byte */
  62. }
  63. /*
  64. ** utf8len(s [, i [, j [, lax]]]) --> number of characters that
  65. ** start in the range [i,j], or nil + current position if 's' is not
  66. ** well formed in that interval
  67. */
  68. static int utflen (lua_State *L) {
  69. lua_Integer n = 0; /* counter for the number of characters */
  70. size_t len; /* string length in bytes */
  71. const char *s = luaL_checklstring(L, 1, &len);
  72. lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  73. lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
  74. int lax = lua_toboolean(L, 4);
  75. luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
  76. "initial position out of bounds");
  77. luaL_argcheck(L, --posj < (lua_Integer)len, 3,
  78. "final position out of bounds");
  79. while (posi <= posj) {
  80. const char *s1 = utf8_decode(s + posi, NULL, !lax);
  81. if (s1 == NULL) { /* conversion error? */
  82. luaL_pushfail(L); /* return fail ... */
  83. lua_pushinteger(L, posi + 1); /* ... and current position */
  84. return 2;
  85. }
  86. posi = ct_diff2S(s1 - s);
  87. n++;
  88. }
  89. lua_pushinteger(L, n);
  90. return 1;
  91. }
  92. /*
  93. ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all
  94. ** characters that start in the range [i,j]
  95. */
  96. static int codepoint (lua_State *L) {
  97. size_t len;
  98. const char *s = luaL_checklstring(L, 1, &len);
  99. lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  100. lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
  101. int lax = lua_toboolean(L, 4);
  102. int n;
  103. const char *se;
  104. luaL_argcheck(L, posi >= 1, 2, "out of bounds");
  105. luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds");
  106. if (posi > pose) return 0; /* empty interval; return no values */
  107. if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */
  108. return luaL_error(L, "string slice too long");
  109. n = (int)(pose - posi) + 1; /* upper bound for number of returns */
  110. luaL_checkstack(L, n, "string slice too long");
  111. n = 0; /* count the number of returns */
  112. se = s + pose; /* string end */
  113. for (s += posi - 1; s < se;) {
  114. l_uint32 code;
  115. s = utf8_decode(s, &code, !lax);
  116. if (s == NULL)
  117. return luaL_error(L, MSGInvalid);
  118. lua_pushinteger(L, l_castU2S(code));
  119. n++;
  120. }
  121. return n;
  122. }
  123. static void pushutfchar (lua_State *L, int arg) {
  124. lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
  125. luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
  126. lua_pushfstring(L, "%U", (long)code);
  127. }
  128. /*
  129. ** utfchar(n1, n2, ...) -> char(n1)..char(n2)...
  130. */
  131. static int utfchar (lua_State *L) {
  132. int n = lua_gettop(L); /* number of arguments */
  133. if (n == 1) /* optimize common case of single char */
  134. pushutfchar(L, 1);
  135. else {
  136. int i;
  137. luaL_Buffer b;
  138. luaL_buffinit(L, &b);
  139. for (i = 1; i <= n; i++) {
  140. pushutfchar(L, i);
  141. luaL_addvalue(&b);
  142. }
  143. luaL_pushresult(&b);
  144. }
  145. return 1;
  146. }
  147. /*
  148. ** offset(s, n, [i]) -> indices where n-th character counting from
  149. ** position 'i' starts and ends; 0 means character at 'i'.
  150. */
  151. static int byteoffset (lua_State *L) {
  152. size_t len;
  153. const char *s = luaL_checklstring(L, 1, &len);
  154. lua_Integer n = luaL_checkinteger(L, 2);
  155. lua_Integer posi = (n >= 0) ? 1 : cast_st2S(len) + 1;
  156. posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
  157. luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
  158. "position out of bounds");
  159. if (n == 0) {
  160. /* find beginning of current byte sequence */
  161. while (posi > 0 && iscontp(s + posi)) posi--;
  162. }
  163. else {
  164. if (iscontp(s + posi))
  165. return luaL_error(L, "initial position is a continuation byte");
  166. if (n < 0) {
  167. while (n < 0 && posi > 0) { /* move back */
  168. do { /* find beginning of previous character */
  169. posi--;
  170. } while (posi > 0 && iscontp(s + posi));
  171. n++;
  172. }
  173. }
  174. else {
  175. n--; /* do not move for 1st character */
  176. while (n > 0 && posi < (lua_Integer)len) {
  177. do { /* find beginning of next character */
  178. posi++;
  179. } while (iscontp(s + posi)); /* (cannot pass final '\0') */
  180. n--;
  181. }
  182. }
  183. }
  184. if (n != 0) { /* did not find given character? */
  185. luaL_pushfail(L);
  186. return 1;
  187. }
  188. lua_pushinteger(L, posi + 1); /* initial position */
  189. if ((s[posi] & 0x80) != 0) { /* multi-byte character? */
  190. if (iscont(s[posi]))
  191. return luaL_error(L, "initial position is a continuation byte");
  192. while (iscontp(s + posi + 1))
  193. posi++; /* skip to last continuation byte */
  194. }
  195. /* else one-byte character: final position is the initial one */
  196. lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */
  197. return 2;
  198. }
  199. static int iter_aux (lua_State *L, int strict) {
  200. size_t len;
  201. const char *s = luaL_checklstring(L, 1, &len);
  202. lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
  203. if (n < len) {
  204. while (iscontp(s + n)) n++; /* go to next character */
  205. }
  206. if (n >= len) /* (also handles original 'n' being negative) */
  207. return 0; /* no more codepoints */
  208. else {
  209. l_uint32 code;
  210. const char *next = utf8_decode(s + n, &code, strict);
  211. if (next == NULL || iscontp(next))
  212. return luaL_error(L, MSGInvalid);
  213. lua_pushinteger(L, l_castU2S(n + 1));
  214. lua_pushinteger(L, l_castU2S(code));
  215. return 2;
  216. }
  217. }
  218. static int iter_auxstrict (lua_State *L) {
  219. return iter_aux(L, 1);
  220. }
  221. static int iter_auxlax (lua_State *L) {
  222. return iter_aux(L, 0);
  223. }
  224. static int iter_codes (lua_State *L) {
  225. int lax = lua_toboolean(L, 2);
  226. const char *s = luaL_checkstring(L, 1);
  227. luaL_argcheck(L, !iscontp(s), 1, MSGInvalid);
  228. lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
  229. lua_pushvalue(L, 1);
  230. lua_pushinteger(L, 0);
  231. return 3;
  232. }
  233. /* pattern to match a single UTF-8 character */
  234. #define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
  235. static const luaL_Reg funcs[] = {
  236. {"offset", byteoffset},
  237. {"codepoint", codepoint},
  238. {"char", utfchar},
  239. {"len", utflen},
  240. {"codes", iter_codes},
  241. /* placeholders */
  242. {"charpattern", NULL},
  243. {NULL, NULL}
  244. };
  245. LUAMOD_API int luaopen_utf8 (lua_State *L) {
  246. luaL_newlib(L, funcs);
  247. lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
  248. lua_setfield(L, -2, "charpattern");
  249. return 1;
  250. }