lj_str.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. /*
  2. ** String handling.
  3. ** Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
  4. */
  5. #define lj_str_c
  6. #define LUA_CORE
  7. #include "lj_obj.h"
  8. #include "lj_gc.h"
  9. #include "lj_err.h"
  10. #include "lj_str.h"
  11. #include "lj_char.h"
  12. #include "lj_prng.h"
  13. /* -- String helpers ------------------------------------------------------ */
  14. /* Ordered compare of strings. Assumes string data is 4-byte aligned. */
  15. int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b)
  16. {
  17. MSize i, n = a->len > b->len ? b->len : a->len;
  18. for (i = 0; i < n; i += 4) {
  19. /* Note: innocuous access up to end of string + 3. */
  20. uint32_t va = *(const uint32_t *)(strdata(a)+i);
  21. uint32_t vb = *(const uint32_t *)(strdata(b)+i);
  22. if (va != vb) {
  23. #if LJ_LE
  24. va = lj_bswap(va); vb = lj_bswap(vb);
  25. #endif
  26. i -= n;
  27. if ((int32_t)i >= -3) {
  28. va >>= 32+(i<<3); vb >>= 32+(i<<3);
  29. if (va == vb) break;
  30. }
  31. return va < vb ? -1 : 1;
  32. }
  33. }
  34. return (int32_t)(a->len - b->len);
  35. }
  36. /* Find fixed string p inside string s. */
  37. const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen)
  38. {
  39. if (plen <= slen) {
  40. if (plen == 0) {
  41. return s;
  42. } else {
  43. int c = *(const uint8_t *)p++;
  44. plen--; slen -= plen;
  45. while (slen) {
  46. const char *q = (const char *)memchr(s, c, slen);
  47. if (!q) break;
  48. if (memcmp(q+1, p, plen) == 0) return q;
  49. q++; slen -= (MSize)(q-s); s = q;
  50. }
  51. }
  52. }
  53. return NULL;
  54. }
  55. /* Check whether a string has a pattern matching character. */
  56. int lj_str_haspattern(GCstr *s)
  57. {
  58. const char *p = strdata(s), *q = p + s->len;
  59. while (p < q) {
  60. int c = *(const uint8_t *)p++;
  61. if (lj_char_ispunct(c) && strchr("^$*+?.([%-", c))
  62. return 1; /* Found a pattern matching char. */
  63. }
  64. return 0; /* No pattern matching chars found. */
  65. }
  66. /* -- String hashing ------------------------------------------------------ */
  67. /* Keyed sparse ARX string hash. Constant time. */
  68. static StrHash hash_sparse(uint64_t seed, const char *str, MSize len)
  69. {
  70. /* Constants taken from lookup3 hash by Bob Jenkins. */
  71. StrHash a, b, h = len ^ (StrHash)seed;
  72. if (len >= 4) { /* Caveat: unaligned access! */
  73. a = lj_getu32(str);
  74. h ^= lj_getu32(str+len-4);
  75. b = lj_getu32(str+(len>>1)-2);
  76. h ^= b; h -= lj_rol(b, 14);
  77. b += lj_getu32(str+(len>>2)-1);
  78. } else {
  79. a = *(const uint8_t *)str;
  80. h ^= *(const uint8_t *)(str+len-1);
  81. b = *(const uint8_t *)(str+(len>>1));
  82. h ^= b; h -= lj_rol(b, 14);
  83. }
  84. a ^= h; a -= lj_rol(h, 11);
  85. b ^= a; b -= lj_rol(a, 25);
  86. h ^= b; h -= lj_rol(b, 16);
  87. return h;
  88. }
  89. #if LUAJIT_SECURITY_STRHASH
  90. /* Keyed dense ARX string hash. Linear time. */
  91. static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h,
  92. const char *str, MSize len)
  93. {
  94. StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4));
  95. if (len > 12) {
  96. StrHash a = (StrHash)seed;
  97. const char *pe = str+len-12, *p = pe, *q = str;
  98. do {
  99. a += lj_getu32(p);
  100. b += lj_getu32(p+4);
  101. h += lj_getu32(p+8);
  102. p = q; q += 12;
  103. h ^= b; h -= lj_rol(b, 14);
  104. a ^= h; a -= lj_rol(h, 11);
  105. b ^= a; b -= lj_rol(a, 25);
  106. } while (p < pe);
  107. h ^= b; h -= lj_rol(b, 16);
  108. a ^= h; a -= lj_rol(h, 4);
  109. b ^= a; b -= lj_rol(a, 14);
  110. }
  111. return b;
  112. }
  113. #endif
  114. /* -- String interning ---------------------------------------------------- */
  /* Collision count above which a chain is switched to the secondary hash. */
  #define LJ_STR_MAXCOLL          32
  116. /* Resize the string interning hash table (grow and shrink). */
  117. void lj_str_resize(lua_State *L, MSize newmask)
  118. {
  119. global_State *g = G(L);
  120. GCRef *newtab, *oldtab = g->str.tab;
  121. MSize i;
  122. /* No resizing during GC traversal or if already too big. */
  123. if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
  124. return;
  125. newtab = lj_mem_newvec(L, newmask+1, GCRef);
  126. memset(newtab, 0, (newmask+1)*sizeof(GCRef));
  127. #if LUAJIT_SECURITY_STRHASH
  128. /* Check which chains need secondary hashes. */
  129. if (g->str.second) {
  130. int newsecond = 0;
  131. /* Compute primary chain lengths. */
  132. for (i = g->str.mask; i != ~(MSize)0; i--) {
  133. GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
  134. while (o) {
  135. GCstr *s = gco2str(o);
  136. MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) :
  137. s->hash;
  138. hash &= newmask;
  139. setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1);
  140. o = gcnext(o);
  141. }
  142. }
  143. /* Mark secondary chains. */
  144. for (i = newmask; i != ~(MSize)0; i--) {
  145. int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL;
  146. newsecond |= secondary;
  147. setgcrefp(newtab[i], secondary);
  148. }
  149. g->str.second = newsecond;
  150. }
  151. #endif
  152. /* Reinsert all strings from the old table into the new table. */
  153. for (i = g->str.mask; i != ~(MSize)0; i--) {
  154. GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
  155. while (o) {
  156. GCobj *next = gcnext(o);
  157. GCstr *s = gco2str(o);
  158. MSize hash = s->hash;
  159. #if LUAJIT_SECURITY_STRHASH
  160. uintptr_t u;
  161. if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */
  162. hash &= newmask;
  163. u = gcrefu(newtab[hash]);
  164. if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */
  165. s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len);
  166. s->hashalg = 1;
  167. hash &= newmask;
  168. u = gcrefu(newtab[hash]);
  169. }
  170. } else { /* String hashed with secondary hash. */
  171. MSize shash = hash_sparse(g->str.seed, strdata(s), s->len);
  172. u = gcrefu(newtab[shash & newmask]);
  173. if (u & 1) {
  174. hash &= newmask;
  175. u = gcrefu(newtab[hash]);
  176. } else { /* Revert string back to primary hash. */
  177. s->hash = shash;
  178. s->hashalg = 0;
  179. hash = (shash & newmask);
  180. }
  181. }
  182. /* NOBARRIER: The string table is a GC root. */
  183. setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
  184. setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1)));
  185. #else
  186. hash &= newmask;
  187. /* NOBARRIER: The string table is a GC root. */
  188. setgcrefr(o->gch.nextgc, newtab[hash]);
  189. setgcref(newtab[hash], o);
  190. #endif
  191. o = next;
  192. }
  193. }
  194. /* Free old table and replace with new table. */
  195. lj_str_freetab(g);
  196. g->str.tab = newtab;
  197. g->str.mask = newmask;
  198. }
  199. #if LUAJIT_SECURITY_STRHASH
  200. /* Rehash and rechain all strings in a chain. */
  201. static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc,
  202. const char *str, MSize len)
  203. {
  204. global_State *g = G(L);
  205. int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */
  206. GCRef *strtab = g->str.tab;
  207. MSize strmask = g->str.mask;
  208. GCobj *o = gcref(strtab[hashc & strmask]);
  209. setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1));
  210. g->str.second = 1;
  211. while (o) {
  212. uintptr_t u;
  213. GCobj *next = gcnext(o);
  214. GCstr *s = gco2str(o);
  215. StrHash hash;
  216. if (ow) { /* Must sweep while rechaining. */
  217. if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */
  218. lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED),
  219. "sweep of undead string");
  220. makewhite(g, o);
  221. } else { /* Free dead string. */
  222. lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED,
  223. "sweep of unlive string");
  224. lj_str_free(g, s);
  225. o = next;
  226. continue;
  227. }
  228. }
  229. hash = s->hash;
  230. if (!s->hashalg) { /* Rehash with secondary hash. */
  231. hash = hash_dense(g->str.seed, hash, strdata(s), s->len);
  232. s->hash = hash;
  233. s->hashalg = 1;
  234. }
  235. /* Rechain. */
  236. hash &= strmask;
  237. u = gcrefu(strtab[hash]);
  238. setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
  239. setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1)));
  240. o = next;
  241. }
  242. /* Try to insert the pending string again. */
  243. return lj_str_new(L, str, len);
  244. }
  245. #endif
  /*
  ** Reseed String ID from PRNG after random interval < 2^bits.
  ** The number of bits is selected by LUAJIT_SECURITY_STRID:
  **   1    -> 8 bits
  **   2    -> 4 bits
  **   >= 3 -> 0 bits
  ** For any other setting the macro stays undefined.
  */
  #if LUAJIT_SECURITY_STRID >= 3
  #define STRID_RESEED_INTERVAL   0
  #elif LUAJIT_SECURITY_STRID == 2
  #define STRID_RESEED_INTERVAL   4
  #elif LUAJIT_SECURITY_STRID == 1
  #define STRID_RESEED_INTERVAL   8
  #endif
  254. /* Allocate a new string and add to string interning table. */
  255. static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len,
  256. StrHash hash, int hashalg)
  257. {
  258. GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr);
  259. global_State *g = G(L);
  260. uintptr_t u;
  261. newwhite(g, s);
  262. s->gct = ~LJ_TSTR;
  263. s->len = len;
  264. s->hash = hash;
  265. #ifndef STRID_RESEED_INTERVAL
  266. s->sid = g->str.id++;
  267. #elif STRID_RESEED_INTERVAL
  268. if (!g->str.idreseed--) {
  269. uint64_t r = lj_prng_u64(&g->prng);
  270. g->str.id = (StrID)r;
  271. g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL));
  272. }
  273. s->sid = g->str.id++;
  274. #else
  275. s->sid = (StrID)lj_prng_u64(&g->prng);
  276. #endif
  277. s->reserved = 0;
  278. s->hashalg = (uint8_t)hashalg;
  279. /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */
  280. *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0;
  281. memcpy(strdatawr(s), str, len);
  282. /* Add to string hash table. */
  283. hash &= g->str.mask;
  284. u = gcrefu(g->str.tab[hash]);
  285. setgcrefp(s->nextgc, (u & ~(uintptr_t)1));
  286. /* NOBARRIER: The string table is a GC root. */
  287. setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1)));
  288. if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */
  289. lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */
  290. return s; /* Return newly interned string. */
  291. }
  292. /* Intern a string and return string object. */
  293. GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
  294. {
  295. global_State *g = G(L);
  296. if (lenx-1 < LJ_MAX_STR-1) {
  297. MSize len = (MSize)lenx;
  298. StrHash hash = hash_sparse(g->str.seed, str, len);
  299. MSize coll = 0;
  300. int hashalg = 0;
  301. /* Check if the string has already been interned. */
  302. GCobj *o = gcref(g->str.tab[hash & g->str.mask]);
  303. #if LUAJIT_SECURITY_STRHASH
  304. if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */
  305. hashalg = 1;
  306. hash = hash_dense(g->str.seed, hash, str, len);
  307. o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1);
  308. }
  309. #endif
  310. while (o != NULL) {
  311. GCstr *sx = gco2str(o);
  312. if (sx->hash == hash && sx->len == len) {
  313. if (memcmp(str, strdata(sx), len) == 0) {
  314. if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */
  315. return sx; /* Return existing string. */
  316. }
  317. coll++;
  318. }
  319. coll++;
  320. o = gcnext(o);
  321. }
  322. #if LUAJIT_SECURITY_STRHASH
  323. /* Rehash chain if there are too many collisions. */
  324. if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) {
  325. return lj_str_rehash_chain(L, hash, str, len);
  326. }
  327. #endif
  328. /* Otherwise allocate a new string. */
  329. return lj_str_alloc(L, str, len, hash, hashalg);
  330. } else {
  331. if (lenx)
  332. lj_err_msg(L, LJ_ERR_STROV);
  333. return &g->strempty;
  334. }
  335. }
  336. void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s)
  337. {
  338. g->str.num--;
  339. lj_mem_free(g, s, lj_str_size(s->len));
  340. }
  341. void LJ_FASTCALL lj_str_init(lua_State *L)
  342. {
  343. global_State *g = G(L);
  344. g->str.seed = lj_prng_u64(&g->prng);
  345. lj_str_resize(L, LJ_MIN_STRTAB-1);
  346. }