Quellcode durchsuchen

RELEASE LuaJIT-2.0.0-beta1

Mike Pall vor 15 Jahren
Commit
55b1695971
100 geänderte Dateien mit 38548 neuen und 0 gelöschten Zeilen
  1. 11 0
      .gitignore
  2. 84 0
      Makefile
  3. 16 0
      README
  4. 203 0
      doc/api.html
  5. 166 0
      doc/bluequad-print.css
  6. 303 0
      doc/bluequad.css
  7. 281 0
      doc/changes.html
  8. 84 0
      doc/contact.html
  9. 141 0
      doc/faq.html
  10. BIN
      doc/img/contact.png
  11. 216 0
      doc/install.html
  12. 120 0
      doc/luajit.html
  13. 233 0
      doc/running.html
  14. 235 0
      doc/status.html
  15. 69 0
      dynasm/dasm_proto.h
  16. 467 0
      dynasm/dasm_x86.h
  17. 1799 0
      dynasm/dasm_x86.lua
  18. 1070 0
      dynasm/dynasm.lua
  19. 41 0
      etc/strict.lua
  20. 1 0
      lib/.gitignore
  21. 182 0
      lib/bc.lua
  22. 19 0
      lib/dis_x64.lua
  23. 824 0
      lib/dis_x86.lua
  24. 567 0
      lib/dump.lua
  25. 156 0
      lib/v.lua
  26. 8 0
      src/.gitignore
  27. 326 0
      src/Makefile
  28. 139 0
      src/Makefile.dep
  29. 438 0
      src/buildvm.c
  30. 106 0
      src/buildvm.h
  31. 220 0
      src/buildvm_asm.c
  32. 206 0
      src/buildvm_fold.c
  33. 365 0
      src/buildvm_lib.c
  34. 303 0
      src/buildvm_peobj.c
  35. 3592 0
      src/buildvm_x86.dasc
  36. 159 0
      src/lauxlib.h
  37. 438 0
      src/lib_aux.c
  38. 560 0
      src/lib_base.c
  39. 74 0
      src/lib_bit.c
  40. 366 0
      src/lib_debug.c
  41. 37 0
      src/lib_init.c
  42. 538 0
      src/lib_io.c
  43. 589 0
      src/lib_jit.c
  44. 188 0
      src/lib_math.c
  45. 249 0
      src/lib_os.c
  46. 508 0
      src/lib_package.c
  47. 790 0
      src/lib_string.c
  48. 276 0
      src/lib_table.c
  49. 6 0
      src/lj.supp
  50. 1232 0
      src/lj_alloc.c
  51. 17 0
      src/lj_alloc.h
  52. 1046 0
      src/lj_api.c
  53. 88 0
      src/lj_arch.h
  54. 3324 0
      src/lj_asm.c
  55. 17 0
      src/lj_asm.h
  56. 17 0
      src/lj_bc.c
  57. 235 0
      src/lj_bc.h
  58. 44 0
      src/lj_ctype.c
  59. 40 0
      src/lj_ctype.h
  60. 226 0
      src/lj_def.h
  61. 284 0
      src/lj_dispatch.c
  62. 64 0
      src/lj_dispatch.h
  63. 763 0
      src/lj_err.c
  64. 40 0
      src/lj_err.h
  65. 134 0
      src/lj_errmsg.h
  66. 18 0
      src/lj_ff.h
  67. 84 0
      src/lj_frame.h
  68. 185 0
      src/lj_func.c
  69. 25 0
      src/lj_func.h
  70. 800 0
      src/lj_gc.c
  71. 102 0
      src/lj_gc.h
  72. 739 0
      src/lj_gdbjit.c
  73. 22 0
      src/lj_gdbjit.h
  74. 461 0
      src/lj_ir.c
  75. 429 0
      src/lj_ir.h
  76. 128 0
      src/lj_iropt.h
  77. 279 0
      src/lj_jit.h
  78. 393 0
      src/lj_lex.c
  79. 63 0
      src/lj_lex.h
  80. 216 0
      src/lj_lib.c
  81. 84 0
      src/lj_lib.h
  82. 260 0
      src/lj_mcode.c
  83. 23 0
      src/lj_mcode.h
  84. 358 0
      src/lj_meta.c
  85. 33 0
      src/lj_meta.h
  86. 41 0
      src/lj_obj.c
  87. 676 0
      src/lj_obj.h
  88. 79 0
      src/lj_opt_dce.c
  89. 1415 0
      src/lj_opt_fold.c
  90. 358 0
      src/lj_opt_loop.c
  91. 550 0
      src/lj_opt_mem.c
  92. 430 0
      src/lj_opt_narrow.c
  93. 2198 0
      src/lj_parse.c
  94. 15 0
      src/lj_parse.h
  95. 2136 0
      src/lj_record.c
  96. 17 0
      src/lj_record.h
  97. 286 0
      src/lj_snap.c
  98. 19 0
      src/lj_snap.h
  99. 255 0
      src/lj_state.c
  100. 31 0
      src/lj_state.h

+ 11 - 0
.gitignore

@@ -0,0 +1,11 @@
+*.[oa]
+*.so
+*.obj
+*.lib
+*.exp
+*.dll
+*.exe
+*.manifest
+*.dmp
+*.swp
+.tags

+ 84 - 0
Makefile

@@ -0,0 +1,84 @@
+##############################################################################
+# LuaJIT top level Makefile for installation. Requires GNU Make.
+#
+# Suitable for POSIX platforms (Linux, *BSD, OSX etc.).
+# Note: src/Makefile has many more configurable options.
+#
+# ##### This Makefile is NOT useful for installation on Windows! #####
+# For MSVC, please follow the instructions given in src/msvcbuild.bat.
+# For MinGW and Cygwin, cd to src and run make with the Makefile there.
+# NYI: add wininstall.bat
+#
+# Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+##############################################################################
+
+BASEVER= 2.0.0
+VERSION= 2.0.0-beta1
+
+##############################################################################
+#
+# Change the installation path as needed and modify src/luaconf.h accordingly.
+# Note: PREFIX must be an absolute path!
+#
+PREFIX= /usr/local
+##############################################################################
+
+INSTALL_BIN= $(PREFIX)/bin
+INSTALL_NAME= luajit-$(VERSION)
+INSTALL_T= $(INSTALL_BIN)/$(INSTALL_NAME)
+INSTALL_TSYM= $(INSTALL_BIN)/luajit
+INSTALL_INC= $(PREFIX)/include/luajit-$(BASEVER)
+INSTALL_JITLIB= $(PREFIX)/share/luajit-$(VERSION)/jit
+
+MKDIR= mkdir -p
+SYMLINK= ln -f -s
+INSTALL_X= install -m 0755
+INSTALL_F= install -m 0644
+
+FILES_T= luajit
+FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h
+FILES_JITLIB= bc.lua v.lua dump.lua dis_x86.lua dis_x64.lua vmdef.lua
+
+##############################################################################
+
+INSTALL_DEP= src/luajit
+
+all $(INSTALL_DEP):
+	@echo "==== Building LuaJIT $(VERSION) ===="
+	$(MAKE) -C src
+	@echo "==== Successfully built LuaJIT $(VERSION) ===="
+
+install: $(INSTALL_DEP)
+	@echo "==== Installing LuaJIT $(VERSION) to $(PREFIX) ===="
+	$(MKDIR) $(INSTALL_BIN) $(INSTALL_INC) $(INSTALL_JITLIB)
+	cd src && $(INSTALL_X) $(FILES_T) $(INSTALL_T)
+	cd src && $(INSTALL_F) $(FILES_INC) $(INSTALL_INC)
+	cd lib && $(INSTALL_F) $(FILES_JITLIB) $(INSTALL_JITLIB)
+	@echo "==== Successfully installed LuaJIT $(VERSION) to $(PREFIX) ===="
+	@echo ""
+	@echo "Note: the beta releases deliberately do NOT install a symlink for luajit"
+	@echo "You can do this now by running this command (with sudo):"
+	@echo ""
+	@echo "  $(SYMLINK) $(INSTALL_NAME) $(INSTALL_TSYM)"
+	@echo ""
+
+##############################################################################
+
+amalg:
+	@echo "Building LuaJIT $(VERSION)"
+	$(MAKE) -C src amalg
+
+clean:
+	$(MAKE) -C src clean
+
+cleaner:
+	$(MAKE) -C src cleaner
+
+distclean:
+	$(MAKE) -C src distclean
+
+SUB_TARGETS= amalg clean cleaner distclean
+
+.PHONY: all install $(SUB_TARGETS)
+
+##############################################################################

+ 16 - 0
README

@@ -0,0 +1,16 @@
+README for LuaJIT 2.0.0-beta1
+-----------------------------
+
+LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language.
+
+Project Homepage: http://luajit.org/
+
+LuaJIT is Copyright (C) 2005-2009 Mike Pall.
+LuaJIT is free software, released under the MIT/X license.
+See full Copyright Notice in src/luajit.h
+
+Documentation for LuaJIT is available in HTML format.
+Please point your favorite browser to:
+
+ doc/luajit.html
+

+ 203 - 0
doc/api.html

@@ -0,0 +1,203 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>API Extensions</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>API Extensions</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a class="current" href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+LuaJIT is fully upwards-compatible with Lua 5.1. It supports all
+<a href="http://www.lua.org/manual/5.1/manual.html#5"><span class="ext">&raquo;</span>&nbsp;standard Lua
+library functions</a> and the full set of
+<a href="http://www.lua.org/manual/5.1/manual.html#3"><span class="ext">&raquo;</span>&nbsp;Lua/C API
+functions</a>.
+</p>
+<p>
+LuaJIT is also fully ABI-compatible to Lua 5.1 at the linker/dynamic
+loader level. This means you can compile a C&nbsp;module against the
+standard Lua headers and load the same shared library from either Lua
+or LuaJIT.
+</p>
+
+<h2 id="bit"><tt>bit.*</tt> &mdash; Bitwise Operations</h2>
+<p>
+LuaJIT supports all bitwise operations as defined by
+<a href="http://bitop.luajit.org"><span class="ext">&raquo;</span>&nbsp;Lua BitOp</a>:
+</p>
+<pre class="code">
+bit.tobit  bit.tohex  bit.bnot    bit.band bit.bor  bit.bxor
+bit.lshift bit.rshift bit.arshift bit.rol  bit.ror  bit.bswap
+</pre>
+<p>
+This module is a LuaJIT built-in &mdash; you don't need to download or
+install Lua BitOp. The Lua BitOp site has full documentation for all
+<a href="http://bitop.luajit.org/api.html"><span class="ext">&raquo;</span>&nbsp;Lua BitOp API functions</a>.
+</p>
+<p>
+Please make sure to <tt>require</tt> the module before using any of
+its functions:
+</p>
+<pre class="code">
+local bit = require("bit")
+</pre>
+<p>
+An already installed Lua BitOp module is ignored by LuaJIT.
+This way you can use bit operations from both Lua and LuaJIT on a
+shared installation.
+</p>
+
+<h2 id="jit"><tt>jit.*</tt> &mdash; JIT compiler control</h2>
+<p>
+The functions in this built-in module control the behavior
+of the JIT compiler engine.
+</p>
+
+<h3 id="jit_onoff"><tt>jit.on()<br>
+jit.off()</tt></h3>
+<p>
+Turns the whole JIT compiler on (default) or off.
+</p>
+<p>
+These functions are typically used with the command line options
+<tt>-j on</tt> or <tt>-j off</tt>.
+</p>
+
+<h3 id="jit_flush"><tt>jit.flush()</tt></h3>
+<p>
+Flushes the whole cache of compiled code.
+</p>
+
+<h3 id="jit_flush_tr"><tt>jit.flush(tr)</tt></h3>
+<p>
+Flushes the code for the specified root trace and all of its
+side traces from the cache.
+</p>
+
+<h3 id="jit_onoff_func"><tt>jit.on(func|true [,true|false])<br>
+jit.off(func|true [,true|false])<br>
+jit.flush(func|true [,true|false])</tt></h3>
+<p>
+<tt>jit.on</tt> enables JIT compilation for a Lua function (this is
+the default).
+</p>
+<p>
+<tt>jit.off</tt> disables JIT compilation for a Lua function and
+flushes any already compiled code from the code cache.
+</p>
+<p>
+<tt>jit.flush</tt> flushes the code, but doesn't affect the
+enable/disable status.
+</p>
+<p>
+The current function, i.e. the Lua function calling this library
+function, can also be specified by passing <tt>true</tt> as the first
+argument.
+</p>
+<p>
+If the second argument is <tt>true</tt>, JIT compilation is also
+enabled, disabled or flushed recursively for all subfunctions of a
+function. With <tt>false</tt> only the subfunctions are affected.
+</p>
+<p>
+The <tt>jit.on</tt> and <tt>jit.off</tt> functions only set a flag
+which is checked when the function is about to be compiled. They do
+not trigger immediate compilation.
+</p>
+<p>
+Typical usage is <tt>jit.off(true, true)</tt> in the main chunk
+of a module to turn off JIT compilation for the whole module for
+debugging purposes.
+</p>
+
+<h3 id="jit_version"><tt>jit.version</tt></h3>
+<p>
+Contains the LuaJIT version string.
+</p>
+
+<h3 id="jit_version_num"><tt>jit.version_num</tt></h3>
+<p>
+Contains the version number of the LuaJIT core. Version xx.yy.zz
+is represented by the decimal number xxyyzz.
+</p>
+
+<h3 id="jit_arch"><tt>jit.arch</tt></h3>
+<p>
+Contains the target architecture name (CPU and optional ABI).
+</p>
+
+<h2 id="jit_opt"><tt>jit.opt.*</tt> &mdash; JIT compiler optimization control</h2>
+<p>
+This module provides the backend for the <tt>-O</tt> command line
+option.
+</p>
+<p>
+You can also use it programmatically, e.g.:
+</p>
+<pre class="code">
+jit.opt.start(2) -- same as -O2
+jit.opt.start("-dce")
+jit.opt.start("hotloop=10", "hotexit=2")
+</pre>
+<p>
+Unlike in LuaJIT 1.x, the module is built-in and
+<b>optimization is turned on by default!</b>
+It's no longer necessary to run <tt>require("jit.opt").start()</tt>,
+which was one of the ways to enable optimization.
+</p>
+
+<h2 id="jit_util"><tt>jit.util.*</tt> &mdash; JIT compiler introspection</h2>
+<p>
+This module holds functions to introspect the bytecode, generated
+traces, the IR and the generated machine code. The functionality
+provided by this module is still in flux and therefore undocumented.
+</p>
+<p>
+The debug modules <tt>-jbc</tt>, <tt>-jv</tt> and <tt>-jdump</tt> make
+extensive use of these functions. Please check out their source code,
+if you want to know more.
+</p>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 166 - 0
doc/bluequad-print.css

@@ -0,0 +1,166 @@
+/* Copyright (C) 2004-2009 Mike Pall.
+ *
+ * You are welcome to use the general ideas of this design for your own sites.
+ * But please do not steal the stylesheet, the layout or the color scheme.
+ */
+body {
+  font-family: serif;
+  font-size: 11pt;
+  margin: 0 3em;
+  padding: 0;
+  border: none;
+}
+a:link, a:visited, a:hover, a:active {
+  text-decoration: none;
+  background: transparent;
+  color: #0000ff;
+}
+h1, h2, h3 {
+  font-family: sans-serif;
+  font-weight: bold;
+  text-align: left;
+  margin: 0.5em 0;
+  padding: 0;
+}
+h1 {
+  font-size: 200%;
+}
+h2 {
+  font-size: 150%;
+}
+h3 {
+  font-size: 125%;
+}
+p {
+  margin: 0 0 0.5em 0;
+  padding: 0;
+}
+ul, ol {
+  margin: 0.5em 0;
+  padding: 0 0 0 2em;
+}
+ul {
+  list-style: outside square;
+}
+ol {
+  list-style: outside decimal;
+}
+li {
+  margin: 0;
+  padding: 0;
+}
+dl {
+  margin: 1em 0;
+  padding: 1em;
+  border: 1px solid black;
+}
+dt {
+  font-weight: bold;
+  margin: 0;
+  padding: 0;
+}
+dt sup {
+  float: right;
+  margin-left: 1em;
+}
+dd {
+  margin: 0.5em 0 0 2em;
+  padding: 0;
+}
+table {
+  table-layout: fixed;
+  width: 100%;
+  margin: 1em 0;
+  padding: 0;
+  border: 1px solid black;
+  border-spacing: 0;
+  border-collapse: collapse;
+}
+tr {
+  margin: 0;
+  padding: 0;
+  border: none;
+}
+td {
+  text-align: left;
+  margin: 0;
+  padding: 0.2em 0.5em;
+  border-top: 1px solid black;
+  border-bottom: 1px solid black;
+}
+tr.separate td {
+  border-top: double;
+}
+tt, pre, code, kbd, samp {
+  font-family: monospace;
+  font-size: 75%;
+}
+kbd {
+  font-weight: bolder;
+}
+blockquote, pre {
+  margin: 1em 2em;
+  padding: 0;
+}
+img {
+  border: none;
+  vertical-align: baseline;
+  margin: 0;
+  padding: 0;
+}
+img.left {
+  float: left;
+  margin: 0.5em 1em 0.5em 0;
+}
+img.right {
+  float: right;
+  margin: 0.5em 0 0.5em 1em;
+}
+.flush {
+  clear: both;
+  visibility: hidden;
+}
+.hide, .noprint, #nav {
+  display: none !important;
+}
+.pagebreak {
+  page-break-before: always;
+}
+#site {
+  text-align: right;
+  font-family: sans-serif;
+  font-weight: bold;
+  margin: 0 1em;
+  border-bottom: 1pt solid black;
+}
+#site a {
+  font-size: 1.2em;
+}
+#site a:link, #site a:visited {
+  text-decoration: none;
+  font-weight: bold;
+  background: transparent;
+  color: #ffffff;
+}
+#logo {
+  color: #ff8000;
+}
+#head {
+  clear: both;
+  margin: 0 1em;
+}
+#main {
+  line-height: 1.3;
+  text-align: justify;
+  margin: 1em;
+}
+#foot {
+  clear: both;
+  font-size: 80%;
+  text-align: center;
+  margin: 0 1.25em;
+  padding: 0.5em 0 0 0;
+  border-top: 1pt solid black;
+  page-break-before: avoid;
+  page-break-after: avoid;
+}

+ 303 - 0
doc/bluequad.css

@@ -0,0 +1,303 @@
+/* Copyright (C) 2004-2009 Mike Pall.
+ *
+ * You are welcome to use the general ideas of this design for your own sites.
+ * But please do not steal the stylesheet, the layout or the color scheme.
+ */
+/* colorscheme:
+ *
+ * site  |  head   #4162bf/white   | #6078bf/#e6ecff
+ * ------+------   ----------------+-------------------
+ * nav   |  main   #bfcfff         | #e6ecff/black
+ *
+ * nav:  hiback   loback     #c5d5ff #b9c9f9
+ *       hiborder loborder   #e6ecff #97a7d7
+ *       link     hover      #2142bf #ff0000
+ *
+ * link: link visited hover  #2142bf #8122bf #ff0000
+ *
+ * main: boxback  boxborder  #f0f4ff #bfcfff
+ */
+body {
+  font-family: Verdana, Arial, Helvetica, sans-serif;
+  font-size: 10pt;
+  margin: 0;
+  padding: 0;
+  border: none;
+  background: #e0e0e0;
+  color: #000000;
+}
+a:link {
+  text-decoration: none;
+  background: transparent;
+  color: #2142bf;
+}
+a:visited {
+  text-decoration: none;
+  background: transparent;
+  color: #8122bf;
+}
+a:hover, a:active {
+  text-decoration: underline;
+  background: transparent;
+  color: #ff0000;
+}
+h1, h2, h3 {
+  font-weight: bold;
+  text-align: left;
+  margin: 0.5em 0;
+  padding: 0;
+  background: transparent;
+}
+h1 {
+  font-size: 200%;
+  line-height: 3em; /* really 6em relative to body, match #site span */
+  margin: 0;
+}
+h2 {
+  font-size: 150%;
+  color: #606060;
+}
+h3 {
+  font-size: 125%;
+  color: #404040;
+}
+p {
+  max-width: 600px;
+  margin: 0 0 0.5em 0;
+  padding: 0;
+}
+b {
+  color: #404040;
+}
+ul, ol {
+  max-width: 600px;
+  margin: 0.5em 0;
+  padding: 0 0 0 2em;
+}
+ul {
+  list-style: outside square;
+}
+ol {
+  list-style: outside decimal;
+}
+li {
+  margin: 0;
+  padding: 0;
+}
+dl {
+  max-width: 600px;
+  margin: 1em 0;
+  padding: 1em;
+  border: 1px solid #bfcfff;
+  background: #f0f4ff;
+}
+dt {
+  font-weight: bold;
+  margin: 0;
+  padding: 0;
+}
+dt sup {
+  float: right;
+  margin-left: 1em;
+  color: #808080;
+}
+dt a:visited {
+  text-decoration: none;
+  color: #2142bf;
+}
+dt a:hover, dt a:active {
+  text-decoration: none;
+  color: #ff0000;
+}
+dd {
+  margin: 0.5em 0 0 2em;
+  padding: 0;
+}
+div.tablewrap { /* for IE *sigh* */
+  max-width: 600px;
+}
+table {
+  table-layout: fixed;
+  border-spacing: 0;
+  border-collapse: collapse;
+  max-width: 600px;
+  width: 100%;
+  margin: 1em 0;
+  padding: 0;
+  border: 1px solid #bfcfff;
+}
+tr {
+  margin: 0;
+  padding: 0;
+  border: none;
+}
+tr.odd {
+  background: #f0f4ff;
+}
+tr.separate td {
+  border-top: 1px solid #bfcfff;
+}
+td {
+  text-align: left;
+  margin: 0;
+  padding: 0.2em 0.5em;
+  border: none;
+}
+tt, code, kbd, samp {
+  font-family: Courier New, Courier, monospace;
+  line-height: 1.2;
+  font-size: 110%;
+}
+kbd {
+  font-weight: bolder;
+}
+blockquote, pre {
+  max-width: 600px;
+  margin: 1em 2em;
+  padding: 0;
+}
+pre {
+  line-height: 1.1;
+}
+pre.code {
+  line-height: 1.4;
+  margin: 0.5em 0 1em 0.5em;
+  padding: 0.5em 1em;
+  border: 1px solid #bfcfff;
+  background: #f0f4ff;
+}
+img {
+  border: none;
+  vertical-align: baseline;
+  margin: 0;
+  padding: 0;
+}
+img.left {
+  float: left;
+  margin: 0.5em 1em 0.5em 0;
+}
+img.right {
+  float: right;
+  margin: 0.5em 0 0.5em 1em;
+}
+.indent {
+  padding-left: 1em;
+}
+.flush {
+  clear: both;
+  visibility: hidden;
+}
+.hide, .noscreen {
+  display: none !important;
+}
+.ext {
+  color: #ff8000;
+}
+#site {
+  clear: both;
+  float: left;
+  width: 13em;
+  text-align: center;
+  font-weight: bold;
+  margin: 0;
+  padding: 0;
+  background: transparent;
+  color: #ffffff;
+}
+#site a {
+  font-size: 200%;
+}
+#site a:link, #site a:visited {
+  text-decoration: none;
+  font-weight: bold;
+  background: transparent;
+  color: #ffffff;
+}
+#site span {
+  line-height: 3em; /* really 6em relative to body, match h1 */
+}
+#logo {
+  color: #ffb380;
+}
+#head {
+  margin: 0;
+  padding: 0 0 0 2em;
+  border-left: solid 13em #4162bf;
+  border-right: solid 3em #6078bf;
+  background: #6078bf;
+  color: #e6ecff;
+}
+#nav {
+  clear: both;
+  float: left;
+  overflow: hidden;
+  text-align: left;
+  line-height: 1.5;
+  width: 13em;
+  padding-top: 1em;
+  background: transparent;
+}
+#nav ul {
+  list-style: none outside;
+  margin: 0;
+  padding: 0;
+}
+#nav li {
+  margin: 0;
+  padding: 0;
+}
+#nav a {
+  display: block;
+  text-decoration: none;
+  font-weight: bold;
+  margin: 0;
+  padding: 2px 1em;
+  border-top: 1px solid transparent;
+  border-bottom: 1px solid transparent;
+  background: transparent;
+  color: #2142bf;
+}
+#nav a:hover, #nav a:active {
+  text-decoration: none;
+  border-top: 1px solid #97a7d7;
+  border-bottom: 1px solid #e6ecff;
+  background: #b9c9f9;
+  color: #ff0000;
+}
+#nav a.current, #nav a.current:hover, #nav a.current:active {
+  border-top: 1px solid #e6ecff;
+  border-bottom: 1px solid #97a7d7;
+  background: #c5d5ff;
+  color: #2142bf;
+}
+#nav ul ul a {
+  padding: 0 1em 0 2em;
+}
+#main {
+  line-height: 1.5;
+  text-align: left;
+  margin: 0;
+  padding: 1em 2em;
+  border-left: solid 13em #bfcfff;
+  border-right: solid 3em #e6ecff;
+  background: #e6ecff;
+}
+#foot {
+  clear: both;
+  font-size: 80%;
+  text-align: center;
+  margin: 0;
+  padding: 0.5em;
+  background: #6078bf;
+  color: #ffffff;
+}
+#foot a:link, #foot a:visited {
+  text-decoration: underline;
+  background: transparent;
+  color: #ffffff;
+}
+#foot a:hover, #foot a:active {
+  text-decoration: underline;
+  background: transparent;
+  color: #bfcfff;
+}

+ 281 - 0
doc/changes.html

@@ -0,0 +1,281 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>LuaJIT Change History</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+<style type="text/css">
+div.major { max-width: 600px; padding: 1em; margin: 1em 0 1em 0; }
+</style>
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>LuaJIT Change History</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a class="current" href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+This is a list of changes between the released versions of LuaJIT.<br>
+The current <span style="color: #c00000;">development version</span> is <strong>LuaJIT&nbsp;2.0.0-beta1</strong>.<br>
+The current <span style="color: #0000c0;">stable version</span> is <strong>LuaJIT&nbsp;1.1.5</strong>.
+</p>
+<p>
+Please check the
+<a href="http://luajit.org/luajit_changes.html"><span class="ext">&raquo;</span>&nbsp;Online Change History</a>
+to see whether newer versions are available.
+</p>
+
+<div class="major" style="background: #ffd0d0;">
+<h2 id="LuaJIT-2.0.0-beta1">LuaJIT 2.0.0-beta1 &mdash; 2009-10-31</h2>
+<ul>
+<li>This is the first public release of LuaJIT 2.0.</li>
+<li>The whole VM has been rewritten from the ground up, so there's
+no point in listing differences over earlier versions.</li>
+</ul>
+</div>
+
+<div class="major" style="background: #d0d0ff;">
+<h2 id="LuaJIT-1.1.5">LuaJIT 1.1.5 &mdash; 2008-10-25</h2>
+<ul>
+<li>Merged with Lua 5.1.4. Fixes all
+<a href="http://www.lua.org/bugs.html#5.1.3"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1.3</a>.</li>
+</ul>
+
+<h2 id="LuaJIT-1.1.4">LuaJIT 1.1.4 &mdash; 2008-02-05</h2>
+<ul>
+<li>Merged with Lua 5.1.3. Fixes all
+<a href="http://www.lua.org/bugs.html#5.1.2"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1.2</a>.</li>
+<li>Fixed possible (but unlikely) stack corruption while compiling
+<tt>k^x</tt> expressions.</li>
+<li>Fixed DynASM template for cmpss instruction.</li>
+</ul>
+
+<h2 id="LuaJIT-1.1.3">LuaJIT 1.1.3 &mdash; 2007-05-24</h2>
+<ul>
+<li>Merged with Lua 5.1.2. Fixes all
+<a href="http://www.lua.org/bugs.html#5.1.1"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1.1</a>.</li>
+<li>Merged pending Lua 5.1.x fixes: "return -nil" bug, spurious count hook call.</li>
+<li>Remove a (sometimes) wrong assertion in <tt>luaJIT_findpc()</tt>.</li>
+<li>DynASM now allows labels for displacements and <tt>.aword</tt>.</li>
+<li>Fix some compiler warnings for DynASM glue (internal API change).</li>
+<li>Correct naming for SSSE3 (temporarily known as SSE4) in DynASM and x86 disassembler.</li>
+<li>The loadable debug modules now handle redirection to stdout
+(e.g. <tt>-j&nbsp;trace=-</tt>).</li>
+</ul>
+
+<h2 id="LuaJIT-1.1.2">LuaJIT 1.1.2 &mdash; 2006-06-24</h2>
+<ul>
+<li>Fix MSVC inline assembly: use only local variables with
+<tt>lua_number2int()</tt>.</li>
+<li>Fix "attempt to call a thread value" bug on Mac OS X:
+make values of consts used as lightuserdata keys unique
+to avoid joining by the compiler/linker.</li>
+</ul>
+
+<h2 id="LuaJIT-1.1.1">LuaJIT 1.1.1 &mdash; 2006-06-20</h2>
+<ul>
+<li>Merged with Lua 5.1.1. Fixes all
+<a href="http://www.lua.org/bugs.html#5.1"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1</a>.</li>
+<li>Enforce (dynamic) linker error for EXE/DLL version mismatches.</li>
+<li>Minor changes to DynASM: faster preprocessing, smaller encoding
+for some immediates.</li>
+</ul>
+<p>
+This release is in sync with Coco 1.1.1 (see the
+<a href="http://coco.luajit.org/changes.html"><span class="ext">&raquo;</span>&nbsp;Coco Change History</a>).
+</p>
+
+<h2 id="LuaJIT-1.1.0">LuaJIT 1.1.0 &mdash; 2006-03-13</h2>
+<ul>
+<li>Merged with Lua 5.1 (final).</li>
+
+<li>New JIT call frame setup:
+<ul>
+<li>The C stack is kept 16 byte aligned (faster).
+Mandatory for Mac OS X on Intel, too.</li>
+<li>Faster calling conventions for internal C helper functions.</li>
+<li>Better instruction scheduling for function prologue, OP_CALL and
+OP_RETURN.</li>
+</ul></li>
+
+<li>Miscellaneous optimizations:
+<ul>
+<li>Faster loads of FP constants. Remove narrow-to-wide store-to-load
+forwarding stalls.</li>
+<li>Use (scalar) SSE2 ops (if the CPU supports it) to speed up slot moves
+and FP to integer conversions.</li>
+<li>Optimized the two-argument form of <tt>OP_CONCAT</tt> (<tt>a..b</tt>).</li>
+<li>Inlined <tt>OP_MOD</tt> (<tt>a%b</tt>).
+With better accuracy than the C variant, too.</li>
+<li>Inlined <tt>OP_POW</tt> (<tt>a^b</tt>). Unroll <tt>x^k</tt> or
+use <tt>k^x = 2^(log2(k)*x)</tt> or call <tt>pow()</tt>.</li>
+</ul></li>
+
+<li>Changes in the optimizer:
+<ul>
+<li>Improved hinting for table keys derived from table values
+(<tt>t1[t2[x]]</tt>).</li>
+<li>Lookup hinting now works with arbitrary object types and
+supports index chains, too.</li>
+<li>Generate type hints for arithmetic and comparison operators,
+OP_LEN, OP_CONCAT and OP_FORPREP.</li>
+<li>Remove several hint definitions in favour of a generic COMBINE hint.</li>
+<li>Complete rewrite of <tt>jit.opt_inline</tt> module
+(ex <tt>jit.opt_lib</tt>).</li>
+</ul></li>
+
+<li>Use adaptive deoptimization:
+<ul>
+<li>If runtime verification of a contract fails, the affected
+instruction is recompiled and patched on-the-fly.
+Regular programs will trigger deoptimization only occasionally.</li>
+<li>This avoids generating code for uncommon fallback cases
+most of the time. Generated code is up to 30% smaller compared to
+LuaJIT&nbsp;1.0.3.</li>
+<li>Deoptimization is used for many opcodes and contracts:
+<ul>
+<li>OP_CALL, OP_TAILCALL: type mismatch for callable.</li>
+<li>Inlined calls: closure mismatch, parameter number and type mismatches.</li>
+<li>OP_GETTABLE, OP_SETTABLE: table or key type and range mismatches.</li>
+<li>All arithmetic and comparison operators, OP_LEN, OP_CONCAT,
+OP_FORPREP: operand type and range mismatches.</li>
+</ul></li>
+<li>Complete redesign of the debug and traceback info
+(bytecode &harr; mcode) to support deoptimization.
+Much more flexible and needs only 50% of the space.</li>
+<li>The modules <tt>jit.trace</tt>, <tt>jit.dumphints</tt> and
+<tt>jit.dump</tt> handle deoptimization.</li>
+</ul></li>
+
+<li>Inlined many popular library functions
+(for commonly used arguments only):
+<ul>
+<li>Most <tt>math.*</tt> functions (the 18 most used ones)
+[2x-10x faster].</li>
+<li><tt>string.len</tt>, <tt>string.sub</tt> and <tt>string.char</tt>
+[2x-10x faster].</li>
+<li><tt>table.insert</tt>, <tt>table.remove</tt> and <tt>table.getn</tt>
+[3x-5x faster].</li>
+<li><tt>coroutine.yield</tt> and <tt>coroutine.resume</tt>
+[3x-5x faster].</li>
+<li><tt>pairs</tt>, <tt>ipairs</tt> and the corresponding iterators
+[8x-15x faster].</li>
+</ul></li>
+
+<li>Changes in the core and loadable modules and the stand-alone executable:
+<ul>
+<li>Added <tt>jit.version</tt>, <tt>jit.version_num</tt>
+and <tt>jit.arch</tt>.</li>
+<li>Reorganized some internal API functions (<tt>jit.util.*mcode*</tt>).</li>
+<li>The <tt>-j dump</tt> output now shows JSUB names, too.</li>
+<li>New x86 disassembler module written in pure Lua. No dependency
+on ndisasm anymore. Flexible API, very compact (500 lines)
+and complete (x87, MMX, SSE, SSE2, SSE3, SSSE3, privileged instructions).</li>
+<li><tt>luajit -v</tt> prints the LuaJIT version and copyright
+on a separate line.</li>
+</ul></li>
+
+<li>Added SSE, SSE2, SSE3 and SSSE3 support to DynASM.</li>
+<li>Miscellaneous doc changes. Added a section about
+<a href="luajit_install.html#embedding">embedding LuaJIT</a>.</li>
+</ul>
+<p>
+This release is in sync with Coco 1.1.0 (see the
+<a href="http://coco.luajit.org/changes.html"><span class="ext">&raquo;</span>&nbsp;Coco Change History</a>).
+</p>
+</div>
+
+<div class="major" style="background: #ffffd0;">
+<h2 id="LuaJIT-1.0.3">LuaJIT 1.0.3 &mdash; 2005-09-08</h2>
+<ul>
+<li>Even more docs.</li>
+<li>Unified closure checks in <tt>jit.*</tt>.</li>
+<li>Fixed some range checks in <tt>jit.util.*</tt>.</li>
+<li>Fixed __newindex call originating from <tt>jit_settable_str()</tt>.</li>
+<li>Merged with Lua 5.1 alpha (including early bugfixes).</li>
+</ul>
+<p>
+This is the first public release of LuaJIT.
+</p>
+
+<h2 id="LuaJIT-1.0.2">LuaJIT 1.0.2 &mdash; 2005-09-02</h2>
+<ul>
+<li>Add support for flushing the Valgrind translation cache <br>
+(<tt>MYCFLAGS= -DUSE_VALGRIND</tt>).</li>
+<li>Add support for freeing executable mcode memory to the <tt>mmap()</tt>-based
+variant for POSIX systems.</li>
+<li>Reorganized the C&nbsp;function signature handling in
+<tt>jit.opt_lib</tt>.</li>
+<li>Changed to index-based hints for inlining C&nbsp;functions.
+Still no support in the backend for inlining.</li>
+<li>Hardcode <tt>HEAP_CREATE_ENABLE_EXECUTE</tt> value if undefined.</li>
+<li>Misc. changes to the <tt>jit.*</tt> modules.</li>
+<li>Misc. changes to the Makefiles.</li>
+<li>Lots of new docs.</li>
+<li>Complete doc reorg.</li>
+</ul>
+<p>
+Not released because Lua 5.1 alpha came out today.
+</p>
+
+<h2 id="LuaJIT-1.0.1">LuaJIT 1.0.1 &mdash; 2005-08-31</h2>
+<ul>
+<li>Missing GC step in <tt>OP_CONCAT</tt>.</li>
+<li>Fix result handling for C &ndash;> JIT calls.</li>
+<li>Detect CPU feature bits.</li>
+<li>Encode conditional moves (<tt>fucomip</tt>) only when supported.</li>
+<li>Add fallback instructions for FP compares.</li>
+<li>Add support for <tt>LUA_COMPAT_VARARG</tt>. Still disabled by default.</li>
+<li>MSVC needs a specific place for the <tt>CALLBACK</tt> attribute
+(David Burgess).</li>
+<li>Misc. doc updates.</li>
+</ul>
+<p>
+Interim non-public release.
+Special thanks to Adam D. Moss for reporting most of the bugs.
+</p>
+
+<h2 id="LuaJIT-1.0.0">LuaJIT 1.0.0 &mdash; 2005-08-29</h2>
+<p>
+This is the initial non-public release of LuaJIT.
+</p>
+</div>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 84 - 0
doc/contact.html

@@ -0,0 +1,84 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>Contact</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>Contact</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+Please send general questions to the
+<a href="http://www.lua.org/lua-l.html"><span class="ext">&raquo;</span>&nbsp;Lua mailing list</a>.
+You can also send any questions you have directly to me:
+</p>
+
+<script type="text/javascript">
+<!--
+var xS="@-: .0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+function xD(s)
+{var len=s.length;var r="";for(var i=0;i<len;i++)
+{var c=s.charAt(i);var n=xS.indexOf(c);if(n!=-1)
+c=xS.charAt(66-n);r+=c;}
+document.write("<"+"p>"+r+"<"+"/p>\n");}
+//-->
+</script>
+<script type="text/javascript">
+<!--
+xD("ewYKA7vu-EIwslx7 K9A.t41C")
+//--></script>
+<noscript>
+<p><img src="img/contact.png" alt="Contact info in image" width="170" height="13">
+</p>
+</noscript>
+
+<h2>Copyright</h2>
+<p>
+All documentation is
+Copyright &copy; 2005-2009 Mike Pall.
+</p>
+
+
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 141 - 0
doc/faq.html

@@ -0,0 +1,141 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>Frequently Asked Questions (FAQ)</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+<style type="text/css">
+dd { margin-left: 1.5em; }
+</style>
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>Frequently Asked Questions (FAQ)</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a class="current" href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<dl>
+<dt>Q: Where can I learn more about Lua and LuaJIT?</dt>
+<dd>
+<ul style="padding: 0;">
+<li>The <a href="http://lua.org"><span class="ext">&raquo;</span>&nbsp;main Lua.org site</a> has complete
+<a href="http://www.lua.org/docs.html"><span class="ext">&raquo;</span>&nbsp;documentation</a> of the language
+and links to books and papers about Lua.</li>
+<li>The community-managed <a href="http://lua-users.org/wiki/"><span class="ext">&raquo;</span>&nbsp;Lua Wiki</a>
+has information about diverse topics.</li>
+<li>The primary source of information for the latest developments surrounding
+Lua is the <a href="http://www.lua.org/lua-l.html"><span class="ext">&raquo;</span>&nbsp;Lua mailing list</a>.
+You can check out the <a href="http://lua-users.org/lists/lua-l/"><span class="ext">&raquo;</span>&nbsp;mailing
+list archive</a> or
+<a href="http://bazar2.conectiva.com.br/mailman/listinfo/lua"><span class="ext">&raquo;</span>&nbsp;subscribe</a>
+to the list (you need to be subscribed before posting).<br>
+This is also the place where announcements and discussions about LuaJIT
+take place.</li>
+</ul>
+</dl>
+
+<dl>
+<dt>Q: Where can I learn more about the compiler technology used by LuaJIT?</dt>
+<dd>
+I'm planning to write more documentation about the internals of LuaJIT.
+In the meantime, please use the following Google Scholar searches
+to find relevant papers:<br>
+Search for: <a href="http://scholar.google.com/scholar?q=Trace+Compiler"><span class="ext">&raquo;</span>&nbsp;Trace Compiler</a><br>
+Search for: <a href="http://scholar.google.com/scholar?q=JIT+Compiler"><span class="ext">&raquo;</span>&nbsp;JIT Compiler</a><br>
+Search for: <a href="http://scholar.google.com/scholar?q=Dynamic+Language+Optimizations"><span class="ext">&raquo;</span>&nbsp;Dynamic Language Optimizations</a><br>
+Search for: <a href="http://scholar.google.com/scholar?q=SSA+Form"><span class="ext">&raquo;</span>&nbsp;SSA Form</a><br>
+Search for: <a href="http://scholar.google.com/scholar?q=Linear+Scan+Register+Allocation"><span class="ext">&raquo;</span>&nbsp;Linear Scan Register Allocation</a><br>
+And, you know, reading the source is of course the only way to enlightenment. :-)
+</dd>
+</dl>
+
+<dl>
+<dt>Q: Why do I get this error: "attempt to index global 'arg' (a nil value)"?<br>
+Q: My vararg functions fail after switching to LuaJIT!</dt>
+<dd>LuaJIT is compatible to the Lua 5.1 language standard. It doesn't
+support the implicit <tt>arg</tt> parameter for old-style vararg
+functions from Lua 5.0.<br>Please convert your code to the
+<a href="http://www.lua.org/manual/5.1/manual.html#2.5.9"><span class="ext">&raquo;</span>&nbsp;Lua 5.1
+vararg syntax</a>.</dd>
+</dl>
+
+<dl>
+<dt>Q: Sometimes Ctrl-C fails to stop my Lua program. Why?</dt>
+<dd>The interrupt signal handler sets a Lua debug hook. But this is
+currently ignored by compiled code (this will eventually be fixed). If
+your program is running in a tight loop and never falls back to the
+interpreter, the debug hook never runs and can't throw the
+"interrupted!" error.<br> In the meantime you have to press Ctrl-C
+twice to stop your program. That's similar to when it's stuck
+running inside a C function under the Lua interpreter.</dd>
+</dl>
+
+<dl>
+<dt>Q: Why doesn't my favorite power-patch for Lua apply against LuaJIT?</dt>
+<dd>Because it's a completely redesigned VM and has very little code
+in common with Lua anymore. Also, if the patch introduces changes to
+the Lua semantics, this would need to be reflected everywhere in the
+VM, from the interpreter up to all stages of the compiler.<br> Please
+use only standard Lua language constructs. For many common needs you
+can use source transformations or use wrapper or proxy functions.
+The compiler will happily optimize away such indirections.</dd>
+</dl>
+
+<dl>
+<dt>Q: Lua runs everywhere. Why doesn't LuaJIT support my CPU?</dt>
+<dd>Because it's a compiler &mdash; it needs to generate native
+machine code. This means the code generator must be ported to each
+architecture. And the fast interpreter is written in assembler and
+must be ported, too. This is quite an undertaking.<br> Currently only
+x86 CPUs are supported. x64 support is in the works. Other
+architectures will follow with sufficient demand and/or
+sponsoring.</dd>
+</dl>
+
+<dl>
+<dt>Q: When will feature X be added? When will the next version be released?</dt>
+<dd>When it's ready.<br>
+C'mon, it's open source &mdash; I'm doing it on my own time and you're
+getting it for free. You can either contribute a patch or sponsor
+the development of certain features, if they are important to you.
+</dd>
+</dl>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

BIN
doc/img/contact.png


+ 216 - 0
doc/install.html

@@ -0,0 +1,216 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>Installation</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>Installation</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a class="current" href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+LuaJIT is only distributed as a source package. This page explains
+how to build and install LuaJIT with different operating systems
+and C&nbsp;compilers.
+</p>
+<p>
+For the impatient (on POSIX systems):
+</p>
+<pre class="code">
+make &amp;&amp; sudo make install
+</pre>
+<p>
+LuaJIT currently builds out-of-the box on all popular x86 systems
+(Linux, Windows, OSX etc.). It builds and runs fine as a 32&nbsp;bit
+application under x64-based systems, too.
+</p>
+
+<h2>Configuring LuaJIT</h2>
+<p>
+The standard configuration should work fine for most installations.
+Usually there is no need to tweak the settings, except when you want to
+install to a non-standard path. The following three files hold all
+user-configurable settings:
+</p>
+<ul>
+<li><tt>src/luaconf.h</tt> sets some configuration variables, in
+particular the default paths for loading modules.</li>
+<li><tt>Makefile</tt> has settings for installing LuaJIT (POSIX
+only).</li>
+<li><tt>src/Makefile</tt> has settings for compiling LuaJIT under POSIX,
+MinGW and Cygwin.</li>
+<li><tt>src/msvcbuild.bat</tt> has settings for compiling LuaJIT with
+MSVC.</li>
+</ul>
+<p>
+Please read the instructions given in these files, before changing
+any settings.
+</p>
+
+<h2 id="posix">POSIX Systems (Linux, OSX, *BSD etc.)</h2>
+<h3>Prerequisites</h3>
+<p>
+Depending on your distribution, you may need to install a package for
+GCC (GCC 3.4 or later required), the development headers and/or a
+complete SDK.
+</p>
+<p>
+E.g. on a current Debian/Ubuntu, install <tt>libc6-dev</tt>
+with the package manager. Currently LuaJIT only builds as a 32&nbsp;bit
+application, so you actually need to install <tt>libc6-dev-i386</tt>
+when building on an x64 OS.
+</p>
+<p>
+Download the current source package (pick the .tar.gz), if you haven't
+already done so. Move it to a directory of your choice, open a
+terminal window and change to this directory. Now unpack the archive
+and change to the newly created directory:
+</p>
+<pre class="code">
+tar zxf LuaJIT-2.0.0-beta1.tar.gz
+cd LuaJIT-2.0.0-beta1
+</pre>
+<h3>Building LuaJIT</h3>
+<p>
+The supplied Makefiles try to auto-detect the settings needed for your
+operating system and your compiler. They need to be run with GNU Make,
+which is probably the default on your system, anyway. Simply run:
+</p>
+<pre class="code">
+make
+</pre>
+<h3>Installing LuaJIT</h3>
+<p>
+The top-level Makefile installs LuaJIT by default under
+<tt>/usr/local</tt>, i.e. the executable ends up in
+<tt>/usr/local/bin</tt> and so on. You need to have root privileges
+to write to this path. So, assuming sudo is installed on your system,
+run the following command and enter your sudo password:
+</p>
+<pre class="code">
+sudo make install
+</pre>
+<p>
+Otherwise specify the directory prefix as an absolute path, e.g.:
+</p>
+<pre class="code">
+sudo make install PREFIX=/opt/lj2
+</pre>
+<p>
+But note that the installation prefix and the prefix for the module paths
+(configured in <tt>src/luaconf.h</tt>) must match.
+</p>
+<p style="color: #c00000;">
+Note: to avoid overwriting a previous version, the beta test releases
+only install the LuaJIT executable under the versioned name (i.e.
+<tt>luajit-2.0.0-beta1</tt>). You probably want to create a symlink
+for convenience, with a command like this:
+</p>
+<pre class="code" style="color: #c00000;">
+sudo ln -sf luajit-2.0.0-beta1 /usr/local/bin/luajit
+</pre>
+
+<h2 id="windows">Windows Systems</h2>
+<h3>Prerequisites</h3>
+<p>
+Either install one of the open source SDKs
+(<a href="http://mingw.org/"><span class="ext">&raquo;</span>&nbsp;MinGW</a> or
+<a href="http://www.cygwin.com/"><span class="ext">&raquo;</span>&nbsp;Cygwin</a>) which come with modified
+versions of GCC plus the required development headers.
+</p>
+<p>
+Or install Microsoft's Visual C++ (MSVC) &mdash; the freely downloadable
+<a href="http://www.microsoft.com/Express/VC/"><span class="ext">&raquo;</span>&nbsp;Express Edition</a>
+works just fine.
+</p>
+<p>
+Next, download the source package and unpack it using an archive manager
+(e.g. the Windows Explorer) to a directory of your choice.
+</p>
+<h3>Building with MSVC</h3>
+<p>
+Open a "Visual Studio .NET Command Prompt" and <tt>cd</tt> to the
+directory where you've unpacked the sources. Then run this command:
+</p>
+<pre class="code">
+cd src
+msvcbuild
+</pre>
+<p>
+Then follow the installation instructions below.
+</p>
+<h3>Building with MinGW or Cygwin</h3>
+<p>
+Open a command prompt window and make sure the MinGW or Cygwin programs
+are in your path. Then <tt>cd</tt> to the directory where
+you've unpacked the sources and run this command for MinGW:
+</p>
+<pre class="code">
+cd src
+mingw32-make
+</pre>
+<p>
+Or this command for Cygwin:
+</p>
+<pre class="code">
+cd src
+make
+</pre>
+<p>
+Then follow the installation instructions below.
+</p>
+<h3>Installing LuaJIT</h3>
+<p>
+Copy <tt>luajit.exe</tt> and <tt>lua51.dll</tt>
+to a newly created directory (any location is ok). Add <tt>lua</tt>
+and <tt>lua\jit</tt> directories below it and copy all Lua files
+from the <tt>lib</tt> directory of the distribution to the latter directory.
+</p>
+<p>
+There are no hardcoded
+absolute path names &mdash; all modules are loaded relative to the
+directory where <tt>luajit.exe</tt> is installed
+(see <tt>src/luaconf.h</tt>).
+</p>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 120 - 0
doc/luajit.html

@@ -0,0 +1,120 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>LuaJIT</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>LuaJIT</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a class="current" href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+LuaJIT is a <b>Just-In-Time Compiler</b> for the Lua<sup>*</sup>
+programming language.
+</p>
+<p>
+LuaJIT is Copyright &copy; 2005-2009 Mike Pall.
+LuaJIT is open source software, released under the
+<a href="http://www.opensource.org/licenses/mit-license.php"><span class="ext">&raquo;</span>&nbsp;MIT/X license</a>.
+</p>
+<p class="indent" style="color: #606060;">
+* Lua is a powerful, dynamic and light-weight programming language
+designed for extending applications. Lua is also frequently used as a
+general-purpose, stand-alone language. More information about
+Lua can be found at: <a href="http://www.lua.org/"><span class="ext">&raquo;</span>&nbsp;http://www.lua.org/</a>
+</p>
+<h2>Compatibility</h2>
+<p>
+LuaJIT implements the full set of language features defined by Lua 5.1.
+The virtual machine (VM) is <b>API- and ABI-compatible</b> to the
+standard Lua interpreter and can be deployed as a drop-in replacement.
+</p>
+<p>
+LuaJIT offers more performance, at the expense of portability. It
+currently runs on all popular operating systems based on <b>x86 CPUs</b>
+(Linux, Windows, OSX etc.). It will be ported to x64 CPUs and other
+platforms in the future, based on user demand and sponsoring.
+</p>
+
+<h2>Overview</h2>
+<p>
+LuaJIT has been successfully used as a <b>scripting middleware</b> in
+games, 3D modellers, numerical simulations, trading platforms and many
+other specialty applications. It combines high flexibility with high
+performance and an unmatched <b>low memory footprint</b>: less than
+<b>120K</b> for the VM plus less than <b>80K</b> for the JIT compiler.
+</p>
+<p>
+LuaJIT has been in continuous development since 2005. It's widely
+considered to be <b>one of the fastest dynamic language
+implementations</b>. It has outperformed other dynamic languages on many
+cross-language benchmarks since its first release &mdash; often by a
+substantial margin. Only now, in 2009, other dynamic language VMs are
+starting to catch up with the performance of LuaJIT 1.x &hellip;
+</p>
+<p>
+2009 also marks the first release of the long-awaited <b>LuaJIT 2.0</b>.
+The whole VM has been rewritten from the ground up and relentlessly
+optimized for performance. It combines a high-speed interpreter,
+written in assembler, with a state-of-the-art JIT compiler.
+</p>
+<p>
+An innovative <b>trace compiler</b> is integrated with advanced,
+SSA-based optimizations and a highly tuned code generation backend. This
+allows a substantial reduction of the overhead associated with dynamic
+language features. It's destined to break into the performance range
+traditionally reserved for offline, static language compilers.
+</p>
+
+<h2>More ...</h2>
+<p>
+Click on the LuaJIT sub-topics in the navigation bar to learn more
+about LuaJIT.
+</p>
+<p>
+Click on the Logo in the upper left corner to visit
+the LuaJIT project page on the web. All other links to online
+resources are marked with a '<span class="ext">&raquo;</span>'.
+</p>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 233 - 0
doc/running.html

@@ -0,0 +1,233 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>Running LuaJIT</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+<style type="text/css">
+table.opt {
+  line-height: 1.2;
+}
+tr.opthead td {
+  font-weight: bold;
+}
+td.flag_name {
+  width: 4em;
+}
+td.flag_level {
+  width: 2em;
+  text-align: center;
+}
+td.param_name {
+  width: 6em;
+}
+td.param_default {
+  width: 4em;
+  text-align: right;
+}
+</style>
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>Running LuaJIT</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a class="current" href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+LuaJIT has only a single stand-alone executable, called <tt>luajit</tt> on
+POSIX systems or <tt>luajit.exe</tt> on Windows. It can be used to run simple
+Lua statements or whole Lua applications from the command line. It has an
+interactive mode, too.
+</p>
+<p class="indent" style="color: #c00000;">
+Note: the beta test releases only install under the versioned name on
+POSIX systems (to avoid overwriting a previous version). You either need
+to type <tt>luajit-2.0.0-beta1</tt> to start it or create a symlink
+with a command like this:
+</p>
+<pre class="code" style="color: #c00000;">
+sudo ln -sf luajit-2.0.0-beta1 /usr/local/bin/luajit
+</pre>
+<p>
+Unlike previous versions <b>optimization is turned on by default</b> in
+LuaJIT 2.0!<br>It's no longer necessary to use <tt>luajit&nbsp;-O</tt>.
+</p>
+
+<h2 id="options">Command Line Options</h2>
+<p>
+The <tt>luajit</tt> stand-alone executable is just a slightly modified
+version of the regular <tt>lua</tt> stand-alone executable.
+It supports the same basic options, too. <tt>luajit&nbsp;-h</tt>
+prints a short list of the available options. Please have a look at the
+<a href="http://www.lua.org/manual/5.1/manual.html#6"><span class="ext">&raquo;</span>&nbsp;Lua manual</a>
+for details.
+</p>
+<p>
+Two additional options control the behavior of LuaJIT:
+</p>
+
+<h3 id="opt_j"><tt>-j cmd[=arg[,arg...]]</tt></h3>
+<p>
+This option performs a LuaJIT control command or activates one of the
+loadable extension modules. The command is first looked up in the
+<tt>jit.*</tt> library. If no matching function is found, a module
+named <tt>jit.&lt;cmd&gt;</tt> is loaded and the <tt>start()</tt>
+function of the module is called with the specified arguments (if
+any). The space between <tt>-j</tt> and <tt>cmd</tt> is optional.
+</p>
+<p>
+Here are the available LuaJIT control commands:
+</p>
+<ul>
+<li id="j_on"><tt>-jon</tt> &mdash; Turns the JIT compiler on (default).</li>
+<li id="j_off"><tt>-joff</tt> &mdash; Turns the JIT compiler off (only use the interpreter).</li>
+<li id="j_flush"><tt>-jflush</tt> &mdash; Flushes the whole cache of compiled code.</li>
+<li id="j_v"><tt>-jv</tt> &mdash; Shows verbose information about the progress of the JIT compiler.</li>
+<li id="j_dump"><tt>-jdump</tt> &mdash; Dumps the code and structures used in various compiler stages.</li>
+</ul>
+<p>
+The <tt>-jv</tt> and <tt>-jdump</tt> commands are extension modules
+written in Lua. They are mainly used for debugging the JIT compiler
+itself. For a description of their options and output format, please
+read the comment block at the start of their source.
+They can be found in the <tt>lib</tt> directory of the source
+distribution or installed under the <tt>jit</tt> directory. By default
+this is <tt>/usr/local/share/luajit-2.0.0-beta1/jit</tt> on POSIX
+systems.
+</p>
+
+<h3 id="opt_O"><tt>-O[level]</tt><br>
+<tt>-O[+]flag</tt> <tt>-O-flag</tt><br>
+<tt>-Oparam=value</tt></h3>
+<p>
+This option allows fine-tuned control of the optimizations used by
+the JIT compiler. This is mainly intended for debugging LuaJIT itself.
+Please note that the JIT compiler is extremely fast (we are talking
+about the microsecond to millisecond range). Disabling optimizations
+doesn't have any visible impact on its overhead, but usually generates
+code that runs slower.
+</p>
+<p>
+The first form sets an optimization level &mdash; this enables a
+specific mix of optimization flags. <tt>-O0</tt> turns off all
+optimizations and higher numbers enable more optimizations. Omitting
+the level (i.e. just <tt>-O</tt>) sets the default optimization level,
+which is <tt>-O3</tt> in the current version.
+</p>
+<p>
+The second form adds or removes individual optimization flags.
+The third form sets a parameter for the VM or the JIT compiler
+to a specific value.
+</p>
+<p>
+You can either use this option multiple times (like <tt>-Ocse
+-O-dce -Ohotloop=10</tt>) or separate several settings with a comma
+(like <tt>-O+cse,-dce,hotloop=10</tt>). The settings are applied from
+left to right and later settings override earlier ones. You can freely
+mix the three forms, but note that setting an optimization level
+overrides all earlier flags.
+</p>
+<p>
+Here are the available flags and at what optimization levels they
+are enabled:
+</p>
+<table class="opt">
+<tr class="opthead">
+<td class="flag_name">Flag</td>
+<td class="flag_level">-O1</td>
+<td class="flag_level">-O2</td>
+<td class="flag_level">-O3</td>
+<td class="flag_desc">&nbsp;</td>
+</tr>
+<tr class="odd separate">
+<td class="flag_name">fold</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_desc">Constant Folding, Simplifications and Reassociation</td></tr>
+<tr class="even">
+<td class="flag_name">cse</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_desc">Common-Subexpression Elimination</td></tr>
+<tr class="odd">
+<td class="flag_name">dce</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_desc">Dead-Code Elimination</td></tr>
+<tr class="even">
+<td class="flag_name">narrow</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_desc">Narrowing of numbers to integers</td></tr>
+<tr class="odd">
+<td class="flag_name">loop</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_level">&bull;</td><td class="flag_desc">Loop Optimizations (code hoisting)</td></tr>
+<tr class="even">
+<td class="flag_name">fwd</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Load Forwarding (L2L) and Store Forwarding (S2L)</td></tr>
+<tr class="odd">
+<td class="flag_name">dse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Dead-Store Elimination</td></tr>
+<tr class="even">
+<td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
+</table>
+<p>
+Here are the parameters and their default settings:
+</p>
+<table class="opt">
+<tr class="opthead">
+<td class="param_name">Parameter</td>
+<td class="param_default">Default</td>
+<td class="param_desc">&nbsp;</td>
+</tr>
+<tr class="odd separate">
+<td class="param_name">maxtrace</td><td class="param_default">1000</td><td class="param_desc">Max. number of traces in the cache</td></tr>
+<tr class="even">
+<td class="param_name">maxrecord</td><td class="param_default">2000</td><td class="param_desc">Max. number of recorded IR instructions</td></tr>
+<tr class="odd">
+<td class="param_name">maxirconst</td><td class="param_default">500</td><td class="param_desc">Max. number of IR constants of a trace</td></tr>
+<tr class="even">
+<td class="param_name">maxside</td><td class="param_default">100</td><td class="param_desc">Max. number of side traces of a root trace</td></tr>
+<tr class="odd">
+<td class="param_name">maxsnap</td><td class="param_default">100</td><td class="param_desc">Max. number of snapshots for a trace</td></tr>
+<tr class="even separate">
+<td class="param_name">hotloop</td><td class="param_default">57</td><td class="param_desc">Number of iterations to detect a hot loop</td></tr>
+<tr class="odd">
+<td class="param_name">hotexit</td><td class="param_default">10</td><td class="param_desc">Number of taken exits to start a side trace</td></tr>
+<tr class="even">
+<td class="param_name">tryside</td><td class="param_default">4</td><td class="param_desc">Number of attempts to compile a side trace</td></tr>
+<tr class="odd separate">
+<td class="param_name">instunroll</td><td class="param_default">4</td><td class="param_desc">Max. unroll factor for instable loops</td></tr>
+<tr class="even">
+<td class="param_name">loopunroll</td><td class="param_default">7</td><td class="param_desc">Max. unroll factor for loop ops in side traces</td></tr>
+<tr class="odd">
+<td class="param_name">callunroll</td><td class="param_default">3</td><td class="param_desc">Max. unroll factor for pseudo-recursive calls</td></tr>
+<tr class="even separate">
+<td class="param_name">sizemcode</td><td class="param_default">32</td><td class="param_desc">Size of each machine code area in KBytes (Windows: 64K)</td></tr>
+<tr class="odd">
+<td class="param_name">maxmcode</td><td class="param_default">512</td><td class="param_desc">Max. total size of all machine code areas in KBytes</td></tr>
+</table>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 235 - 0
doc/status.html

@@ -0,0 +1,235 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>Status &amp; Roadmap</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="Author" content="Mike Pall">
+<meta name="Copyright" content="Copyright (C) 2005-2009, Mike Pall">
+<meta name="Language" content="en">
+<link rel="stylesheet" type="text/css" href="bluequad.css" media="screen">
+<link rel="stylesheet" type="text/css" href="bluequad-print.css" media="print">
+<style type="text/css">
+ul li { padding-bottom: 0.3em; }
+</style>
+</head>
+<body>
+<div id="site">
+<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
+</div>
+<div id="head">
+<h1>Status &amp; Roadmap</h1>
+</div>
+<div id="nav">
+<ul><li>
+<a href="luajit.html">LuaJIT</a>
+<ul><li>
+<a href="install.html">Installation</a>
+</li><li>
+<a href="running.html">Running</a>
+</li><li>
+<a href="api.html">API Extensions</a>
+</li></ul>
+</li><li>
+<a class="current" href="status.html">Status</a>
+<ul><li>
+<a href="changes.html">Changes</a>
+</li></ul>
+</li><li>
+<a href="faq.html">FAQ</a>
+</li><li>
+<a href="http://luajit.org/download.html">Download <span class="ext">&raquo;</span></a>
+</li></ul>
+</div>
+<div id="main">
+<p>
+The <span style="color: #0000c0;">LuaJIT 1.x</span> series represents
+the current <span style="color: #0000c0;">stable branch</span>. As of
+this writing there have been no open bugs for about a year. So, if
+you need a rock-solid VM, you are encouraged to fetch the latest
+release of LuaJIT 1.x from the <a href="http://luajit.org/download.html"><span class="ext">&raquo;</span>&nbsp;Download</a>
+page.
+</p>
+<p>
+<span style="color: #c00000;">LuaJIT 2.0</span> is the currently active
+<span style="color: #c00000;">development branch</span>.
+It has <b>Beta Test</b> status and is still undergoing
+substantial changes. It's expected to quickly mature within the next
+few months. You should definitely start to evaluate it for new projects
+right now. But deploying it in production environments is not yet
+recommended.
+</p>
+
+<h2>Current Status</h2>
+<p>
+This is a list of the things you should know about the LuaJIT 2.0 beta test:
+</p>
+<ul>
+<li>
+The JIT compiler can only generate code for CPUs with <b>SSE2</b> at the
+moment. I.e. you need at least a P4, Core 2/i5/i7 or K8/K10 to use it. I
+plan to fix this during the beta phase and add support for emitting x87
+instructions to the backend.
+</li>
+<li>
+Obviously there will be many <b>bugs</b> in a VM which has been
+rewritten from the ground up. Please report your findings together with
+the circumstances needed to reproduce the bug. If possible, reduce the
+problem down to a simple test case.<br>
+There is no formal bug tracker at the moment. The best place for
+discussion is the
+<a href="http://www.lua.org/lua-l.html"><span class="ext">&raquo;</span>&nbsp;Lua mailing list</a>. Of course
+you may also send your bug report directly to me, especially when it
+contains lengthy debug output. Please check the
+<a href="contact.html">Contact</a> page for details.
+</li>
+<li>
+The VM is complete in the sense that it <b>should</b> run all Lua code
+just fine. It's considered a serious bug if the VM crashes or produces
+unexpected results &mdash; please report it. There are only very few
+known incompatibilities with standard Lua:
+<ul>
+<li>
+The Lua <b>debug API</b> is missing a couple of features (call/return
+hooks) and shows slightly different behavior (no per-coroutine hooks).
+</li>
+<li>
+Most other issues you're likely to find (e.g. with the existing test
+suites) are differences in the <b>implementation-defined</b> behavior.
+These either have a good reason (like early tail call resolving which
+may cause differences in error reporting), are arbitrary design choices
+or are due to quirks in the VM. The latter cases may get fixed if a
+demonstrable need is shown.
+</li>
+</ul>
+</li>
+<li>
+The <b>JIT compiler</b> is not complete (yet) and falls back to the
+interpreter in some cases. All of this works transparently, so unless
+you use -jv, you'll probably never notice (the interpreter is quite
+fast, too). Here are the known issues:
+<ul>
+<li>
+Many known issues cause a <b>NYI</b> (not yet implemented) trace abort
+message. E.g. for calls to vararg functions or many string library
+functions. Reporting these is only mildly useful, except if you have good
+example code that shows the problem. Obviously, reports accompanied with
+a patch to fix the issue are more than welcome. But please check back
+with me, before writing major improvements, to avoid duplication of
+effort.
+</li>
+<li>
+<b>Recursion</b> is not traced yet. Often no trace will be generated at
+all or some unroll limit will catch it and abort the trace.
+</li>
+<li>
+The trace compiler currently does not back off specialization for
+function call dispatch. It should really fall back to specializing on
+the prototype, not the closure identity. This can lead to the so-called
+"trace explosion" problem with <b>closure-heavy programming</b>. The
+trace linking heuristics prevent this, but in the worst case this
+means the code always falls back to the interpreter.
+</li>
+<li>
+<b>Trace management</b> needs more tuning: better blacklisting of aborted
+traces, less drastic countermeasures against trace explosion and better
+heuristics in general.
+</li>
+<li>
+Some checks are missing in the JIT-compiled code for obscure situations
+with <b>open upvalues aliasing</b> one of the SSA slots later on (or
+vice versa). Bonus points, if you can find a real world test case for
+this.
+</li>
+</ul>
+</li>
+</ul>
+
+<h2>Roadmap</h2>
+<p>
+Rather than stating exact release dates (I'm well known for making
+spectacularly wrong guesses), this roadmap lists the general project
+plan, sorted by priority, as well as ideas for the future:
+</p>
+<ul>
+<li>
+The main goal right now is to stabilize LuaJIT 2.0 and get it out of
+beta test. <b>Correctness</b> has priority over completeness. This
+implies the first stable release will certainly NOT compile every
+library function call and will fall back to the interpreter from time
+to time. This is perfectly ok, since it still executes all Lua code,
+just not at the highest possible speed.
+</li>
+<li>
+The next step is to get it to compile more library functions and handle
+more cases where the compiler currently bails out. This doesn't mean it
+will compile every corner case. It's much more important that it
+performs well in a majority of use cases. Every compiler has to make
+these trade-offs &mdash; <b>completeness</b> just cannot be the
+overriding goal for a low-footprint, low-overhead JIT compiler.
+</li>
+<li>
+More <b>optimizations</b> will be added in parallel to the last step on
+an as-needed basis. Array-bounds-check (ABC) removal, sinking of stores
+to aggregates and sinking of allocations are high on the list. Faster
+handling of NEWREF and better alias analysis are desirable, too. More
+complex optimizations with less pay-off, such as value-range-propagation
+(VRP) will have to wait.
+</li>
+<li>
+LuaJIT 2.0 has been designed with <b>portability</b> in mind.
+Nonetheless, it compiles to native code and needs to be adapted to each
+architecture. Porting the compiler backend is probably the easier task,
+but a key element of its design is the fast interpreter, written in
+machine-specific assembler.<br>
+The code base and the internal structures are already prepared for
+easier porting to 64 bit architectures. The most likely next target is a
+port to <b>x64</b>, but this will have to wait until the x86 port
+stabilizes. Other ports will follow &mdash; companies which are
+interested in sponsoring a port to a particular architecture, please
+<a href="contact.html">contact me</a>.
+</li>
+<li>
+There are some planned <b>structural improvements</b> to the compiler,
+like compressed snapshot maps or generic handling of calls to helper
+methods. These are of lesser importance, unless other developments
+elevate their priority.
+</li>
+<li>
+<b>Documentation</b> about the <b>internals</b> of LuaJIT is still sorely
+missing. Although the source code is included and is IMHO well
+commented, many basic design decisions are in need of an explanation.
+The rather un-traditional compiler architecture and the many highly
+optimized data structures are a barrier for outside participation in
+the development. Alas, as I've repeatedly stated, I'm better at
+writing code than papers and I'm not in need of any academical merits.
+Someday I will find the time for it. :-)
+</li>
+<li>
+Producing good code for unbiased branches is a key problem for trace
+compilers. This is the main cause for "trace explosion".
+<b>Hyperblock scheduling</b> promises to solve this nicely at the
+price of a major redesign of the compiler. This would also pave the
+way for emitting predicated instructions, which is a prerequisite
+for efficient <b>vectorization</b>.
+</li>
+<li>
+Currently Lua is missing a standard library for access to <b>structured
+binary data</b> and <b>arrays/buffers</b> holding low-level data types.
+Allowing calls to arbitrary C functions (<b>FFI</b>) would obviate the
+need to write manual bindings. A variety of extension modules is floating
+around, with different scope and capabilities. Alas, none of them has been
+designed with a JIT compiler in mind.
+</li>
+</ul>
+<br class="flush">
+</div>
+<div id="foot">
+<hr class="hide">
+Copyright &copy; 2005-2009 Mike Pall
+<span class="noprint">
+&middot;
+<a href="contact.html">Contact</a>
+</span>
+</div>
+</body>
+</html>

+ 69 - 0
dynasm/dasm_proto.h

@@ -0,0 +1,69 @@
+/*
+** DynASM encoding engine prototypes.
+** Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+** Released under the MIT/X license. See dynasm.lua for full copyright notice.
+*/
+
+#ifndef _DASM_PROTO_H
+#define _DASM_PROTO_H
+
+#include <stddef.h>
+#include <stdarg.h>
+
+#define DASM_IDENT	"DynASM 1.2.1"
+#define DASM_VERSION	10201	/* 1.2.1 */
+
+#ifndef Dst_DECL
+#define Dst_DECL	dasm_State *Dst	/* Overridable: parameter declaration for passing the state. */
+#endif
+
+#ifndef Dst_GET
+#define Dst_GET		(Dst)	/* Overridable: expression yielding the state from Dst. */
+#endif
+
+#ifndef DASM_FDEF
+#define DASM_FDEF	extern	/* Overridable: linkage of the encoder API functions. */
+#endif
+
+
+/* Internal DynASM encoder state. */
+typedef struct dasm_State dasm_State;
+
+/* Action list type. */
+typedef const unsigned char *dasm_ActList;
+
+
+/* Initialize and free DynASM state. */
+DASM_FDEF void dasm_init(Dst_DECL, int maxsection);
+DASM_FDEF void dasm_free(Dst_DECL);
+
+/* Setup global array. Must be called before dasm_setup(). */
+DASM_FDEF void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl);
+
+/* Grow PC label array. Can be called after dasm_setup(), too. */
+DASM_FDEF void dasm_growpc(Dst_DECL, unsigned int maxpc);
+
+/* Setup encoder. */
+DASM_FDEF void dasm_setup(Dst_DECL, dasm_ActList actionlist);
+
+/* Feed encoder with actions. Calls are generated by pre-processor. */
+DASM_FDEF void dasm_put(Dst_DECL, int start, ...);
+
+/* Link sections and return the resulting size. */
+DASM_FDEF int dasm_link(Dst_DECL, size_t *szp);
+
+/* Encode sections into buffer. */
+DASM_FDEF int dasm_encode(Dst_DECL, void *buffer);
+
+/* Get PC label offset. */
+DASM_FDEF int dasm_getpclabel(Dst_DECL, unsigned int pc);
+
+#ifdef DASM_CHECKS
+/* Optional sanity checker to call between isolated encoding steps. */
+DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch);
+#else
+#define dasm_checkstep(a, b)	0	/* Checks disabled: always report success. */
+#endif
+
+
+#endif /* _DASM_PROTO_H */

+ 467 - 0
dynasm/dasm_x86.h

@@ -0,0 +1,467 @@
+/*
+** DynASM x86 encoding engine.
+** Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+** Released under the MIT/X license. See dynasm.lua for full copyright notice.
+*/
+
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define DASM_ARCH		"x86"
+
+#ifndef DASM_EXTERN
+#define DASM_EXTERN(a,b,c,d)	0	/* Default: no extern symbol resolver provided by the embedder. */
+#endif
+
+/* Action definitions. DASM_STOP must be 255. */
+enum {
+  DASM_DISP = 233,	/* Action values below DASM_DISP are literal opcode bytes. */
+  DASM_IMM_S, DASM_IMM_B, DASM_IMM_W, DASM_IMM_D, DASM_IMM_WB, DASM_IMM_DB,
+  DASM_VREG, DASM_SPACE, DASM_SETLABEL, DASM_REL_A, DASM_REL_LG, DASM_REL_PC,
+  DASM_IMM_LG, DASM_IMM_PC, DASM_LABEL_LG, DASM_LABEL_PC, DASM_ALIGN,
+  DASM_EXTERN, DASM_ESC, DASM_MARK, DASM_SECTION, DASM_STOP
+};
+
+/* Maximum number of section buffer positions for a single dasm_put() call. */
+#define DASM_MAXSECPOS		25
+
+/* DynASM encoder status codes. Action list offset or number are or'ed in. */
+#define DASM_S_OK		0x00000000
+#define DASM_S_NOMEM		0x01000000
+#define DASM_S_PHASE		0x02000000
+#define DASM_S_MATCH_SEC	0x03000000
+#define DASM_S_RANGE_I		0x11000000
+#define DASM_S_RANGE_SEC	0x12000000
+#define DASM_S_RANGE_LG		0x13000000
+#define DASM_S_RANGE_PC		0x14000000
+#define DASM_S_RANGE_VREG	0x15000000
+#define DASM_S_UNDEF_L		0x21000000
+#define DASM_S_UNDEF_PC		0x22000000
+
+/* Macros to convert positions (8 bit section + 24 bit index). */
+#define DASM_POS2IDX(pos)	((pos)&0x00ffffff)
+#define DASM_POS2BIAS(pos)	((pos)&0xff000000)
+#define DASM_SEC2POS(sec)	((sec)<<24)
+#define DASM_POS2SEC(pos)	((pos)>>24)
+#define DASM_POS2PTR(D, pos)	(D->sections[DASM_POS2SEC(pos)].rbuf + (pos))
+
+/* Per-section structure. */
+typedef struct dasm_Section {
+  int *rbuf;		/* Biased buffer pointer (negative section bias). */
+  int *buf;		/* True buffer pointer. */
+  size_t bsize;		/* Buffer size in bytes. */
+  int pos;		/* Biased buffer position. */
+  int epos;		/* End of biased buffer position - max single put. */
+  int ofs;		/* Byte offset into section. */
+} dasm_Section;
+
+/* Core structure holding the DynASM encoding state. */
+struct dasm_State {
+  size_t psize;			/* Allocated size of this structure. */
+  dasm_ActList actionlist;	/* Current actionlist pointer. */
+  int *lglabels;		/* Local/global chain/pos ptrs. */
+  size_t lgsize;
+  int *pclabels;		/* PC label chains/pos ptrs. */
+  size_t pcsize;
+  void **globals;		/* Array of globals (bias -10). */
+  dasm_Section *section;	/* Pointer to active section. */
+  size_t codesize;		/* Total size of all code sections. */
+  int maxsection;		/* 0 <= sectionidx < maxsection. */
+  int status;			/* Status code. */
+  dasm_Section sections[1];	/* All sections. Alloc-extended. */
+};
+
+/* The size of the core structure depends on the max. number of sections. */
+#define DASM_PSZ(ms)	(sizeof(dasm_State)+(ms-1)*sizeof(dasm_Section))
+
+
+/* Initialize DynASM state: allocate the core structure for maxsection sections. */
+void dasm_init(Dst_DECL, int maxsection)
+{
+  dasm_State *D;
+  size_t psz = 0;
+  int i;
+  Dst_REF = NULL;
+  DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));  /* DASM_M_GROW/Dst_REF are supplied by the embedder. */
+  D = Dst_REF;
+  D->psize = psz;
+  D->lglabels = NULL;
+  D->lgsize = 0;
+  D->pclabels = NULL;
+  D->pcsize = 0;
+  D->globals = NULL;
+  D->maxsection = maxsection;
+  for (i = 0; i < maxsection; i++) {
+    D->sections[i].buf = NULL;  /* Need this for pass3. */
+    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);  /* Biased pointer: index with absolute positions. */
+    D->sections[i].bsize = 0;
+    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
+  }
+}
+
+/* Free DynASM state: release all section buffers, label arrays and the state itself. */
+void dasm_free(Dst_DECL)
+{
+  dasm_State *D = Dst_REF;
+  int i;
+  for (i = 0; i < D->maxsection; i++)
+    if (D->sections[i].buf)
+      DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize);
+  if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize);
+  if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize);
+  DASM_M_FREE(Dst, D, D->psize);  /* Free the core structure last. */
+}
+
+/* Setup global label array. Must be called before dasm_setup(). */
+void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
+{
+  dasm_State *D = Dst_REF;
+  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
+  DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));  /* Slots 1..9 are local labels, 10+ are globals. */
+}
+
+/* Grow PC label array. Can be called after dasm_setup(), too. */
+void dasm_growpc(Dst_DECL, unsigned int maxpc)
+{
+  dasm_State *D = Dst_REF;
+  size_t osz = D->pcsize;
+  DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc*sizeof(int));
+  memset((void *)(((unsigned char *)D->pclabels)+osz), 0, D->pcsize-osz);  /* Zero only the newly added label slots. */
+}
+
+/* Setup encoder: reset status, labels and section positions for a fresh encoding run. */
+void dasm_setup(Dst_DECL, dasm_ActList actionlist)
+{
+  dasm_State *D = Dst_REF;
+  int i;
+  D->actionlist = actionlist;
+  D->status = DASM_S_OK;
+  D->section = &D->sections[0];  /* Section 0 is active by default. */
+  memset((void *)D->lglabels, 0, D->lgsize);
+  if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
+  for (i = 0; i < D->maxsection; i++) {
+    D->sections[i].pos = DASM_SEC2POS(i);
+    D->sections[i].ofs = 0;
+  }
+}
+
+
+#ifdef DASM_CHECKS  /* Optional range checks: set D->status with the action list offset and bail out. */
+#define CK(x, st) \
+  do { if (!(x)) { \
+    D->status = DASM_S_##st|(p-D->actionlist-1); return; } } while (0)
+#define CKPL(kind, st) \
+  do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
+    D->status = DASM_S_RANGE_##st|(p-D->actionlist-1); return; } } while (0)
+#else
+#define CK(x, st)	((void)0)	/* Checks compiled out. */
+#define CKPL(kind, st)	((void)0)	/* Checks compiled out. */
+#endif
+
+/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */
+void dasm_put(Dst_DECL, int start, ...)
+{
+  va_list ap;
+  dasm_State *D = Dst_REF;
+  dasm_ActList p = D->actionlist + start;
+  dasm_Section *sec = D->section;
+  int pos = sec->pos, ofs = sec->ofs, mrm = 4;  /* mrm: ModRM info for DISP handling (4 = none pending). */
+  int *b;
+
+  if (pos >= sec->epos) {  /* Grow buffer to hold a worst-case single dasm_put(). */
+    DASM_M_GROW(Dst, int, sec->buf, sec->bsize,
+      sec->bsize + 2*DASM_MAXSECPOS*sizeof(int));
+    sec->rbuf = sec->buf - DASM_POS2BIAS(pos);
+    sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos);
+  }
+
+  b = sec->rbuf;
+  b[pos++] = start;  /* First slot records the action list offset. */
+
+  va_start(ap, start);
+  while (1) {
+    int action = *p++;
+    if (action < DASM_DISP) {
+      ofs++;  /* Literal opcode byte. */
+    } else if (action <= DASM_REL_A) {
+      int n = va_arg(ap, int);
+      b[pos++] = n;
+      switch (action) {
+      case DASM_DISP:
+	if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; }
+      case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob;
+      case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */
+      case DASM_IMM_D: ofs += 4; break;
+      case DASM_IMM_S: CK(((n+128)&-256) == 0, RANGE_I); goto ob;
+      case DASM_IMM_B: CK((n&-256) == 0, RANGE_I); ob: ofs++; break;
+      case DASM_IMM_WB: if (((n+128)&-256) == 0) goto ob;
+      case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break;
+      case DASM_SPACE: p++; ofs += n; break;
+      case DASM_SETLABEL: b[pos-2] = -0x40000000; break;  /* Neg. label ofs. */
+      case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG);
+	if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue;
+      }
+      mrm = 4;
+    } else {
+      int *pl, n;
+      switch (action) {
+      case DASM_REL_LG:
+      case DASM_IMM_LG:
+	n = *p++; pl = D->lglabels + n;
+	if (n <= 246) { CKPL(lg, LG); goto putrel; }  /* Bkwd rel or global. */
+	pl -= 246; n = *pl;
+	if (n < 0) n = 0;  /* Start new chain for fwd rel if label exists. */
+	goto linkrel;
+      case DASM_REL_PC:
+      case DASM_IMM_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
+      putrel:
+	n = *pl;
+	if (n < 0) {  /* Label exists. Get label pos and store it. */
+	  b[pos] = -n;
+	} else {
+      linkrel:
+	  b[pos] = n;  /* Else link to rel chain, anchored at label. */
+	  *pl = pos;
+	}
+	pos++;
+	ofs += 4;  /* Maximum offset needed. */
+	if (action == DASM_REL_LG || action == DASM_REL_PC)
+	  b[pos++] = ofs;  /* Store pass1 offset estimate. */
+	break;
+      case DASM_LABEL_LG: pl = D->lglabels + *p++; CKPL(lg, LG); goto putlabel;
+      case DASM_LABEL_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
+      putlabel:
+	n = *pl;  /* n > 0: Collapse rel chain and replace with label pos. */
+	while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos; }
+	*pl = -pos;  /* Label exists now. */
+	b[pos++] = ofs;  /* Store pass1 offset estimate. */
+	break;
+      case DASM_ALIGN:
+	ofs += *p++;  /* Maximum alignment needed (arg is 2**n-1). */
+	b[pos++] = ofs;  /* Store pass1 offset estimate. */
+	break;
+      case DASM_EXTERN: p += 2; ofs += 4; break;
+      case DASM_ESC: p++; ofs++; break;
+      case DASM_MARK: mrm = p[-2]; break;
+      case DASM_SECTION:
+	n = *p; CK(n < D->maxsection, RANGE_SEC); D->section = &D->sections[n];
+      case DASM_STOP: goto stop;
+      }
+    }
+  }
+stop:
+  va_end(ap);
+  sec->pos = pos;
+  sec->ofs = ofs;
+}
+#undef CK
+
+/* Pass 2: Link sections, shrink branches/aligns, fix label offsets. */
+int dasm_link(Dst_DECL, size_t *szp)
+{
+  dasm_State *D = Dst_REF;
+  int secnum;
+  int ofs = 0;  /* Cumulative offset; reduced as branches shrink. */
+
+#ifdef DASM_CHECKS
+  *szp = 0;
+  if (D->status != DASM_S_OK) return D->status;
+  {
+    int pc;
+    for (pc = 0; pc*sizeof(int) < D->pcsize; pc++)
+      if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc;  /* pos > 0 means still an open fwd chain. */
+  }
+#endif
+
+  { /* Handle globals not defined in this translation unit. */
+    int idx;
+    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
+      int n = D->lglabels[idx];
+      /* Undefined label: Collapse rel chain and replace with marker (< 0). */
+      while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
+    }
+  }
+
+  /* Combine all code sections. No support for data sections (yet). */
+  for (secnum = 0; secnum < D->maxsection; secnum++) {
+    dasm_Section *sec = D->sections + secnum;
+    int *b = sec->rbuf;
+    int pos = DASM_SEC2POS(secnum);
+    int lastpos = sec->pos;
+
+    while (pos != lastpos) {
+      dasm_ActList p = D->actionlist + b[pos++];
+      while (1) {
+	int op, action = *p++;
+	switch (action) {
+	case DASM_REL_LG: p++; op = p[-3]; goto rel_pc;
+	case DASM_REL_PC: op = p[-2]; rel_pc: {
+	  int shrink = op == 0xe9 ? 3 : ((op&0xf0) == 0x80 ? 4 : 0);
+	  if (shrink) {  /* Shrinkable branch opcode? */
+	    int lofs, lpos = b[pos];
+	    if (lpos < 0) goto noshrink;  /* Ext global? */
+	    lofs = *DASM_POS2PTR(D, lpos);
+	    if (lpos > pos) {  /* Fwd label: add cumulative section offsets. */
+	      int i;
+	      for (i = secnum; i < DASM_POS2SEC(lpos); i++)
+		lofs += D->sections[i].ofs;
+	    } else {
+	      lofs -= ofs;  /* Bkwd label: unfix offset. */
+	    }
+	    lofs -= b[pos+1];  /* Short branch ok? */
+	    if (lofs >= -128-shrink && lofs <= 127) ofs -= shrink;  /* Yes. */
+	    else { noshrink: shrink = 0; }  /* No, cannot shrink op. */
+	  }
+	  b[pos+1] = shrink;  /* Record shrink amount for pass 3. */
+	  pos += 2;
+	  break;
+	}
+	case DASM_SPACE: case DASM_IMM_LG: case DASM_VREG: p++;
+	case DASM_DISP: case DASM_IMM_S: case DASM_IMM_B: case DASM_IMM_W:
+	case DASM_IMM_D: case DASM_IMM_WB: case DASM_IMM_DB:
+	case DASM_SETLABEL: case DASM_REL_A: case DASM_IMM_PC: pos++; break;
+	case DASM_LABEL_LG: p++;
+	case DASM_LABEL_PC: b[pos++] += ofs; break; /* Fix label offset. */
+	case DASM_ALIGN: ofs -= (b[pos++]+ofs)&*p++; break; /* Adjust ofs. */
+	case DASM_EXTERN: p += 2; break;
+	case DASM_ESC: p++; break;
+	case DASM_MARK: break;
+	case DASM_SECTION: case DASM_STOP: goto stop;
+	}
+      }
+      stop: (void)0;
+    }
+    ofs += sec->ofs;  /* Next section starts right after current section. */
+  }
+
+  D->codesize = ofs;  /* Total size of all code sections */
+  *szp = ofs;
+  return DASM_S_OK;
+}
+
+#define dasmb(x)	*cp++ = (unsigned char)(x)	/* Emit one byte at cp. */
+#ifndef DASM_ALIGNED_WRITES
+#define dasmw(x) \
+  do { *((unsigned short *)cp) = (unsigned short)(x); cp+=2; } while (0)
+#define dasmd(x) \
+  do { *((unsigned int *)cp) = (unsigned int)(x); cp+=4; } while (0)
+#else
+#define dasmw(x)	do { dasmb(x); dasmb((x)>>8); } while (0)	/* Byte-wise for strict-alignment targets. */
+#define dasmd(x)	do { dasmw(x); dasmw((x)>>16); } while (0)	/* Byte-wise for strict-alignment targets. */
+#endif
+
+/* Pass 3: Encode sections into the caller-provided buffer. */
+int dasm_encode(Dst_DECL, void *buffer)
+{
+  dasm_State *D = Dst_REF;
+  unsigned char *base = (unsigned char *)buffer;
+  unsigned char *cp = base;
+  int secnum;
+
+  /* Encode all code sections. No support for data sections (yet). */
+  for (secnum = 0; secnum < D->maxsection; secnum++) {
+    dasm_Section *sec = D->sections + secnum;
+    int *b = sec->buf;
+    int *endb = sec->rbuf + sec->pos;
+
+    while (b != endb) {
+      dasm_ActList p = D->actionlist + *b++;
+      unsigned char *mark = NULL;  /* Position used for ModRM mode fixups (set by DASM_MARK/DISP). */
+      while (1) {
+	int action = *p++;
+	int n = (action >= DASM_DISP && action <= DASM_ALIGN) ? *b++ : 0;
+	switch (action) {
+	case DASM_DISP: if (!mark) mark = cp; {
+	  unsigned char *mm = mark;
+	  if (*p != DASM_IMM_DB && *p != DASM_IMM_WB) mark = NULL;
+	  if (n == 0) { int mrm = mm[-1]&7; if (mrm == 4) mrm = mm[0]&7;
+	    if (mrm != 5) { mm[-1] -= 0x80; break; } }
+	  if (((n+128) & -256) != 0) goto wd; else mm[-1] -= 0x40;
+	}
+	case DASM_IMM_S: case DASM_IMM_B: wb: dasmb(n); break;
+	case DASM_IMM_DB: if (((n+128)&-256) == 0) {
+	    db: if (!mark) mark = cp; mark[-2] += 2; mark = NULL; goto wb;
+	  } else mark = NULL;
+	case DASM_IMM_D: wd: dasmd(n); break;
+	case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL;
+	case DASM_IMM_W: dasmw(n); break;
+	case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; }
+	case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
+	  b++; n = (int)(ptrdiff_t)D->globals[-n];
+	case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */
+	case DASM_REL_PC: rel_pc: {
+	  int shrink = *b++;  /* Shrink amount decided in pass 2. */
+	  int *pb = DASM_POS2PTR(D, n); if (*pb < 0) { n = pb[1]; goto rel_a; }
+	  n = *pb - ((int)(cp-base) + 4-shrink);
+	  if (shrink == 0) goto wd;
+	  if (shrink == 4) { cp--; cp[-1] = *cp-0x10; } else cp[-1] = 0xeb;
+	  goto wb;
+	}
+	case DASM_IMM_LG:
+	  p++; if (n < 0) { n = (int)(ptrdiff_t)D->globals[-n]; goto wd; }
+	case DASM_IMM_PC: {
+	  int *pb = DASM_POS2PTR(D, n);
+	  n = *pb < 0 ? pb[1] : (*pb + (int)(ptrdiff_t)base);
+	  goto wd;
+	}
+	case DASM_LABEL_LG: {
+	  int idx = *p++;
+	  if (idx >= 10)
+	    D->globals[idx] = (void *)(base + (*p == DASM_SETLABEL ? *b : n));
+	  break;
+	}
+	case DASM_LABEL_PC: case DASM_SETLABEL: break;
+	case DASM_SPACE: { int fill = *p++; while (n--) *cp++ = fill; break; }
+	case DASM_ALIGN:
+	  n = *p++;
+	  while (((cp-base) & n)) *cp++ = 0x90; /* nop */
+	  break;
+	case DASM_EXTERN: n = DASM_EXTERN(Dst, cp, p[1], *p); p += 2; goto wd;
+	case DASM_MARK: mark = cp; break;
+	case DASM_ESC: action = *p++;
+	default: *cp++ = action; break;  /* Literal opcode byte. */
+	case DASM_SECTION: case DASM_STOP: goto stop;
+	}
+      }
+      stop: (void)0;
+    }
+  }
+
+  if (base + D->codesize != cp)  /* Check for phase errors. */
+    return DASM_S_PHASE;
+  return DASM_S_OK;
+}
+
+/* Get PC label offset. Returns the offset, -1 if undefined, -2 if unused/out of range. */
+int dasm_getpclabel(Dst_DECL, unsigned int pc)
+{
+  dasm_State *D = Dst_REF;
+  if (pc*sizeof(int) < D->pcsize) {
+    int pos = D->pclabels[pc];
+    if (pos < 0) return *DASM_POS2PTR(D, -pos);  /* Defined labels are stored as negative positions. */
+    if (pos > 0) return -1;  /* Undefined. */
+  }
+  return -2;  /* Unused or out of range. */
+}
+
+#ifdef DASM_CHECKS
+/* Optional sanity checker to call between isolated encoding steps. */
+int dasm_checkstep(Dst_DECL, int secmatch)
+{
+  dasm_State *D = Dst_REF;
+  if (D->status == DASM_S_OK) {
+    int i;
+    for (i = 1; i <= 9; i++) {  /* Local labels 1-9 must be resolved within a step. */
+      if (D->lglabels[i] > 0) { D->status = DASM_S_UNDEF_L|i; break; }
+      D->lglabels[i] = 0;
+    }
+  }
+  if (D->status == DASM_S_OK && secmatch >= 0 &&
+      D->section != &D->sections[secmatch])
+    D->status = DASM_S_MATCH_SEC|(D->section-D->sections);  /* Wrong active section. */
+  return D->status;
+}
+#endif
+

+ 1799 - 0
dynasm/dasm_x86.lua

@@ -0,0 +1,1799 @@
+------------------------------------------------------------------------------
+-- DynASM x86 module.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- See dynasm.lua for full copyright notice.
+------------------------------------------------------------------------------
+
+-- Module information:
+local _info = {
+  arch =	"x86",
+  description =	"DynASM x86 (i386) module",
+  version =	"1.2.1",
+  vernum =	 10201,
+  release =	"2009-04-16",
+  author =	"Mike Pall",
+  license =	"MIT",
+}
+
+-- Exported glue functions for the arch-specific module.
+local _M = { _info = _info }
+
+-- Cache library functions.
+local type, tonumber, pairs, ipairs = type, tonumber, pairs, ipairs
+local assert, unpack = assert, unpack
+local _s = string
+local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
+local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
+local concat, sort = table.concat, table.sort
+-- NOTE(review): `char' and `unpack' are already cached above -- this
+-- re-declaration is redundant (harmless shadowing; could be dropped).
+local char, unpack = string.char, unpack
+
+-- Inherited tables and callbacks.
+local g_opt, g_arch
+local wline, werror, wfatal, wwarn
+
+-- Action name list.
+-- CHECK: Keep this in sync with the C code!
+local action_names = {
+  -- int arg, 1 buffer pos:
+  "DISP",  "IMM_S", "IMM_B", "IMM_W", "IMM_D",  "IMM_WB", "IMM_DB",
+  -- action arg (1 byte), int arg, 1 buffer pos (reg/num):
+  "VREG", "SPACE",
+  -- ptrdiff_t arg, 1 buffer pos (address): !x64
+  "SETLABEL", "REL_A",
+  -- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
+  "REL_LG", "REL_PC",
+  -- action arg (1 byte) or int arg, 1 buffer pos (link):
+  "IMM_LG", "IMM_PC",
+  -- action arg (1 byte) or int arg, 1 buffer pos (offset):
+  "LABEL_LG", "LABEL_PC",
+  -- action arg (1 byte), 1 buffer pos (offset):
+  "ALIGN",
+  -- action args (2 bytes), no buffer pos.
+  "EXTERN",
+  -- action arg (1 byte), no buffer pos.
+  "ESC",
+  -- no action arg, no buffer pos.
+  "MARK",
+  -- action arg (1 byte), no buffer pos, terminal action:
+  "SECTION",
+  -- no args, no buffer pos, terminal action:
+  "STOP"
+}
+
+-- Maximum number of section buffer positions for dasm_put().
+-- CHECK: Keep this in sync with the C code!
+local maxsecpos = 25 -- Keep this low, to avoid excessively long C lines.
+
+-- Action name -> action number (dynamically generated below).
+local map_action = {}
+-- First action number. Everything below does not need to be escaped.
+local actfirst = 256-#action_names
+
+-- Action list buffer and string (only used to remove dupes).
+local actlist = {}
+local actstr = ""
+
+-- Argument list for next dasm_put(). Start with offset 0 into action list.
+local actargs = { 0 }
+
+-- Current number of section buffer positions for dasm_put().
+local secpos = 1
+
+------------------------------------------------------------------------------
+
+-- Compute action numbers for action names.
+for n,name in ipairs(action_names) do
+  local num = actfirst + n - 1
+  map_action[name] = num
+end
+
+-- Dump action names and numbers.
+local function dumpactions(out)
+  out:write("DynASM encoding engine action codes:\n")
+  for n,name in ipairs(action_names) do
+    local num = map_action[name]
+    out:write(format("  %-10s %02X  %d\n", name, num, num))
+  end
+  out:write("\n")
+end
+
+-- Write action list buffer as a huge static C array.
+-- The final byte is held back and emitted together with the closing
+-- brace, so the initializer list ends without a trailing comma.
+local function writeactions(out, name)
+  local nn = #actlist
+  local last = actlist[nn] or 255
+  actlist[nn] = nil -- Remove last byte.
+  if nn == 0 then nn = 1 end
+  out:write("static const unsigned char ", name, "[", nn, "] = {\n")
+  local s = "  "
+  for n,b in ipairs(actlist) do
+    s = s..b..","
+    if #s >= 75 then
+      assert(out:write(s, "\n"))
+      s = "  "
+    end
+  end
+  out:write(s, last, "\n};\n\n") -- Add last byte back.
+end
+
+------------------------------------------------------------------------------
+
+-- Add byte to action list.
+local function wputxb(n)
+  assert(n >= 0 and n <= 255 and n % 1 == 0, "byte out of range")
+  actlist[#actlist+1] = n
+end
+
+-- Add action to list with optional arg. Advance buffer pos, too.
+-- An arg (or an explicit num) consumes one or more section buffer
+-- positions in the generated dasm_put() call (bounded by maxsecpos).
+local function waction(action, a, num)
+  wputxb(assert(map_action[action], "bad action name `"..action.."'"))
+  if a then actargs[#actargs+1] = a end
+  if a or num then secpos = secpos + (num or 1) end
+end
+
+-- Add call to embedded DynASM C code.
+local function wcall(func, args)
+  wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true)
+end
+
+-- Delete duplicate action list chunks. A tad slow, but so what.
+-- The freshly appended chunk is searched (as a plain string) in the
+-- previously emitted action stream; on a hit, the offset argument is
+-- redirected to the original copy and the duplicate is removed.
+local function dedupechunk(offset)
+  local al, as = actlist, actstr
+  local chunk = char(unpack(al, offset+1, #al))
+  local orig = find(as, chunk, 1, true)
+  if orig then
+    actargs[1] = orig-1 -- Replace with original offset.
+    for i=offset+1,#al do al[i] = nil end -- Kill dupe.
+  else
+    actstr = as..chunk
+  end
+end
+
+-- Flush action list (intervening C code or buffer pos overflow).
+local function wflush(term)
+  local offset = actargs[1]
+  if #actlist == offset then return end -- Nothing to flush.
+  if not term then waction("STOP") end -- Terminate action list.
+  dedupechunk(offset)
+  wcall("put", actargs) -- Add call to dasm_put().
+  actargs = { #actlist } -- Actionlist offset is 1st arg to next dasm_put().
+  secpos = 1 -- The actionlist offset occupies a buffer position, too.
+end
+
+-- Put escaped byte.
+local function wputb(n)
+  if n >= actfirst then waction("ESC") end -- Need to escape byte.
+  wputxb(n)
+end
+
+------------------------------------------------------------------------------
+
+-- Global label name -> global label number. With auto assignment on 1st use.
+-- Globals occupy numbers 10..246; 1-9 and 247-255 are taken by
+-- backward/forward local label references (see immexpr()).
+local next_global = 10
+local map_global = setmetatable({}, { __index = function(t, name)
+  if not match(name, "^[%a_][%w_]*$") then werror("bad global label") end
+  local n = next_global
+  if n > 246 then werror("too many global labels") end
+  next_global = n + 1
+  t[name] = n
+  return n
+end})
+
+-- Dump global labels.
+local function dumpglobals(out, lvl)
+  local t = {}
+  for name, n in pairs(map_global) do t[n] = name end
+  out:write("Global labels:\n")
+  for i=10,next_global-1 do
+    out:write(format("  %s\n", t[i]))
+  end
+  out:write("\n")
+end
+
+-- Write global label enum.
+local function writeglobals(out, prefix)
+  local t = {}
+  for name, n in pairs(map_global) do t[n] = name end
+  out:write("enum {\n")
+  for i=10,next_global-1 do
+    out:write("  ", prefix, t[i], ",\n")
+  end
+  out:write("  ", prefix, "_MAX\n};\n")
+end
+
+-- Write global label names.
+local function writeglobalnames(out, name)
+  local t = {}
+  for name, n in pairs(map_global) do t[n] = name end
+  out:write("static const char *const ", name, "[] = {\n")
+  for i=10,next_global-1 do
+    out:write("  \"", t[i], "\",\n")
+  end
+  out:write("  (const char *)0\n};\n")
+end
+
+------------------------------------------------------------------------------
+
+-- Extern label name -> extern label number. With auto assignment on 1st use.
+-- Externs are numbered -1 down to -256; tables below are indexed with
+-- the negated number.
+local next_extern = -1
+local map_extern = setmetatable({}, { __index = function(t, name)
+  -- No restrictions on the name for now.
+  local n = next_extern
+  if n < -256 then werror("too many extern labels") end
+  next_extern = n - 1
+  t[name] = n
+  return n
+end})
+
+-- Dump extern labels.
+local function dumpexterns(out, lvl)
+  local t = {}
+  for name, n in pairs(map_extern) do t[-n] = name end
+  out:write("Extern labels:\n")
+  for i=1,-next_extern-1 do
+    out:write(format("  %s\n", t[i]))
+  end
+  out:write("\n")
+end
+
+-- Write extern label names.
+local function writeexternnames(out, name)
+  local t = {}
+  for name, n in pairs(map_extern) do t[-n] = name end
+  out:write("static const char *const ", name, "[] = {\n")
+  for i=1,-next_extern-1 do
+    out:write("  \"", t[i], "\",\n")
+  end
+  out:write("  (const char *)0\n};\n")
+end
+
+------------------------------------------------------------------------------
+
+-- Arch-specific maps.
+local map_archdef = {}		-- Ext. register name -> int. name.
+local map_reg_rev = {}		-- Int. register name -> ext. name.
+local map_reg_num = {}		-- Int. register name -> register number.
+local map_reg_opsize = {}	-- Int. register name -> operand size.
+local map_reg_valid_base = {}	-- Int. register name -> valid base register?
+local map_reg_valid_index = {}	-- Int. register name -> valid index register?
+local reg_list = {}		-- Canonical list of int. register names.
+
+local map_type = {}		-- Type name -> { ctype, reg }
+local ctypenum = 0		-- Type number (for _PTx macros).
+
+local addrsize = "d"		-- Size for address operands. !x64
+
+-- Helper function to fill register maps.
+-- sz: operand size code ("b", "w", "d", ...); cl: register class name
+-- (e.g. "Rd"); names: register names for numbers 0-7. The class maps
+-- to internal name "@<sz>" with number -1 (variable register); each
+-- named register maps to "@<sz><hexnum>". Only registers of address
+-- size become valid base/index registers. An empty string is appended
+-- to reg_list as a group separator (used by dumpregs()).
+local function mkrmap(sz, cl, names)
+  local cname = format("@%s", sz)
+  reg_list[#reg_list+1] = cname
+  map_archdef[cl] = cname
+  map_reg_rev[cname] = cl
+  map_reg_num[cname] = -1
+  map_reg_opsize[cname] = sz
+  if sz == addrsize then
+    map_reg_valid_base[cname] = true
+    map_reg_valid_index[cname] = true
+  end
+  for n,name in ipairs(names) do
+    local iname = format("@%s%x", sz, n-1)
+    reg_list[#reg_list+1] = iname
+    map_archdef[name] = iname
+    map_reg_rev[iname] = name
+    map_reg_num[iname] = n-1
+    map_reg_opsize[iname] = sz
+    if sz == addrsize then
+      map_reg_valid_base[iname] = true
+      map_reg_valid_index[iname] = true
+    end
+  end
+  reg_list[#reg_list+1] = ""
+end
+
+-- Integer registers (dword, word and byte sized).
+mkrmap("d", "Rd", {"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"})
+map_reg_valid_index[map_archdef.esp] = false
+mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
+mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
+map_archdef["Ra"] = "@"..addrsize
+
+-- FP registers (internally tword sized, but use "f" as operand size).
+mkrmap("f", "Rf", {"st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7"})
+
+-- SSE registers (oword sized, but qword and dword accessible).
+mkrmap("o", "xmm", {"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"})
+
+-- Operand size prefixes to codes.
+local map_opsize = {
+  byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t",
+  aword = addrsize,
+}
+
+-- Operand size code to number.
+local map_opsizenum = {
+  b = 1, w = 2, d = 4, q = 8, o = 16, t = 10,
+}
+
+-- Operand size code to name.
+local map_opsizename = {
+  b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword",
+  f = "fpword",
+}
+
+-- Valid index register scale factors.
+local map_xsc = {
+  ["1"] = 0, ["2"] = 1, ["4"] = 2, ["8"] = 3,
+}
+
+-- Condition codes.
+local map_cc = {
+  o = 0, no = 1, b = 2, nb = 3, e = 4, ne = 5, be = 6, nbe = 7,
+  s = 8, ns = 9, p = 10, np = 11, l = 12, nl = 13, le = 14, nle = 15,
+  c = 2, nae = 2, nc = 3, ae = 3, z = 4, nz = 5, na = 6, a = 7,
+  pe = 10, po = 11, nge = 12, ge = 13, ng = 14, g = 15,
+}
+
+
+-- Reverse defines for registers.
+-- Substitutes internal "@..." names back to external register names.
+function _M.revdef(s)
+  return gsub(s, "@%w+", map_reg_rev)
+end
+
+-- Dump register names and numbers.
+local function dumpregs(out)
+  out:write("Register names, sizes and internal numbers:\n")
+  for _,reg in ipairs(reg_list) do
+    if reg == "" then
+      out:write("\n")
+    else
+      local name = map_reg_rev[reg]
+      local num = map_reg_num[reg]
+      local opsize = map_opsizename[map_reg_opsize[reg]]
+      out:write(format("  %-5s %-8s %s\n", name, opsize,
+		       num < 0 and "(variable)" or num))
+    end
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Put action for label arg (IMM_LG, IMM_PC, REL_LG, REL_PC).
+local function wputlabel(aprefix, imm, num)
+  if type(imm) == "number" then
+    if imm < 0 then
+      -- Negative numbers are extern labels. The extra selector byte is
+      -- 0 for IMM_ and 1 for REL_ prefixes.
+      waction("EXTERN")
+      wputxb(aprefix == "IMM_" and 0 or 1)
+      imm = -imm-1
+    else
+      waction(aprefix.."LG", nil, num);
+    end
+    wputxb(imm)
+  else
+    waction(aprefix.."PC", imm, num)
+  end
+end
+
+-- Put signed byte or arg.
+local function wputsbarg(n)
+  if type(n) == "number" then
+    if n < -128 or n > 127 then
+      werror("signed immediate byte out of range")
+    end
+    if n < 0 then n = n + 256 end
+    wputb(n)
+  else waction("IMM_S", n) end
+end
+
+-- Put unsigned byte or arg.
+local function wputbarg(n)
+  if type(n) == "number" then
+    if n < 0 or n > 255 then
+      werror("unsigned immediate byte out of range")
+    end
+    wputb(n)
+  else waction("IMM_B", n) end
+end
+
+-- Put unsigned word or arg.
+local function wputwarg(n)
+  if type(n) == "number" then
+    if n < 0 or n > 65535 then
+      werror("unsigned immediate word out of range")
+    end
+    local r = n%256; n = (n-r)/256; wputb(r); wputb(n);
+  else waction("IMM_W", n) end
+end
+
+-- Put signed or unsigned dword or arg.
+local function wputdarg(n)
+  local tn = type(n)
+  if tn == "number" then
+    if n < 0 then n = n + 4294967296 end
+    local r = n%256; n = (n-r)/256; wputb(r);
+    r = n%256; n = (n-r)/256; wputb(r);
+    r = n%256; n = (n-r)/256; wputb(r); wputb(n);
+  elseif tn == "table" then
+    -- A table wraps a label reference (see dispexpr()).
+    wputlabel("IMM_", n[1], 1)
+  else
+    waction("IMM_D", n)
+  end
+end
+
+-- Put operand-size dependent number or arg (defaults to dword).
+local function wputszarg(sz, n)
+  if not sz or sz == "d" then wputdarg(n)
+  elseif sz == "w" then wputwarg(n)
+  elseif sz == "b" then wputbarg(n)
+  elseif sz == "s" then wputsbarg(n)
+  else werror("bad operand size") end
+end
+
+-- Put multi-byte opcode with operand-size dependent modifications.
+-- "w" prepends the 0x66 operand-size prefix (102 = 0x66); "b"
+-- decrements the final opcode byte by one.
+local function wputop(sz, op)
+  local r
+  if sz == "w" then wputb(102) end
+  -- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
+  if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
+  if op >= 16777216 then r = op % 16777216 wputb((op-r) / 16777216) op = r end
+  if op >= 65536 then r = op % 65536 wputb((op-r) / 65536) op = r end
+  if op >= 256 then r = op % 256 wputb((op-r) / 256) op = r end
+  if sz == "b" then op = op - 1 end
+  wputb(op)
+end
+
+-- Put ModRM or SIB formatted byte: mod in bits 7-6, reg/spare in 5-3,
+-- r/m in 2-0.
+local function wputmodrm(m, s, rm, vs, vrm)
+  assert(m < 4 and s < 8 and rm < 8, "bad modrm operands")
+  wputb(64*m + 8*s + rm)
+end
+
+-- Put ModRM/SIB plus optional displacement.
+-- t: parsed operand (see parseoperand()); imark: request a MARK action
+-- at the position the C code may need to patch; s: value for the ModRM
+-- spare (reg/opcode extension) field; vsreg: variable register action
+-- for the spare field. Register numbers < 0 denote variable registers,
+-- patched at runtime via VREG actions (the wputxb() after each VREG is
+-- the field selector).
+local function wputmrmsib(t, imark, s, vsreg)
+  local vreg, vxreg
+  local reg, xreg = t.reg, t.xreg
+  if reg and reg < 0 then reg = 0; vreg = t.vreg end
+  if xreg and xreg < 0 then xreg = 0; vxreg = t.vxreg end
+  if s < 0 then s = 0 end
+
+  -- Register mode.
+  if sub(t.mode, 1, 1) == "r" then
+    wputmodrm(3, s, reg)
+    if vsreg then waction("VREG", vsreg); wputxb(2) end
+    if vreg then waction("VREG", vreg); wputxb(0) end
+    return
+  end
+
+  local disp = t.disp
+  local tdisp = type(disp)
+  -- No base register?
+  if not reg then
+    if xreg then
+      -- Indexed mode with index register only.
+      -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
+      wputmodrm(0, s, 4)
+      if imark then waction("MARK") end
+      if vsreg then waction("VREG", vsreg); wputxb(2) end
+      wputmodrm(t.xsc, xreg, 5)
+      if vxreg then waction("VREG", vxreg); wputxb(3) end
+    else
+      -- Pure displacement.
+      wputmodrm(0, s, 5) -- [disp] -> (0, s, ebp)
+      if imark then waction("MARK") end
+      if vsreg then waction("VREG", vsreg); wputxb(2) end
+    end
+    wputdarg(disp)
+    return
+  end
+
+  -- m is the ModRM mod field: 0 = no disp, 1 = disp8, 2 = disp32,
+  -- nil = size only known at runtime (emit a DISP action below).
+  local m
+  if tdisp == "number" then -- Check displacement size at assembly time.
+    if disp == 0 and reg ~= 5 then -- [ebp] -> [ebp+0] (in SIB, too)
+      if not vreg then m = 0 end -- Force DISP to allow [Rd(5)] -> [ebp+0]
+    elseif disp >= -128 and disp <= 127 then m = 1
+    else m = 2 end
+  elseif tdisp == "table" then
+    m = 2
+  end
+
+  -- Index register present or esp as base register: need SIB encoding.
+  if xreg or reg == 4 then
+    wputmodrm(m or 2, s, 4) -- ModRM.
+    if m == nil or imark then waction("MARK") end
+    if vsreg then waction("VREG", vsreg); wputxb(2) end
+    wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
+    if vxreg then waction("VREG", vxreg); wputxb(3) end
+    if vreg then waction("VREG", vreg); wputxb(1) end
+  else
+    wputmodrm(m or 2, s, reg) -- ModRM.
+    if (imark and (m == 1 or m == 2)) or
+       (m == nil and (vsreg or vreg)) then waction("MARK") end
+    if vsreg then waction("VREG", vsreg); wputxb(2) end
+    if vreg then waction("VREG", vreg); wputxb(1) end
+  end
+
+  -- Put displacement.
+  if m == 1 then wputsbarg(disp)
+  elseif m == 2 then wputdarg(disp)
+  elseif m == nil then waction("DISP", disp) end
+end
+
+------------------------------------------------------------------------------
+
+-- Return human-readable operand mode string (for error messages).
+local function opmodestr(op, args)
+  local m = {}
+  for i=1,#args do
+    local a = args[i]
+    m[#m+1] = sub(a.mode, 1, 1)..(a.opsize or "?")
+  end
+  return op.." "..concat(m, ",")
+end
+
+-- Convert number to valid integer or nil.
+-- Accepts integers in the combined signed/unsigned 32 bit range
+-- (-2^31 .. 2^32-1); anything else in numeric form is an error.
+local function toint(expr)
+  local n = tonumber(expr)
+  if n then
+    if n % 1 ~= 0 or n < -2147483648 or n > 4294967295 then
+      werror("bad integer number `"..expr.."'")
+    end
+    return n
+  end
+end
+
+-- Parse immediate expression.
+-- Returns a mode string plus the translated value: "iPJ" for a pointer
+-- (&expr), "iJ" for label references (=>pc, ->global, <n/>n, extern)
+-- and "iI" for a plain immediate expression.
+local function immexpr(expr)
+  -- &expr (pointer)
+  if sub(expr, 1, 1) == "&" then
+    return "iPJ", format("(ptrdiff_t)(%s)", sub(expr,2))
+  end
+
+  local prefix = sub(expr, 1, 2)
+  -- =>expr (pc label reference)
+  if prefix == "=>" then
+    return "iJ", sub(expr, 3)
+  end
+  -- ->name (global label reference)
+  if prefix == "->" then
+    return "iJ", map_global[sub(expr, 3)]
+  end
+
+  -- [<>][1-9] (local label reference)
+  local dir, lnum = match(expr, "^([<>])([1-9])$")
+  if dir then -- Fwd: 247-255, Bkwd: 1-9.
+    return "iJ", lnum + (dir == ">" and 246 or 0)
+  end
+
+  local extname = match(expr, "^extern%s+(%S+)$")
+  if extname then
+    return "iJ", map_extern[extname]
+  end
+
+  -- expr (interpreted as immediate)
+  return "iI", expr
+end
+
+-- Parse displacement expression: +-num, +-expr, +-opsize*num
+-- Returns a number, a one-element table wrapping a label reference,
+-- or the original expression string (to be emitted as C code).
+local function dispexpr(expr)
+  local disp = expr == "" and 0 or toint(expr)
+  if disp then return disp end
+  local c, dispt = match(expr, "^([+-])%s*(.+)$")
+  if c == "+" then
+    expr = dispt
+  elseif not c then
+    werror("bad displacement expression `"..expr.."'")
+  end
+  local opsize, tailops = match(dispt, "^(%w+)%s*%*%s*(.+)$")
+  local ops, imm = map_opsize[opsize], toint(tailops)
+  if ops and imm then
+    if c == "-" then imm = -imm end
+    return imm*map_opsizenum[ops]
+  end
+  local mode, iexpr = immexpr(dispt)
+  if mode == "iJ" then
+    if c == "-" then werror("cannot invert label reference") end
+    return { iexpr }
+  end
+  return expr -- Need to return original signed expression.
+end
+
+-- Parse register or type expression.
+-- Returns internal register name, register number and, for typed
+-- expressions ("type" or "type:@reg"), the type table from map_type.
+-- For an unknown expression the number (and type) is nil.
+local function rtexpr(expr)
+  if not expr then return end
+  local tname, ovreg = match(expr, "^([%w_]+):(@[%w_]+)$")
+  local tp = map_type[tname or expr]
+  if tp then
+    local reg = ovreg or tp.reg
+    local rnum = map_reg_num[reg]
+    if not rnum then
+      werror("type `"..(tname or expr).."' needs a register override")
+    end
+    if not map_reg_valid_base[reg] then
+      werror("bad base register override `"..(map_reg_rev[reg] or reg).."'")
+    end
+    return reg, rnum, tp
+  end
+  return expr, map_reg_num[expr]
+end
+
+-- Parse operand and return { mode, opsize, reg, xreg, xsc, disp, imm }.
+-- mode is a match string for the map_op templates (e.g. "rm", "xmO",
+-- "iS"); reg/xreg are register numbers (-1 = variable register, with
+-- the C expression in vreg/vxreg); xsc is the index scale code; disp
+-- is a number, a label-reference table or a C displacement expression.
+-- The repeat ... until true block serves as a breakable scope.
+local function parseoperand(param)
+  local t = {}
+
+  local expr = param
+  local opsize, tailops = match(param, "^(%w+)%s*(.+)$")
+  if opsize then
+    t.opsize = map_opsize[opsize]
+    if t.opsize then expr = tailops end
+  end
+
+  local br = match(expr, "^%[%s*(.-)%s*%]$")
+  repeat
+    if br then
+      t.mode = "xm"
+
+      -- [disp]
+      t.disp = toint(br)
+      if t.disp then
+	t.mode = "xmO"
+	break
+      end
+
+      -- [reg...]
+      local tp
+      local reg, tailr = match(br, "^([@%w_:]+)%s*(.*)$")
+      reg, t.reg, tp = rtexpr(reg)
+      if not t.reg then
+	-- [expr]
+	t.mode = "xmO"
+	t.disp = dispexpr("+"..br)
+	break
+      end
+
+      if t.reg == -1 then
+	t.vreg, tailr = match(tailr, "^(%b())(.*)$")
+	if not t.vreg then werror("bad variable register expression") end
+      end
+
+      -- [xreg*xsc] or [xreg*xsc+-disp] or [xreg*xsc+-expr]
+      local xsc, tailsc = match(tailr, "^%*%s*([1248])%s*(.*)$")
+      if xsc then
+	if not map_reg_valid_index[reg] then
+	  werror("bad index register `"..map_reg_rev[reg].."'")
+	end
+	t.xsc = map_xsc[xsc]
+	t.xreg = t.reg
+	t.vxreg = t.vreg
+	t.reg = nil
+	t.vreg = nil
+	t.disp = dispexpr(tailsc)
+	break
+      end
+      if not map_reg_valid_base[reg] then
+	werror("bad base register `"..map_reg_rev[reg].."'")
+      end
+
+      -- [reg] or [reg+-disp]
+      t.disp = toint(tailr) or (tailr == "" and 0)
+      if t.disp then break end
+
+      -- [reg+xreg...]
+      local xreg, tailx = match(tailr, "^+%s*([@%w_:]+)%s*(.*)$")
+      xreg, t.xreg, tp = rtexpr(xreg)
+      if not t.xreg then
+	-- [reg+-expr]
+	t.disp = dispexpr(tailr)
+	break
+      end
+      if not map_reg_valid_index[xreg] then
+	werror("bad index register `"..map_reg_rev[xreg].."'")
+      end
+
+      if t.xreg == -1 then
+	t.vxreg, tailx = match(tailx, "^(%b())(.*)$")
+	if not t.vxreg then werror("bad variable register expression") end
+      end
+
+      -- [reg+xreg*xsc...]
+      local xsc, tailsc = match(tailx, "^%*%s*([1248])%s*(.*)$")
+      if xsc then
+	t.xsc = map_xsc[xsc]
+	tailx = tailsc
+      end
+
+      -- [...] or [...+-disp] or [...+-expr]
+      t.disp = dispexpr(tailx)
+    else
+      -- imm or opsize*imm
+      local imm = toint(expr)
+      if not imm and sub(expr, 1, 1) == "*" and t.opsize then
+	imm = toint(sub(expr, 2))
+	if imm then
+	  imm = imm * map_opsizenum[t.opsize]
+	  t.opsize = nil
+	end
+      end
+      if imm then
+	if t.opsize then werror("bad operand size override") end
+	local m = "i"
+	if imm == 1 then m = m.."1" end
+	if imm >= 4294967168 and imm <= 4294967295 then imm = imm-4294967296 end
+	if imm >= -128 and imm <= 127 then m = m.."S" end
+	t.imm = imm
+	t.mode = m
+	break
+      end
+
+      local tp
+      local reg, tailr = match(expr, "^([@%w_:]+)%s*(.*)$")
+      reg, t.reg, tp = rtexpr(reg)
+      if t.reg then
+	if t.reg == -1 then
+	  t.vreg, tailr = match(tailr, "^(%b())(.*)$")
+	  if not t.vreg then werror("bad variable register expression") end
+	end
+	-- reg
+	if tailr == "" then
+	  if t.opsize then werror("bad operand size override") end
+	  t.opsize = map_reg_opsize[reg]
+	  if t.opsize == "f" then
+	    t.mode = t.reg == 0 and "fF" or "f"
+	  else
+	    if reg == "@w4" then wwarn("bad idea, try again with `esp'") end
+	    t.mode = t.reg == 0 and "rmR" or (reg == "@b1" and "rmC" or "rm")
+	  end
+	  break
+	end
+
+	-- type[idx], type[idx].field, type->field -> [reg+offset_expr]
+	if not tp then werror("bad operand `"..param.."'") end
+	t.mode = "xm"
+	t.disp = format(tp.ctypefmt, tailr)
+      else
+	t.mode, t.imm = immexpr(expr)
+	if sub(t.mode, -1) == "J" then
+	  if t.opsize and t.opsize ~= addrsize then
+	    werror("bad operand size override")
+	  end
+	  t.opsize = addrsize
+	end
+      end
+    end
+  until true
+  return t
+end
+
+------------------------------------------------------------------------------
+-- x86 Template String Description
+-- ===============================
+--
+-- Each template string is a list of [match:]pattern pairs,
+-- separated by "|". The first match wins. No match means a
+-- bad or unsupported combination of operand modes or sizes.
+--
+-- The match part and the ":" is omitted if the operation has
+-- no operands. Otherwise the first N characters are matched
+-- against the mode strings of each of the N operands.
+--
+-- The mode string for each operand type is (see parseoperand()):
+--   Integer register: "rm", +"R" for eax, ax, al, +"C" for cl
+--   FP register:      "f",  +"F" for st0
+--   Index operand:    "xm", +"O" for [disp] (pure offset)
+--   Immediate:        "i",  +"S" for signed 8 bit, +"1" for 1,
+--                     +"I" for arg, +"P" for pointer
+--   Any:              +"J" for valid jump targets
+--
+-- So a match character "m" (mixed) matches both an integer register
+-- and an index operand (to be encoded with the ModRM/SIB scheme).
+-- But "r" matches only a register and "x" only an index operand
+-- (e.g. for FP memory access operations).
+--
+-- The operand size match string starts right after the mode match
+-- characters and ends before the ":". "dwb" is assumed, if empty.
+-- The effective data size of the operation is matched against this list.
+--
+-- If only the regular "b", "w", "d", "q", "t" operand sizes are
+-- present, then all operands must be the same size. Unspecified sizes
+-- are ignored, but at least one operand must have a size or the pattern
+-- won't match (use the "byte", "word", "dword", "qword", "tword"
+-- operand size overrides. E.g.: mov dword [eax], 1).
+--
+-- If the list has a "1" or "2" prefix, the operand size is taken
+-- from the respective operand and any other operand sizes are ignored.
+-- If the list contains only ".", all operand sizes are ignored.
+-- If the list has a "/" prefix, the concatenated (mixed) operand sizes
+-- are compared to the match.
+--
+-- E.g. "rrdw" matches for either two dword registers or two word
+-- registers. "Fx2dq" matches an st0 operand plus an index operand
+-- pointing to a dword (float) or qword (double).
+--
+-- Every character after the ":" is part of the pattern string:
+--   Hex chars are accumulated to form the opcode (left to right).
+--   "n"       disables the standard opcode mods
+--             (otherwise: -1 for "b", o16 prefix for "w")
+--   "r"/"R"   adds the reg. number from the 1st/2nd operand to the opcode.
+--   "m"/"M"   generates ModRM/SIB from the 1st/2nd operand.
+--             The spare 3 bits are either filled with the last hex digit or
+--             the result from a previous "r"/"R". The opcode is restored.
+--
+-- All of the following characters force a flush of the opcode:
+--   "o"/"O"   stores a pure 32 bit disp (offset) from the 1st/2nd operand.
+--   "S"       stores a signed 8 bit immediate from the last operand.
+--   "U"       stores an unsigned 8 bit immediate from the last operand.
+--   "W"       stores an unsigned 16 bit immediate from the last operand.
+--   "i"       stores an operand sized immediate from the last operand.
+--   "I"       dito, but generates an action code to optionally modify
+--             the opcode (+2) for a signed 8 bit immediate.
+--   "J"       generates one of the REL action codes from the last operand.
+--
+------------------------------------------------------------------------------
+
+-- Template strings for x86 instructions. Ordered by first opcode byte.
+-- Unimplemented opcodes (deliberate omissions) are marked with *.
+local map_op = {
+  -- 00-05: add...
+  -- 06: *push es
+  -- 07: *pop es
+  -- 08-0D: or...
+  -- 0E: *push cs
+  -- 0F: two byte opcode prefix
+  -- 10-15: adc...
+  -- 16: *push ss
+  -- 17: *pop ss
+  -- 18-1D: sbb...
+  -- 1E: *push ds
+  -- 1F: *pop ds
+  -- 20-25: and...
+  es_0 =	"26",
+  -- 27: *daa
+  -- 28-2D: sub...
+  cs_0 =	"2E",
+  -- 2F: *das
+  -- 30-35: xor...
+  ss_0 =	"36",
+  -- 37: *aaa
+  -- 38-3D: cmp...
+  ds_0 =	"3E",
+  -- 3F: *aas
+  inc_1 =	"rdw:40r|m:FF0m",
+  dec_1 =	"rdw:48r|m:FF1m",
+  push_1 =	"rdw:50r|mdw:FF6m|S.:6AS|ib:n6Ai|i.:68i",
+  pop_1 =	"rdw:58r|mdw:8F0m",
+  -- 60: *pusha, *pushad, *pushaw
+  -- 61: *popa, *popad, *popaw
+  -- 62: *bound rdw,x
+  -- 63: *arpl mw,rw
+  fs_0 =	"64",
+  gs_0 =	"65",
+  o16_0 =	"66",
+  a16_0 =	"67",
+  -- 68: push idw
+  -- 69: imul rdw,mdw,idw
+  -- 6A: push ib
+  -- 6B: imul rdw,mdw,S
+  -- 6C: *insb
+  -- 6D: *insd, *insw
+  -- 6E: *outsb
+  -- 6F: *outsd, *outsw
+  -- 70-7F: jcc lb
+  -- 80: add... mb,i
+  -- 81: add... mdw,i
+  -- 82: *undefined
+  -- 83: add... mdw,S
+  test_2 =	"mr:85Rm|rm:85rM|Ri:A9ri|mi:F70mi",
+  -- 86: xchg rb,mb
+  -- 87: xchg rdw,mdw
+  -- 88: mov mb,r
+  -- 89: mov mdw,r
+  -- 8A: mov r,mb
+  -- 8B: mov r,mdw
+  -- 8C: *mov mdw,seg
+  lea_2 =	"rxd:8DrM",
+  -- 8E: *mov seg,mdw
+  -- 8F: pop mdw
+  nop_0 =	"90",
+  xchg_2 =	"Rrdw:90R|rRdw:90r|rm:87rM|mr:87Rm",
+  cbw_0 =	"6698",
+  cwde_0 =	"98",
+  cwd_0 =	"6699",
+  cdq_0 =	"99",
+  -- 9A: *call iw:idw
+  wait_0 =	"9B",
+  fwait_0 =	"9B",
+  pushf_0 =	"9C",
+  pushfw_0 =	"669C",
+  pushfd_0 =	"9C",
+  popf_0 =	"9D",
+  popfw_0 =	"669D",
+  popfd_0 =	"9D",
+  sahf_0 =	"9E",
+  lahf_0 =	"9F",
+  mov_2 =	"OR:A3o|RO:A1O|mr:89Rm|rm:8BrM|rib:nB0ri|ridw:B8ri|mi:C70mi",
+  movsb_0 =	"A4",
+  movsw_0 =	"66A5",
+  movsd_0 =	"A5",
+  cmpsb_0 =	"A6",
+  cmpsw_0 =	"66A7",
+  cmpsd_0 =	"A7",
+  -- A8: test Rb,i
+  -- A9: test Rdw,i
+  stosb_0 =	"AA",
+  stosw_0 =	"66AB",
+  stosd_0 =	"AB",
+  lodsb_0 =	"AC",
+  lodsw_0 =	"66AD",
+  lodsd_0 =	"AD",
+  scasb_0 =	"AE",
+  scasw_0 =	"66AF",
+  scasd_0 =	"AF",
+  -- B0-B7: mov rb,i
+  -- B8-BF: mov rdw,i
+  -- C0: rol... mb,i
+  -- C1: rol... mdw,i
+  ret_1 =	"i.:nC2W",
+  ret_0 =	"C3",
+  -- C4: *les rdw,mq
+  -- C5: *lds rdw,mq
+  -- C6: mov mb,i
+  -- C7: mov mdw,i
+  -- C8: *enter iw,ib
+  leave_0 =	"C9",
+  -- CA: *retf iw
+  -- CB: *retf
+  int3_0 =	"CC",
+  int_1 =	"i.:nCDU",
+  into_0 =	"CE",
+  -- CF: *iret
+  -- D0: rol... mb,1
+  -- D1: rol... mdw,1
+  -- D2: rol... mb,cl
+  -- D3: rol... mb,cl
+  -- D4: *aam ib
+  -- D5: *aad ib
+  -- D6: *salc
+  -- D7: *xlat
+  -- D8-DF: floating point ops
+  -- E0: *loopne
+  -- E1: *loope
+  -- E2: *loop
+  -- E3: *jcxz, *jecxz
+  -- E4: *in Rb,ib
+  -- E5: *in Rdw,ib
+  -- E6: *out ib,Rb
+  -- E7: *out ib,Rdw
+  call_1 =	"md:FF2m|J.:E8J",
+  jmp_1 =	"md:FF4m|J.:E9J", -- short: EB
+  -- EA: *jmp iw:idw
+  -- EB: jmp ib
+  -- EC: *in Rb,dx
+  -- ED: *in Rdw,dx
+  -- EE: *out dx,Rb
+  -- EF: *out dx,Rdw
+  -- F0: *lock
+  int1_0 =	"F1",
+  repne_0 =	"F2",
+  repnz_0 =	"F2",
+  rep_0 =	"F3",
+  repe_0 =	"F3",
+  repz_0 =	"F3",
+  -- F4: *hlt
+  cmc_0 =	"F5",
+  -- F6: test... mb,i; div... mb
+  -- F7: test... mdw,i; div... mdw
+  clc_0 =	"F8",
+  stc_0 =	"F9",
+  -- FA: *cli
+  cld_0 =	"FC",
+  std_0 =	"FD",
+  -- FE: inc... mb
+  -- FF: inc... mdw
+
+  -- misc ops
+  not_1 =	"m:F72m",
+  neg_1 =	"m:F73m",
+  mul_1 =	"m:F74m",
+  imul_1 =	"m:F75m",
+  div_1 =	"m:F76m",
+  idiv_1 =	"m:F77m",
+
+  imul_2 =	"rmdw:0FAFrM|rIdw:69rmI|rSdw:6BrmS|ridw:69rmi",
+  imul_3 =	"rmIdw:69rMI|rmSdw:6BrMS|rmidw:69rMi",
+
+  movzx_2 =	"rm/db:0FB6rM|rm/wb:0FB6rM|rm/dw:0FB7rM",
+  movsx_2 =	"rm/db:0FBErM|rm/wb:0FBErM|rm/dw:0FBFrM",
+
+  bswap_1 =	"rd:0FC8r",
+  bsf_2 =	"rmdw:0FBCrM",
+  bsr_2 =	"rmdw:0FBDrM",
+  bt_2 =	"mrdw:0FA3Rm|midw:0FBA4mU",
+  btc_2 =	"mrdw:0FBBRm|midw:0FBA7mU",
+  btr_2 =	"mrdw:0FB3Rm|midw:0FBA6mU",
+  bts_2 =	"mrdw:0FABRm|midw:0FBA5mU",
+
+  rdtsc_0 =	"0F31", -- P1+
+  cpuid_0 =	"0FA2", -- P1+
+
+  -- floating point ops
+  fst_1 =	"ff:DDD0r|xd:D92m|xq:DD2m",
+  fstp_1 =	"ff:DDD8r|xd:D93m|xq:DD3m|xt:DB7m",
+  fld_1 =	"ff:D9C0r|xd:D90m|xq:DD0m|xt:DB5m",
+
+  fpop_0 =	"DDD8", -- Alias for fstp st0.
+
+  fist_1 =	"xw:nDF2m|xd:DB2m",
+  fistp_1 =	"xw:nDF3m|xd:DB3m|xq:DF7m",
+  fild_1 =	"xw:nDF0m|xd:DB0m|xq:DF5m",
+
+  fxch_0 =	"D9C9",
+  fxch_1 =	"ff:D9C8r",
+  fxch_2 =	"fFf:D9C8r|Fff:D9C8R",
+
+  fucom_1 =	"ff:DDE0r",
+  fucom_2 =	"Fff:DDE0R",
+  fucomp_1 =	"ff:DDE8r",
+  fucomp_2 =	"Fff:DDE8R",
+  fucomi_1 =	"ff:DBE8r", -- P6+
+  fucomi_2 =	"Fff:DBE8R", -- P6+
+  fucomip_1 =	"ff:DFE8r", -- P6+
+  fucomip_2 =	"Fff:DFE8R", -- P6+
+  fcomi_1 =	"ff:DBF0r", -- P6+
+  fcomi_2 =	"Fff:DBF0R", -- P6+
+  fcomip_1 =	"ff:DFF0r", -- P6+
+  fcomip_2 =	"Fff:DFF0R", -- P6+
+  fucompp_0 =	"DAE9",
+  fcompp_0 =	"DED9",
+
+  fldcw_1 =	"xw:nD95m",
+  fstcw_1 =	"xw:n9BD97m",
+  fnstcw_1 =	"xw:nD97m",
+  fstsw_1 =	"Rw:n9BDFE0|xw:n9BDD7m",
+  fnstsw_1 =	"Rw:nDFE0|xw:nDD7m",
+  fclex_0 =	"9BDBE2",
+  fnclex_0 =	"DBE2",
+
+  fnop_0 =	"D9D0",
+  -- D9D1-D9DF: unassigned
+
+  fchs_0 =	"D9E0",
+  fabs_0 =	"D9E1",
+  -- D9E2: unassigned
+  -- D9E3: unassigned
+  ftst_0 =	"D9E4",
+  fxam_0 =	"D9E5",
+  -- D9E6: unassigned
+  -- D9E7: unassigned
+  fld1_0 =	"D9E8",
+  fldl2t_0 =	"D9E9",
+  fldl2e_0 =	"D9EA",
+  fldpi_0 =	"D9EB",
+  fldlg2_0 =	"D9EC",
+  fldln2_0 =	"D9ED",
+  fldz_0 =	"D9EE",
+  -- D9EF: unassigned
+
+  f2xm1_0 =	"D9F0",
+  fyl2x_0 =	"D9F1",
+  fptan_0 =	"D9F2",
+  fpatan_0 =	"D9F3",
+  fxtract_0 =	"D9F4",
+  fprem1_0 =	"D9F5",
+  fdecstp_0 =	"D9F6",
+  fincstp_0 =	"D9F7",
+  fprem_0 =	"D9F8",
+  fyl2xp1_0 =	"D9F9",
+  fsqrt_0 =	"D9FA",
+  fsincos_0 =	"D9FB",
+  frndint_0 =	"D9FC",
+  fscale_0 =	"D9FD",
+  fsin_0 =	"D9FE",
+  fcos_0 =	"D9FF",
+
+  -- SSE, SSE2
+  andnpd_2 =	"rmo:660F55rM",
+  andnps_2 =	"rmo:0F55rM",
+  andpd_2 =	"rmo:660F54rM",
+  andps_2 =	"rmo:0F54rM",
+  clflush_1 =	"x.:0FAE7m",
+  cmppd_3 =	"rmio:660FC2rMU",
+  cmpps_3 =	"rmio:0FC2rMU",
+  cmpsd_3 =	"rmio:F20FC2rMU",
+  cmpss_3 =	"rmio:F30FC2rMU",
+  comisd_2 =	"rmo:660F2FrM",
+  comiss_2 =	"rmo:0F2FrM",
+  cvtdq2pd_2 =	"rro:F30FE6rM|rx/oq:",
+  cvtdq2ps_2 =	"rmo:0F5BrM",
+  cvtpd2dq_2 =	"rmo:F20FE6rM",
+  cvtpd2ps_2 =	"rmo:660F5ArM",
+  cvtpi2pd_2 =	"rx/oq:660F2ArM",
+  cvtpi2ps_2 =	"rx/oq:0F2ArM",
+  cvtps2dq_2 =	"rmo:660F5BrM",
+  cvtps2pd_2 =	"rro:0F5ArM|rx/oq:",
+  cvtsd2si_2 =	"rr/do:F20F2DrM|rx/dq:",
+  cvtsd2ss_2 =	"rro:F20F5ArM|rx/oq:",
+  cvtsi2sd_2 =	"rm/od:F20F2ArM",
+  cvtsi2ss_2 =	"rm/od:F30F2ArM",
+  cvtss2sd_2 =	"rro:F30F5ArM|rx/od:",
+  cvtss2si_2 =	"rr/do:F20F2CrM|rx/dd:",
+  cvttpd2dq_2 =	"rmo:660FE6rM",
+  cvttps2dq_2 =	"rmo:F30F5BrM",
+  cvttsd2si_2 =	"rr/do:F20F2CrM|rx/dq:",
+  cvttss2si_2 =	"rr/do:F30F2CrM|rx/dd:",
+  ldmxcsr_1 =	"xd:0FAE2m",
+  lfence_0 =	"0FAEE8",
+  maskmovdqu_2 = "rro:660FF7rM",
+  mfence_0 =	"0FAEF0",
+  movapd_2 =	"rmo:660F28rM|mro:660F29Rm",
+  movaps_2 =	"rmo:0F28rM|mro:0F29Rm",
+  movd_2 =	"rm/od:660F6ErM|mr/do:660F7ERm",
+  movdqa_2 =	"rmo:660F6FrM|mro:660F7FRm",
+  movdqu_2 =	"rmo:F30F6FrM|mro:F30F7FRm",
+  movhlps_2 =	"rro:0F12rM",
+  movhpd_2 =	"rx/oq:660F16rM|xr/qo:660F17Rm",
+  movhps_2 =	"rx/oq:0F16rM|xr/qo:0F17Rm",
+  movlhps_2 =	"rro:0F16rM",
+  movlpd_2 =	"rx/oq:660F12rM|xr/qo:660F13Rm",
+  movlps_2 =	"rx/oq:0F12rM|xr/qo:0F13Rm",
+  movmskpd_2 =	"rr/do:660F50rM",
+  movmskps_2 =	"rr/do:0F50rM",
+  movntdq_2 =	"xro:660FE7Rm",
+  movnti_2 =	"xrd:0FC3Rm",
+  movntpd_2 =	"xro:660F2BRm",
+  movntps_2 =	"xro:0F2BRm",
+  movq_2 =	"rro:F30F7ErM|rx/oq:|xr/qo:660FD6Rm",
+  movsd_2 =	"rro:F20F10rM|rx/oq:|xr/qo:F20F11Rm",
+  movss_2 =	"rro:F30F10rM|rx/od:|xr/do:F30F11Rm",
+  movupd_2 =	"rmo:660F10rM|mro:660F11Rm",
+  movups_2 =	"rmo:0F10rM|mro:0F11Rm",
+  orpd_2 =	"rmo:660F56rM",
+  orps_2 =	"rmo:0F56rM",
+  packssdw_2 =	"rmo:660F6BrM",
+  packsswb_2 =	"rmo:660F63rM",
+  packuswb_2 =	"rmo:660F67rM",
+  paddb_2 =	"rmo:660FFCrM",
+  paddd_2 =	"rmo:660FFErM",
+  paddq_2 =	"rmo:660FD4rM",
+  paddsb_2 =	"rmo:660FECrM",
+  paddsw_2 =	"rmo:660FEDrM",
+  paddusb_2 =	"rmo:660FDCrM",
+  paddusw_2 =	"rmo:660FDDrM",
+  paddw_2 =	"rmo:660FFDrM",
+  pand_2 =	"rmo:660FDBrM",
+  pandn_2 =	"rmo:660FDFrM",
+  pause_0 =	"F390",
+  pavgb_2 =	"rmo:660FE0rM",
+  pavgw_2 =	"rmo:660FE3rM",
+  pcmpeqb_2 =	"rmo:660F74rM",
+  pcmpeqd_2 =	"rmo:660F76rM",
+  pcmpeqw_2 =	"rmo:660F75rM",
+  pcmpgtb_2 =	"rmo:660F64rM",
+  pcmpgtd_2 =	"rmo:660F66rM",
+  pcmpgtw_2 =	"rmo:660F65rM",
+  pextrw_3 =	"rri/do:660FC5rMU|xri/wo:660F3A15nrMU", -- Mem op: SSE4.1 only.
+  pinsrw_3 =	"rri/od:660FC4rMU|rxi/ow:",
+  pmaddwd_2 =	"rmo:660FF5rM",
+  pmaxsw_2 =	"rmo:660FEErM",
+  pmaxub_2 =	"rmo:660FDErM",
+  pminsw_2 =	"rmo:660FEArM",
+  pminub_2 =	"rmo:660FDArM",
+  pmovmskb_2 =	"rr/do:660FD7rM",
+  pmulhuw_2 =	"rmo:660FE4rM",
+  pmulhw_2 =	"rmo:660FE5rM",
+  pmullw_2 =	"rmo:660FD5rM",
+  pmuludq_2 =	"rmo:660FF4rM",
+  por_2 =	"rmo:660FEBrM",
+  prefetchnta_1 = "xb:n0F180m",
+  prefetcht0_1 = "xb:n0F181m",
+  prefetcht1_1 = "xb:n0F182m",
+  prefetcht2_1 = "xb:n0F183m",
+  psadbw_2 =	"rmo:660FF6rM",
+  pshufd_3 =	"rmio:660F70rMU",
+  pshufhw_3 =	"rmio:F30F70rMU",
+  pshuflw_3 =	"rmio:F20F70rMU",
+  pslld_2 =	"rmo:660FF2rM|rio:660F726mU",
+  pslldq_2 =	"rio:660F737mU",
+  psllq_2 =	"rmo:660FF3rM|rio:660F736mU",
+  psllw_2 =	"rmo:660FF1rM|rio:660F716mU",
+  psrad_2 =	"rmo:660FE2rM|rio:660F724mU",
+  psraw_2 =	"rmo:660FE1rM|rio:660F714mU",
+  psrld_2 =	"rmo:660FD2rM|rio:660F722mU",
+  psrldq_2 =	"rio:660F733mU",
+  psrlq_2 =	"rmo:660FD3rM|rio:660F732mU",
+  psrlw_2 =	"rmo:660FD1rM|rio:660F712mU",
+  psubb_2 =	"rmo:660FF8rM",
+  psubd_2 =	"rmo:660FFArM",
+  psubq_2 =	"rmo:660FFBrM",
+  psubsb_2 =	"rmo:660FE8rM",
+  psubsw_2 =	"rmo:660FE9rM",
+  psubusb_2 =	"rmo:660FD8rM",
+  psubusw_2 =	"rmo:660FD9rM",
+  psubw_2 =	"rmo:660FF9rM",
+  punpckhbw_2 =	"rmo:660F68rM",
+  punpckhdq_2 =	"rmo:660F6ArM",
+  punpckhqdq_2 = "rmo:660F6DrM",
+  punpckhwd_2 =	"rmo:660F69rM",
+  punpcklbw_2 =	"rmo:660F60rM",
+  punpckldq_2 =	"rmo:660F62rM",
+  punpcklqdq_2 = "rmo:660F6CrM",
+  punpcklwd_2 =	"rmo:660F61rM",
+  pxor_2 =	"rmo:660FEFrM",
+  rcpps_2 =	"rmo:0F53rM",
+  rcpss_2 =	"rmo:F30F53rM",
+  rsqrtps_2 =	"rmo:0F52rM",
+  rsqrtss_2 =	"rmo:F30F52rM",
+  sfence_0 =	"0FAEF8",
+  shufpd_3 =	"rmio:660FC6rMU",
+  shufps_3 =	"rmio:0FC6rMU",
+  stmxcsr_1 =   "xd:0FAE3m",
+  ucomisd_2 =	"rmo:660F2ErM",
+  ucomiss_2 =	"rmo:0F2ErM",
+  unpckhpd_2 =	"rmo:660F15rM",
+  unpckhps_2 =	"rmo:0F15rM",
+  unpcklpd_2 =	"rmo:660F14rM",
+  unpcklps_2 =	"rmo:0F14rM",
+  xorpd_2 =	"rmo:660F57rM",
+  xorps_2 =	"rmo:0F57rM",
+
+  -- SSE3 ops
+  fisttp_1 =	"xw:nDF1m|xd:DB1m|xq:DD1m",
+  addsubpd_2 =	"rmo:660FD0rM",
+  addsubps_2 =	"rmo:F20FD0rM",
+  haddpd_2 =	"rmo:660F7CrM",
+  haddps_2 =	"rmo:F20F7CrM",
+  hsubpd_2 =	"rmo:660F7DrM",
+  hsubps_2 =	"rmo:F20F7DrM",
+  lddqu_2 =	"rxo:F20FF0rM",
+  movddup_2 =	"rmo:F20F12rM",
+  movshdup_2 =	"rmo:F30F16rM",
+  movsldup_2 =	"rmo:F30F12rM",
+
+  -- SSSE3 ops
+  pabsb_2 =	"rmo:660F381CrM",
+  pabsd_2 =	"rmo:660F381ErM",
+  pabsw_2 =	"rmo:660F381DrM",
+  palignr_3 =	"rmio:660F3A0FrMU",
+  phaddd_2 =	"rmo:660F3802rM",
+  phaddsw_2 =	"rmo:660F3803rM",
+  phaddw_2 =	"rmo:660F3801rM",
+  phsubd_2 =	"rmo:660F3806rM",
+  phsubsw_2 =	"rmo:660F3807rM",
+  phsubw_2 =	"rmo:660F3805rM",
+  pmaddubsw_2 =	"rmo:660F3804rM",
+  pmulhrsw_2 =	"rmo:660F380BrM",
+  pshufb_2 =	"rmo:660F3800rM",
+  psignb_2 =	"rmo:660F3808rM",
+  psignd_2 =	"rmo:660F380ArM",
+  psignw_2 =	"rmo:660F3809rM",
+
+  -- SSE4.1 ops
+  blendpd_3 =	"rmio:660F3A0DrMU",
+  blendps_3 =	"rmio:660F3A0CrMU",
+  blendvpd_3 =	"rmRo:660F3815rM",
+  blendvps_3 =	"rmRo:660F3814rM",
+  dppd_3 =	"rmio:660F3A41rMU",
+  dpps_3 =	"rmio:660F3A40rMU",
+  extractps_3 =	"mri/do:660F3A17RmU",
+  insertps_3 =	"rrio:660F3A41rMU|rxi/od:",
+  movntdqa_2 =	"rmo:660F382ArM",
+  mpsadbw_3 =	"rmio:660F3A42rMU",
+  packusdw_2 =	"rmo:660F382BrM",
+  pblendvb_3 =	"rmRo:660F3810rM",
+  pblendw_3 =	"rmio:660F3A0ErMU",
+  pcmpeqq_2 =	"rmo:660F3829rM",
+  pextrb_3 =	"rri/do:660F3A14nRmU|xri/bo:",
+  pextrd_3 =	"mri/do:660F3A16RmU",
+  -- x64: pextrq
+  -- pextrw is SSE2, mem operand is SSE4.1 only
+  phminposuw_2 = "rmo:660F3841rM",
+  pinsrb_3 =  "rri/od:660F3A20nrMU|rxi/ob:",
+  pinsrd_3 =  "rmi/od:660F3A22rMU",
+  -- x64: pinsrq
+  pmaxsb_2 =	"rmo:660F383CrM",
+  pmaxsd_2 =	"rmo:660F383DrM",
+  pmaxud_2 =	"rmo:660F383FrM",
+  pmaxuw_2 =	"rmo:660F383ErM",
+  pminsb_2 =	"rmo:660F3838rM",
+  pminsd_2 =	"rmo:660F3839rM",
+  pminud_2 =	"rmo:660F383BrM",
+  pminuw_2 =	"rmo:660F383ArM",
+  pmovsxbd_2 =	"rro:660F3821rM|rx/od:",
+  pmovsxbq_2 =	"rro:660F3822rM|rx/ow:",
+  pmovsxbw_2 =	"rro:660F3820rM|rx/oq:",
+  pmovsxdq_2 =	"rro:660F3825rM|rx/oq:",
+  pmovsxwd_2 =	"rro:660F3823rM|rx/oq:",
+  pmovsxwq_2 =	"rro:660F3824rM|rx/od:",
+  pmovzxbd_2 =	"rro:660F3831rM|rx/od:",
+  pmovzxbq_2 =	"rro:660F3832rM|rx/ow:",
+  pmovzxbw_2 =	"rro:660F3830rM|rx/oq:",
+  pmovzxdq_2 =	"rro:660F3835rM|rx/oq:",
+  pmovzxwd_2 =	"rro:660F3833rM|rx/oq:",
+  pmovzxwq_2 =	"rro:660F3834rM|rx/od:",
+  pmuldq_2 =	"rmo:660F3828rM",
+  pmulld_2 =	"rmo:660F3840rM",
+  ptest_2 =	"rmo:660F3817rM",
+  roundpd_3 =	"rmio:660F3A09rMU",
+  roundps_3 =	"rmio:660F3A08rMU",
+  roundsd_3 =	"rrio:660F3A0BrMU|rxi/oq:",
+  roundss_3 =	"rrio:660F3A0ArMU|rxi/od:",
+
+  -- SSE4.2 ops
+  crc32_2 =	"rmd:F20F38F1rM|rm/dw:66F20F38F1rM|rm/db:F20F38F0nrM",
+  pcmpestri_3 =	"rmio:660F3A61rMU",
+  pcmpestrm_3 =	"rmio:660F3A60rMU",
+  pcmpgtq_2 =	"rmo:660F3837rM",
+  pcmpistri_3 =	"rmio:660F3A63rMU",
+  pcmpistrm_3 =	"rmio:660F3A62rMU",
+  popcnt_2 =	"rmdw:F30FB8rM",
+
+  -- SSE4a
+  extrq_2 =	"rro:660F79rM",
+  extrq_3 =	"riio:660F780mUU",
+  insertq_2 =	"rro:F20F79rM",
+  insertq_4 =	"rriio:F20F78rMUU",
+  lzcnt_2 =	"rmdw:F30FBDrM",
+  movntsd_2 =	"xr/qo:F20F2BRm",
+  movntss_2 =	"xr/do:F30F2BRm",
+  -- popcnt is also in SSE4.2
+}
+
+------------------------------------------------------------------------------
+
+-- Arithmetic ops.
+-- The eight group-1 ALU ops share a regular encoding: n serves both as
+-- the ModRM /n digit of the 81/83 immediate forms and, times 8, as the
+-- base of the two-operand opcodes (n8+1 = r/m,reg; n8+3 = reg,r/m;
+-- n8+5 = short accumulator,imm form).
+for name,n in pairs{ add = 0, ["or"] = 1, adc = 2, sbb = 3,
+		     ["and"] = 4, sub = 5, xor = 6, cmp = 7 } do
+  local n8 = n * 8
+  map_op[name.."_2"] = format(
+    "mr:%02XRm|rm:%02XrM|mI1dw:81%XmI|mS1dw:83%XmS|Ri1dwb:%02Xri|mi1dwb:81%Xmi",
+    1+n8, 3+n8, n, n, 5+n8, n)
+end
+
+-- Shift ops.
+-- n is the ModRM /n digit of the group-2 shift/rotate instructions:
+-- D1 /n shifts by 1, D3 /n shifts by CL, C1 /n shifts by imm8.
+-- Note: sal is encoded identically to shl (both use /4).
+for name,n in pairs{ rol = 0, ror = 1, rcl = 2, rcr = 3,
+		     shl = 4, shr = 5,          sar = 7, sal = 4 } do
+  map_op[name.."_2"] = format("m1:D1%Xm|mC1dwb:D3%Xm|mi:C1%XmU", n, n, n)
+end
+
+-- Conditional ops.
+-- n is the 4-bit condition code merged into the opcode bases:
+-- 0F 8n for near jcc, 0F 9n /2 for setcc, 0F 4n for cmovcc.
+for cc,n in pairs(map_cc) do
+  map_op["j"..cc.."_1"] = format("J.:0F8%XJ", n) -- short: 7%X
+  map_op["set"..cc.."_1"] = format("mb:n0F9%X2m", n)
+  map_op["cmov"..cc.."_2"] = format("rmdw:0F4%XrM", n) -- P6+
+end
+
+-- FP arithmetic ops.
+-- nc is the ModRM byte of the st0,st(i) register form (0xC0 + n*8).
+-- nr is the base for the reversed-destination DC/DE forms: for the
+-- sub/subr and div/divr pairs the register-destination encodings swap
+-- their ModRM bases (hence the +-8 adjustment for n >= 4).
+for name,n in pairs{ add = 0, mul = 1, com = 2, comp = 3,
+		     sub = 4, subr = 5, div = 6, divr = 7 } do
+  local nc = 192 + n * 8
+  local nr = nc + (n < 4 and 0 or (n % 2 == 0 and 8 or -8))
+  local fn = "f"..name
+  map_op[fn.."_1"] = format("ff:D8%02Xr|xd:D8%Xm|xq:DC%Xm", nc, n, n)
+  -- fcom/fcomp have no st(i),st0 or pop-after-store variants.
+  if n == 2 or n == 3 then
+    map_op[fn.."_2"] = format("Fff:D8%02XR|Fx2d:D8%XM|Fx2q:DC%XM", nc, n, n)
+  else
+    map_op[fn.."_2"] = format("Fff:D8%02XR|fFf:DC%02Xr|Fx2d:D8%XM|Fx2q:DC%XM", nc, nr, n, n)
+    map_op[fn.."p_1"] = format("ff:DE%02Xr", nr)
+    map_op[fn.."p_2"] = format("fFf:DE%02Xr", nr)
+  end
+  -- Integer-memory variants (fiadd etc.): DA /n for dword, DE /n for word.
+  map_op["fi"..name.."_1"] = format("xd:DA%Xm|xw:nDE%Xm", n, n)
+end
+
+-- FP conditional moves.
+-- 56000 = 0xDAC0, the encoding of fcmovb st0,st0. The low two condition
+-- bits step the ModRM base in units of 8 within a row; the negated
+-- conditions (n >= 4) move up one opcode row to 0xDBC0 (+(n-n4)*64).
+for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
+  local n4 = n % 4
+  local nc = 56000 + n4 * 8 + (n-n4) * 64
+  map_op["fcmov"..cc.."_1"] = format("ff:%04Xr", nc) -- P6+
+  map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
+end
+
+-- SSE FP arithmetic ops.
+-- n is the low nibble of the 0F 5x opcode; the ps/ss/pd/sd variants
+-- differ only in their mandatory prefix (none/F3/66/F2).
+for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
+		     sub = 12, min = 13, div = 14, max = 15 } do
+  map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
+  map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
+  map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
+  map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
+end
+
+------------------------------------------------------------------------------
+
+-- Process pattern string.
+-- A pattern is a string of hex digits (accumulated into the pending
+-- opcode) interleaved with single-letter commands:
+--   n    disable operand-size adjustments for the pending opcode
+--   r/R  merge regno of 1st/2nd operand into the low opcode nibble
+--   m/M  flush opcode, then emit ModRM/SIB for the 1st/2nd operand,
+--        using the merged regno (or the last hex digit) as the /r field
+--   o/O  emit pure 32 bit displacement of the 1st/2nd operand
+--   S/U/W  emit signed byte / unsigned byte / word immediate
+--   i/I  emit an immediate of the current operand size (I may defer to
+--        an action for imm8-vs-imm32 selection)
+--   J    emit a relative jump target (action or label reference)
+local function dopattern(pat, args, sz, op)
+  local digit, addin
+  local opcode = 0
+  local szov = sz
+  local narg = 1
+
+  -- Limit number of section buffer positions used by a single dasm_put().
+  -- A single opcode needs a maximum of 2 positions. !x64
+  if secpos+2 > maxsecpos then wflush() end
+
+  -- Process each character.
+  for c in gmatch(pat.."|", ".") do
+    if match(c, "%x") then	-- Hex digit.
+      digit = byte(c) - 48
+      if digit > 48 then digit = digit - 39
+      elseif digit > 16 then digit = digit - 7 end
+      opcode = opcode*16 + digit
+      addin = nil
+    elseif c == "n" then	-- Disable operand size mods for opcode.
+      szov = nil
+    elseif c == "r" then	-- Merge 1st operand regno. into opcode.
+      addin = args[1]; opcode = opcode + addin.reg
+      if narg < 2 then narg = 2 end
+    elseif c == "R" then	-- Merge 2nd operand regno. into opcode.
+      addin = args[2]; opcode = opcode + addin.reg
+      narg = 3
+    elseif c == "m" or c == "M" then	-- Encode ModRM/SIB.
+      local s
+      if addin then
+	s = addin.reg
+	opcode = opcode - s	-- Undo regno opcode merge.
+      else
+	s = opcode % 16		-- Undo last digit.
+	opcode = (opcode - s) / 16
+      end
+      wputop(szov, opcode); opcode = nil
+      local imark = (sub(pat, -1) == "I") -- Force a mark (ugly).
+      -- Put ModRM/SIB with regno/last digit as spare.
+      local nn = c == "m" and 1 or 2
+      wputmrmsib(args[nn], imark, s, addin and addin.vreg)
+      if narg <= nn then narg = nn + 1 end
+      addin = nil
+    else
+      if opcode then -- Flush opcode.
+	if addin and addin.reg == -1 then
+	  -- reg == -1 marks a variable register; emit a VREG action.
+	  wputop(szov, opcode + 1)
+	  waction("VREG", addin.vreg); wputxb(0)
+	else
+	  wputop(szov, opcode)
+	end
+	opcode = nil
+      end
+      if c == "|" then break end
+      if c == "o" then -- Offset (pure 32 bit displacement).
+	wputdarg(args[1].disp); if narg < 2 then narg = 2 end
+      elseif c == "O" then
+	wputdarg(args[2].disp); narg = 3
+      else
+	-- Anything else is an immediate operand.
+	local a = args[narg]
+	narg = narg + 1
+	local mode, imm = a.mode, a.imm
+	if mode == "iJ" and not match("iIJ", c) then
+	  werror("bad operand size for label")
+	end
+	if c == "S" then
+	  wputsbarg(imm)
+	elseif c == "U" then
+	  wputbarg(imm)
+	elseif c == "W" then
+	  wputwarg(imm)
+	elseif c == "i" or c == "I" then
+	  if mode == "iJ" then
+	    wputlabel("IMM_", imm, 1)
+	  elseif mode == "iI" and c == "I" then
+	    waction(sz == "w" and "IMM_WB" or "IMM_DB", imm)
+	  else
+	    wputszarg(sz, imm)
+	  end
+	elseif c == "J" then
+	  if mode == "iPJ" then
+	    waction("REL_A", imm) -- !x64 (secpos)
+	  else
+	    wputlabel("REL_", imm, 2)
+	  end
+	else
+	  werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
+	end
+      end
+    end
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Mapping of operand modes to short names. Suppress output with '#'.
+-- Used by templatehelp() to render human-readable operand-mode listings.
+local map_modename = {
+  r = "reg", R = "eax", C = "cl", x = "mem", m = "mrm", i = "imm",
+  f = "stx", F = "st0", J = "lbl", ["1"] = "1",
+  I = "#", S = "#", O = "#",
+}
+
+-- Return a table of strings listing all possible operand mode
+-- combinations of a template (or "" for a zero-operand opcode).
+-- Variants containing a '#' (suppressed modes) are dropped.
+local function templatehelp(template, nparams)
+  if nparams == 0 then return "" end
+  local out = {}
+  for tm in gmatch(template, "[^%|]+") do
+    local s = map_modename[sub(tm, 1, 1)]
+    for c in gmatch(sub(tm, 2, nparams), ".") do
+      s = s..", "..map_modename[c]
+    end
+    if not match(s, "#") then out[#out+1] = s end
+  end
+  return out
+end
+
+-- Match operand modes against the mode match part of a template.
+-- Each operand's mode string must match the character class at the
+-- corresponding position of tm. Returns true on success, nothing on
+-- failure.
+local function matchtm(tm, args)
+  local nargs = #args
+  local i = 1
+  while i <= nargs do
+    if not match(args[i].mode, sub(tm, i, i)) then return end
+    i = i + 1
+  end
+  return true
+end
+
+-- Handle opcodes defined with template strings.
+-- Parses all operands, determines a common operand size (or flags a
+-- mix), then tries each '|'-separated "modes+sizematch:pattern"
+-- alternative in order. An empty pattern reuses the previous
+-- alternative's pattern. A size match starting with '/' checks both
+-- operand sizes explicitly; a '1'/'2' prefix takes the size from the
+-- 1st/2nd operand; '.' matches any size; "" defaults to "dwb".
+map_op[".template__"] = function(params, template, nparams)
+  if not params then return templatehelp(template, nparams) end
+  local args = {}
+
+  -- Zero-operand opcodes have no match part.
+  if #params == 0 then
+    dopattern(template, args, "d", params.op)
+    return
+  end
+
+  -- Determine common operand size (coerce undefined size) or flag as mixed.
+  local sz, szmix
+  for i,p in ipairs(params) do
+    args[i] = parseoperand(p)
+    local nsz = args[i].opsize
+    if nsz then
+      if sz and sz ~= nsz then szmix = true else sz = nsz end
+    end
+  end
+
+  -- Try all match:pattern pairs (separated by '|').
+  local gotmatch, lastpat
+  for tm in gmatch(template, "[^%|]+") do
+    -- Split off size match (starts after mode match) and pattern string.
+    local szm, pat = match(tm, "^(.-):(.*)$", #args+1)
+    if pat == "" then pat = lastpat else lastpat = pat end
+    if matchtm(tm, args) then
+      local prefix = sub(szm, 1, 1)
+      if prefix == "/" then -- Match both operand sizes.
+	if args[1].opsize == sub(szm, 2, 2) and
+	   args[2].opsize == sub(szm, 3, 3) then
+	  dopattern(pat, args, sz, params.op) -- Process pattern string.
+	  return
+	end
+      else -- Match common operand size.
+	local szp = sz
+	if szm == "" then szm = "dwb" end -- Default size match.
+	if prefix == "1" then szp = args[1].opsize; szmix = nil
+	elseif prefix == "2" then szp = args[2].opsize; szmix = nil end
+	if not szmix and (prefix == "." or match(szm, szp or "#")) then
+	  dopattern(pat, args, szp, params.op) -- Process pattern string.
+	  return
+	end
+      end
+      gotmatch = true
+    end
+  end
+
+  -- No alternative applied: report the most specific failure reason.
+  local msg = "bad operand mode"
+  if gotmatch then
+    if szmix then
+      msg = "mixed operand size"
+    else
+      msg = sz and "bad operand size" or "missing operand size"
+    end
+  end
+
+  werror(msg.." in `"..opmodestr(params.op, args).."'")
+end
+
+------------------------------------------------------------------------------
+
+-- Pseudo-opcodes for data storage.
+-- The opcode's second letter selects the element size (.byte/.sbyte/
+-- .word/.dword; 'a' means address size). Every parameter must be an
+-- immediate of that size; label references are emitted as IMM_ actions.
+local function op_data(params)
+  if not params then return "imm..." end
+  local sz = sub(params.op, 2, 2)
+  if sz == "a" then sz = addrsize end
+  for _,p in ipairs(params) do
+    local a = parseoperand(p)
+    if sub(a.mode, 1, 1) ~= "i" or (a.opsize and a.opsize ~= sz) then
+      werror("bad mode or size in `"..p.."'")
+    end
+    if a.mode == "iJ" then
+      wputlabel("IMM_", a.imm, 1)
+    else
+      wputszarg(sz, a.imm)
+    end
+  end
+end
+
+map_op[".byte_*"] = op_data
+map_op[".sbyte_*"] = op_data
+map_op[".word_*"] = op_data
+map_op[".dword_*"] = op_data
+map_op[".aword_*"] = op_data
+
+------------------------------------------------------------------------------
+
+-- The following four pseudo-opcodes buffer a callback via wline(); the
+-- actual C source is generated when the output is dumped, after all
+-- processing is done, so the collected data is complete by then.
+-- Pseudo-opcode to mark the position where the action list is to be emitted.
+map_op[".actionlist_1"] = function(params)
+  if not params then return "cvar" end
+  local name = params[1] -- No syntax check. You get to keep the pieces.
+  wline(function(out) writeactions(out, name) end)
+end
+
+-- Pseudo-opcode to mark the position where the global enum is to be emitted.
+map_op[".globals_1"] = function(params)
+  if not params then return "prefix" end
+  local prefix = params[1] -- No syntax check. You get to keep the pieces.
+  wline(function(out) writeglobals(out, prefix) end)
+end
+
+-- Pseudo-opcode to mark the position where the global names are to be emitted.
+map_op[".globalnames_1"] = function(params)
+  if not params then return "cvar" end
+  local name = params[1] -- No syntax check. You get to keep the pieces.
+  wline(function(out) writeglobalnames(out, name) end)
+end
+
+-- Pseudo-opcode to mark the position where the extern names are to be emitted.
+map_op[".externnames_1"] = function(params)
+  if not params then return "cvar" end
+  local name = params[1] -- No syntax check. You get to keep the pieces.
+  wline(function(out) writeexternnames(out, name) end)
+end
+
+------------------------------------------------------------------------------
+
+-- Label pseudo-opcode (converted from trailing colon form).
+-- A numeric immediate selects a local (1..9) or global (->name) label,
+-- emitted as a LABEL_LG action; any other "iJ" operand is a computed PC
+-- label (=>pcexpr), emitted as LABEL_PC. The optional second parameter
+-- assigns an address to the label via a SETLABEL action.
+map_op[".label_2"] = function(params)
+  if not params then return "[1-9] | ->global | =>pcexpr  [, addr]" end
+  local a = parseoperand(params[1])
+  local mode, imm = a.mode, a.imm
+  if type(imm) == "number" and (mode == "iJ" or (imm >= 1 and imm <= 9)) then
+    -- Local label (1: ... 9:) or global label (->global:).
+    waction("LABEL_LG", nil, 1)
+    wputxb(imm)
+  elseif mode == "iJ" then
+    -- PC label (=>pcexpr:).
+    waction("LABEL_PC", imm)
+  else
+    werror("bad label definition")
+  end
+  -- SETLABEL must immediately follow LABEL_LG/LABEL_PC.
+  local addr = params[2]
+  if addr then
+    local a = parseoperand(params[2])
+    if a.mode == "iPJ" then
+      waction("SETLABEL", a.imm) -- !x64 (secpos)
+    else
+      werror("bad label assignment")
+    end
+  end
+end
+map_op[".label_1"] = map_op[".label_2"]
+
+------------------------------------------------------------------------------
+
+-- Alignment pseudo-opcode.
+-- Accepts a number or an operand-size name; only powers of 2 in the
+-- range (2 ... 256) are legal alignments.
+map_op[".align_1"] = function(params)
+  if not params then return "numpow2" end
+  local align = tonumber(params[1]) or map_opsizenum[map_opsize[params[1]]]
+  if align == 2 or align == 4 or align == 8 or align == 16 or
+     align == 32 or align == 64 or align == 128 or align == 256 then
+    waction("ALIGN", nil, 1)
+    wputxb(align-1) -- Action byte is 2**n-1.
+    return
+  end
+  werror("bad alignment")
+end
+
+-- Spacing pseudo-opcode.
+-- Emits a SPACE action for `num` bytes, followed by the filler byte
+-- (default 0). The filler must be a number in the range 0..255.
+map_op[".space_2"] = function(params)
+  if not params then return "num [, filler]" end
+  waction("SPACE", params[1])
+  local fill = 0
+  if params[2] then
+    local n = tonumber(params[2])
+    if not n or n < 0 or n > 255 then werror("bad filler") end
+    fill = n
+  end
+  wputxb(fill)
+end
+map_op[".space_1"] = map_op[".space_2"]
+
+------------------------------------------------------------------------------
+
+-- Pseudo-opcode for (primitive) type definitions (map to C types).
+-- Registers `name` with C type `ctype` and an optional default base
+-- register. Emits a "#define DtN(_V)" giving the byte offset of field
+-- _V within ctype, and makes "#name" expand to sizeof(ctype).
+map_op[".type_3"] = function(params, nparams)
+  if not params then
+    return nparams == 2 and "name, ctype" or "name, ctype, reg"
+  end
+  local name, ctype, reg = params[1], params[2], params[3]
+  if not match(name, "^[%a_][%w_]*$") then
+    werror("bad type name `"..name.."'")
+  end
+  local tp = map_type[name]
+  if tp then
+    werror("duplicate type `"..name.."'")
+  end
+  if reg and not map_reg_valid_base[reg] then
+    werror("bad base register `"..(map_reg_rev[reg] or reg).."'")
+  end
+  -- Add #type to defines. A bit unclean to put it in map_archdef.
+  map_archdef["#"..name] = "sizeof("..ctype..")"
+  -- Add new type and emit shortcut define.
+  local num = ctypenum + 1
+  map_type[name] = {
+    ctype = ctype,
+    ctypefmt = format("Dt%X(%%s)", num),
+    reg = reg,
+  }
+  wline(format("#define Dt%X(_V) (int)(ptrdiff_t)&(((%s *)0)_V)", num, ctype))
+  ctypenum = num
+end
+map_op[".type_2"] = map_op[".type_3"]
+
+-- Write a sorted listing of all user-defined types to `out`.
+local function dumptypes(out, lvl)
+  local names = {}
+  for name in pairs(map_type) do names[#names+1] = name end
+  sort(names)
+  out:write("Type definitions:\n")
+  for _,name in ipairs(names) do
+    local tp = map_type[name]
+    local regname = tp.reg and map_reg_rev[tp.reg] or ""
+    out:write(format("  %-20s %-20s %s\n", name, tp.ctype, regname))
+  end
+  out:write("\n")
+end
+
+------------------------------------------------------------------------------
+
+-- Set the current section.
+-- Emits a SECTION action followed by the section number byte.
+function _M.section(num)
+  waction("SECTION")
+  wputxb(num)
+  wflush(true) -- SECTION is a terminal action.
+end
+
+------------------------------------------------------------------------------
+
+-- Dump architecture description (banner, registers, action names) to `out`.
+function _M.dumparch(out)
+  out:write(format("DynASM %s version %s, released %s\n\n",
+    _info.arch, _info.version, _info.release))
+  dumpregs(out)
+  dumpactions(out)
+end
+
+-- Dump all user defined elements (types, globals, externs) to `out`.
+-- `lvl` is the verbosity level passed through to the dumpers.
+function _M.dumpdef(out, lvl)
+  dumptypes(out, lvl)
+  dumpglobals(out, lvl)
+  dumpexterns(out, lvl)
+end
+
+------------------------------------------------------------------------------
+
+-- Pass callbacks from/to the DynASM core.
+-- Receives the core's writer/error functions and returns this module's
+-- action flush function for the core to call.
+function _M.passcb(wl, we, wf, ww)
+  wline, werror, wfatal, wwarn = wl, we, wf, ww
+  return wflush
+end
+
+-- Setup the arch-specific module with the selected arch name and options.
+function _M.setup(arch, opt)
+  g_arch, g_opt = arch, opt
+end
+
+-- Merge the core maps and the arch-specific maps.
+-- Lookups fall back from arch opcodes/defines to the core maps.
+function _M.mergemaps(map_coreop, map_def)
+  setmetatable(map_op, { __index = map_coreop })
+  setmetatable(map_def, { __index = map_archdef })
+  return map_op, map_def
+end
+
+return _M
+
+------------------------------------------------------------------------------
+

+ 1070 - 0
dynasm/dynasm.lua

@@ -0,0 +1,1070 @@
+------------------------------------------------------------------------------
+-- DynASM. A dynamic assembler for code generation engines.
+-- Originally designed and implemented for LuaJIT.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- See below for full copyright notice.
+------------------------------------------------------------------------------
+
+-- Application information.
+local _info = {
+  name =	"DynASM",
+  description =	"A dynamic assembler for code generation engines",
+  version =	"1.2.1",
+  vernum =	 10201,
+  release =	"2009-04-16",
+  author =	"Mike Pall",
+  url =		"http://luajit.org/dynasm.html",
+  license =	"MIT",
+  copyright =	[[
+Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+[ MIT license: http://www.opensource.org/licenses/mit-license.php ]
+]],
+}
+
+-- Cache library functions.
+local type, pairs, ipairs = type, pairs, ipairs
+local pcall, error, assert = pcall, error, assert
+local _s = string
+local sub, match, gmatch, gsub = _s.sub, _s.match, _s.gmatch, _s.gsub
+local format, rep, upper = _s.format, _s.rep, _s.upper
+local _t = table
+local insert, remove, concat, sort = _t.insert, _t.remove, _t.concat, _t.sort
+local exit = os.exit
+local io = io
+local stdin, stdout, stderr = io.stdin, io.stdout, io.stderr
+
+------------------------------------------------------------------------------
+
+-- Program options.
+local g_opt = {}
+
+-- Global state for current file.
+local g_fname, g_curline, g_indent, g_lineno, g_synclineno, g_arch
+local g_errcount = 0
+
+-- Write buffer for output file.
+local g_wbuffer, g_capbuffer
+
+------------------------------------------------------------------------------
+
+-- Write an output line (or callback function) to the buffer.
+-- Lines go to the capture buffer while one is active, otherwise to the
+-- main write buffer. needindent prepends the current indent string.
+local function wline(line, needindent)
+  if needindent then line = g_indent..line end
+  local buf = g_capbuffer
+  if not buf then buf = g_wbuffer end
+  buf[#buf+1] = line
+  g_synclineno = g_synclineno + 1
+end
+
+-- Write assembler line as a comment, if requested (comment option set).
+local function wcomment(aline)
+  if g_opt.comment then
+    wline(g_opt.comment..aline..g_opt.endcomment, true)
+  end
+end
+
+-- Resync CPP line numbers.
+-- Emits a `# <line> "<file>"` marker when output and input line numbers
+-- have drifted apart. Only active when the cpp option is set.
+local function wsync()
+  if g_synclineno ~= g_lineno and g_opt.cpp then
+    wline("# "..g_lineno..' "'..g_fname..'"')
+    g_synclineno = g_lineno
+  end
+end
+
+-- Dummy action flush function. Replaced with arch-specific function later.
+-- `term` is truthy for terminal actions.
+local function wflush(term)
+end
+
+-- Dump all buffered output lines to `out`.
+-- String entries are written verbatim; function entries are callbacks
+-- invoked with the output file, used to insert dynamically generated
+-- lines after processing has finished.
+local function wdumplines(out, buf)
+  for _,entry in ipairs(buf) do
+    if type(entry) == "string" then
+      assert(out:write(entry, "\n"))
+    else
+      entry(out) -- Deferred writer callback.
+    end
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Emit an error. Processing continues with next statement.
+-- Raises a Lua error with file/line context and the offending source
+-- line; the main loop catches it and keeps going.
+local function werror(msg)
+  error(format("%s:%s: error: %s:\n%s", g_fname, g_lineno, msg, g_curline), 0)
+end
+
+-- Emit a fatal error. Processing stops.
+-- Marking g_errcount as "fatal" makes wprinterr() abort further work.
+local function wfatal(msg)
+  g_errcount = "fatal"
+  werror(msg)
+end
+
+-- Print a warning to stderr. Processing continues.
+local function wwarn(msg)
+  stderr:write(format("%s:%s: warning: %s:\n%s\n",
+    g_fname, g_lineno, msg, g_curline))
+end
+
+-- Print caught error message. But suppress excessive errors.
+-- Returns true (stop processing) after a fatal error; otherwise counts
+-- errors and mutes output after the 20th.
+local function wprinterr(...)
+  if type(g_errcount) == "number" then
+    -- Regular error.
+    g_errcount = g_errcount + 1
+    if g_errcount < 21 then -- Seems to be a reasonable limit.
+      stderr:write(...)
+    elseif g_errcount == 21 then
+      stderr:write(g_fname,
+	":*: warning: too many errors (suppressed further messages).\n")
+    end
+  else
+    -- Fatal error.
+    stderr:write(...)
+    return true -- Stop processing.
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Map holding all option handlers.
+local opt_map = {}
+local opt_current
+
+-- Print a command line error to stderr and exit with error status.
+local function opterror(...)
+  stderr:write("dynasm.lua: ERROR: ")
+  stderr:write(...)
+  stderr:write("\n")
+  exit(1)
+end
+
+-- Get option parameter.
+-- Consumes and returns the next command line argument; bails out with
+-- an option error if it is missing.
+local function optparam(args)
+  local idx = args.argn
+  local param = args[idx]
+  if param == nil then
+    opterror("missing parameter for option `", opt_current, "'.")
+  end
+  args.argn = idx + 1
+  return param
+end
+
+------------------------------------------------------------------------------
+
+-- Core pseudo-opcodes.
+local map_coreop = {}
+-- Dummy opcode map. Replaced by arch-specific map.
+local map_op = {}
+
+-- Forward declarations.
+local dostmt
+local readfile
+
+------------------------------------------------------------------------------
+
+-- Map for defines (initially empty, chains to arch-specific map).
+local map_def = {}
+
+-- Pseudo-opcode to define a substitution.
+-- NOTE: only the name syntax is validated here; redefining an existing
+-- name silently overwrites it despite the error message's wording.
+map_coreop[".define_2"] = function(params, nparams)
+  if not params then return nparams == 1 and "name" or "name, subst" end
+  local name, def = params[1], params[2] or "1"
+  if not match(name, "^[%a_][%w_]*$") then werror("bad or duplicate define") end
+  map_def[name] = def
+end
+map_coreop[".define_1"] = map_coreop[".define_2"]
+map_coreop[".define_1"] = map_coreop[".define_2"]
+
+-- Define a substitution on the command line (-D name or -D name=subst).
+-- A bare name defines it as "1".
+function opt_map.D(args)
+  local arg = optparam(args)
+  local name, subst = match(arg, "^([%a_][%w_]*)=(.*)$")
+  if name ~= nil then
+    map_def[name] = subst
+  elseif match(arg, "^[%a_][%w_]*$") then
+    map_def[arg] = "1"
+  else
+    opterror("bad define")
+  end
+end
+
+-- Undefine a substitution on the command line (-U name).
+function opt_map.U(args)
+  local name = optparam(args)
+  if not match(name, "^[%a_][%w_]*$") then
+    opterror("bad define")
+  else
+    map_def[name] = nil
+  end
+end
+
+-- Helper for definesubst. Records the last substituted word so that a
+-- non-terminating substitution chain can be reported.
+local gotsubst
+
+local function definesubst_one(word)
+  local subst = map_def[word]
+  if subst then gotsubst = word; return subst else return word end
+end
+
+-- Iteratively substitute defines.
+-- The pattern's optional leading '#' also substitutes sizeof-style
+-- shortcuts (e.g. "#name" keys added by the arch module) — verify
+-- against the arch-specific map.
+local function definesubst(stmt)
+  -- Limit number of iterations.
+  for i=1,100 do
+    gotsubst = false
+    stmt = gsub(stmt, "#?[%w_]+", definesubst_one)
+    if not gotsubst then break end
+  end
+  if gotsubst then wfatal("recursive define involving `"..gotsubst.."'") end
+  return stmt
+end
+
+-- Write a sorted listing of all active defines to `out`.
+local function dumpdefines(out, lvl)
+  local names = {}
+  for name in pairs(map_def) do names[#names+1] = name end
+  sort(names)
+  out:write("Defines:\n")
+  for _,name in ipairs(names) do
+    local subst = map_def[name]
+    -- Let the arch module map substitutions back to a readable form.
+    if g_arch then subst = g_arch.revdef(subst) end
+    out:write(format("  %-20s %s\n", name, subst))
+  end
+  out:write("\n")
+end
+
+------------------------------------------------------------------------------
+
+-- Support variables for conditional assembly.
+local condlevel = 0
+local condstack = {}
+
+-- Evaluate condition with a Lua expression. Substitutions already performed.
+-- Compiles "return <cond>" with an empty environment, so all unknown
+-- identifiers evaluate to nil. A numeric result of 0 is treated as
+-- false to mimic C preprocessor semantics. Compile or runtime errors
+-- are fatal.
+local function cond_eval(cond)
+  local func, err = loadstring("return "..cond)
+  if func then
+    setfenv(func, {}) -- No globals. All unknown identifiers evaluate to nil.
+    local ok, res = pcall(func)
+    if ok then
+      if res == 0 then return false end -- Oh well.
+      return not not res
+    end
+    err = res
+  end
+  wfatal("bad condition: "..err)
+end
+
+-- Skip statements until next conditional pseudo-opcode at the same level.
+-- Temporarily replaces dostmt with a filter that discards everything,
+-- while tracking nested .if/.endif pairs; the original dostmt is
+-- restored (and re-invoked) on the matching .elif/.else/.endif.
+local function stmtskip()
+  local dostmt_save = dostmt
+  local lvl = 0
+  dostmt = function(stmt)
+    local op = match(stmt, "^%s*(%S+)")
+    if op == ".if" then
+      lvl = lvl + 1
+    elseif lvl ~= 0 then
+      if op == ".endif" then lvl = lvl - 1 end
+    elseif op == ".elif" or op == ".else" or op == ".endif" then
+      dostmt = dostmt_save
+      dostmt(stmt)
+    end
+  end
+end
+
+-- Pseudo-opcodes for conditional assembly.
+map_coreop[".if_1"] = function(params)
+  if not params then return "condition" end
+  local lvl = condlevel + 1
+  local res = cond_eval(params[1])
+  condlevel = lvl
+  condstack[lvl] = res
+  if not res then stmtskip() end
+end
+
+map_coreop[".elif_1"] = function(params)
+  if not params then return "condition" end
+  if condlevel == 0 then wfatal(".elif without .if") end
+  local lvl = condlevel
+  local res = condstack[lvl]
+  if res then
+    -- A previous branch at this level was already taken: skip this one.
+    if res == "else" then wfatal(".elif after .else") end
+  else
+    res = cond_eval(params[1])
+    if res then
+      condstack[lvl] = res
+      return
+    end
+  end
+  stmtskip()
+end
+
+map_coreop[".else_0"] = function(params)
+  if condlevel == 0 then wfatal(".else without .if") end
+  local lvl = condlevel
+  local res = condstack[lvl]
+  condstack[lvl] = "else" -- Mark the level so further .elif/.else fail.
+  if res then
+    if res == "else" then wfatal(".else after .else") end
+    stmtskip()
+  end
+end
+
+map_coreop[".endif_0"] = function(params)
+  local lvl = condlevel
+  if lvl == 0 then wfatal(".endif without .if") end
+  condlevel = lvl - 1
+end
+
+-- Check for unfinished conditionals.
+local function checkconds()
+  -- Suppressed after a fatal error, since nesting state may be stale.
+  if g_errcount ~= "fatal" and condlevel ~= 0 then
+    wprinterr(g_fname, ":*: error: unbalanced conditional\n")
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Search the include path for a file and open it for reading.
+-- Sets g_fname and returns the open handle, or nil if not found.
+local function pathopen(path, name)
+  local dirsep = match(package.path, "\\") and "\\" or "/"
+  for _,dir in ipairs(path) do
+    local full
+    if dir == "" then full = name else full = dir..dirsep..name end
+    local fin = io.open(full, "r")
+    if fin then
+      g_fname = full
+      return fin
+    end
+  end
+end
+
+-- Include a file.
+map_coreop[".include_1"] = function(params)
+  if not params then return "filename" end
+  local name = params[1]
+  -- Save state. Ugly, I know, but upvalues are fast.
+  local gf, gl, gcl, gi = g_fname, g_lineno, g_curline, g_indent
+  -- Read the included file. wfatal aborts, so readfile only sees a handle.
+  local fatal = readfile(pathopen(g_opt.include, name) or
+			 wfatal("include file `"..name.."' not found"))
+  -- Restore state.
+  g_synclineno = -1
+  g_fname, g_lineno, g_curline, g_indent = gf, gl, gcl, gi
+  if fatal then wfatal("in include file") end
+end
+
+-- Make .include initially available, too.
+map_op[".include_1"] = map_coreop[".include_1"]
+
+------------------------------------------------------------------------------
+
+-- Support variables for macros.
+local mac_capture, mac_lineno, mac_name
+local mac_active = {} -- Macros currently being expanded (recursion guard).
+local mac_list = {}   -- Definition order of macro op names, for dumping.
+
+-- Pseudo-opcode to define a macro.
+map_coreop[".macro_*"] = function(mparams)
+  if not mparams then return "name [, params...]" end
+  -- Split off and validate macro name.
+  local name = remove(mparams, 1)
+  if not name then werror("missing macro name") end
+  if not (match(name, "^[%a_][%w_%.]*$") or match(name, "^%.[%w_%.]+$")) then
+    wfatal("bad macro name `"..name.."'")
+  end
+  -- Validate macro parameter names.
+  local mdup = {}
+  for _,mp in ipairs(mparams) do
+    if not match(mp, "^[%a_][%w_]*$") then
+      wfatal("bad macro parameter name `"..mp.."'")
+    end
+    if mdup[mp] then wfatal("duplicate macro parameter name `"..mp.."'") end
+    mdup[mp] = true
+  end
+  -- Check for duplicate or recursive macro definitions.
+  -- Macros are overloaded by parameter count: "<name>_<nparams>".
+  local opname = name.."_"..#mparams
+  if map_op[opname] or map_op[name.."_*"] then
+    wfatal("duplicate macro `"..name.."' ("..#mparams.." parameters)")
+  end
+  if mac_capture then wfatal("recursive macro definition") end
+
+  -- Enable statement capture.
+  local lines = {}
+  mac_lineno = g_lineno
+  mac_name = name
+  mac_capture = function(stmt) -- Statement capture function.
+    -- Stop macro definition with .endmacro pseudo-opcode.
+    -- NOTE(review): the '.' in this pattern is unescaped and matches any
+    -- character -- harmless in practice; confirm before tightening.
+    if not match(stmt, "^%s*.endmacro%s*$") then
+      lines[#lines+1] = stmt
+      return
+    end
+    mac_capture = nil
+    mac_lineno = nil
+    mac_name = nil
+    mac_list[#mac_list+1] = opname
+    -- Add macro-op definition.
+    map_op[opname] = function(params)
+      if not params then return mparams, lines end
+      -- Protect against recursive macro invocation.
+      if mac_active[opname] then wfatal("recursive macro invocation") end
+      mac_active[opname] = true
+      -- Setup substitution map.
+      local subst = {}
+      for i,mp in ipairs(mparams) do subst[mp] = params[i] end
+      local mcom
+      if g_opt.maccomment and g_opt.comment then
+	mcom = " MACRO "..name.." ("..#mparams..")"
+	wcomment("{"..mcom)
+      end
+      -- Loop through all captured statements
+      for _,stmt in ipairs(lines) do
+	-- Substitute macro parameters.
+	local st = gsub(stmt, "[%w_]+", subst)
+	st = definesubst(st)
+	st = gsub(st, "%s*%.%.%s*", "") -- Token paste a..b.
+	if mcom and sub(st, 1, 1) ~= "|" then wcomment(st) end
+	-- Emit statement. Use a protected call for better diagnostics.
+	local ok, err = pcall(dostmt, st)
+	if not ok then
+	  -- Add the captured statement to the error.
+	  wprinterr(err, "\n", g_indent, "|  ", stmt,
+		    "\t[MACRO ", name, " (", #mparams, ")]\n")
+	end
+      end
+      if mcom then wcomment("}"..mcom) end
+      mac_active[opname] = nil
+    end
+  end
+end
+
+-- An .endmacro pseudo-opcode outside of a macro definition is an error.
+map_coreop[".endmacro_0"] = function(params)
+  wfatal(".endmacro without .macro")
+end
+
+-- Dump all macros and their contents (with -PP only).
+local function dumpmacros(out, lvl)
+  sort(mac_list)
+  out:write("Macros:\n")
+  for _,opname in ipairs(mac_list) do
+    -- Strips a 2-char "_<n>" suffix; NOTE(review): wrong for 10+ params.
+    local name = sub(opname, 1, -3)
+    local params, lines = map_op[opname]()
+    out:write(format("  %-20s %s\n", name, concat(params, ", ")))
+    if lvl > 1 then
+      for _,line in ipairs(lines) do
+	out:write("  |", line, "\n")
+      end
+      out:write("\n")
+    end
+  end
+  out:write("\n")
+end
+
+-- Check for unfinished macro definitions.
+local function checkmacros()
+  if mac_capture then
+    wprinterr(g_fname, ":", mac_lineno,
+	      ": error: unfinished .macro `", mac_name ,"'\n")
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Support variables for captures.
+local cap_lineno, cap_name
+local cap_buffers = {} -- Named buffers of captured output lines.
+local cap_used = {}    -- Buffers already written back via .dumpcapture.
+
+-- Start a capture.
+map_coreop[".capture_1"] = function(params)
+  if not params then return "name" end
+  wflush()
+  local name = params[1]
+  if not match(name, "^[%a_][%w_]*$") then
+    wfatal("bad capture name `"..name.."'")
+  end
+  if cap_name then
+    wfatal("already capturing to `"..cap_name.."' since line "..cap_lineno)
+  end
+  cap_name = name
+  cap_lineno = g_lineno
+  -- Create or continue a capture buffer and start the output line capture.
+  local buf = cap_buffers[name]
+  if not buf then buf = {}; cap_buffers[name] = buf end
+  g_capbuffer = buf
+  g_synclineno = 0
+end
+
+-- Stop a capture.
+map_coreop[".endcapture_0"] = function(params)
+  wflush()
+  if not cap_name then wfatal(".endcapture without a valid .capture") end
+  cap_name = nil
+  cap_lineno = nil
+  g_capbuffer = nil
+  g_synclineno = 0
+end
+
+-- Dump a capture buffer.
+map_coreop[".dumpcapture_1"] = function(params)
+  if not params then return "name" end
+  wflush()
+  local name = params[1]
+  if not match(name, "^[%a_][%w_]*$") then
+    wfatal("bad capture name `"..name.."'")
+  end
+  cap_used[name] = true
+  -- Defer the dump: the buffer may still grow until output is written.
+  wline(function(out)
+    local buf = cap_buffers[name]
+    if buf then wdumplines(out, buf) end
+  end)
+  g_synclineno = 0
+end
+
+-- Dump all captures and their buffers (with -PP only).
+local function dumpcaptures(out, lvl)
+  out:write("Captures:\n")
+  for name,buf in pairs(cap_buffers) do
+    out:write(format("  %-20s %4s)\n", name, "("..#buf))
+    if lvl > 1 then
+      local bar = rep("=", 76)
+      out:write("  ", bar, "\n")
+      for _,line in ipairs(buf) do
+	out:write("  ", line, "\n")
+      end
+      out:write("  ", bar, "\n\n")
+    end
+  end
+  out:write("\n")
+end
+
+-- Check for unfinished or unused captures.
+local function checkcaptures()
+  if cap_name then
+    wprinterr(g_fname, ":", cap_lineno,
+	      ": error: unfinished .capture `", cap_name,"'\n")
+    return
+  end
+  for name in pairs(cap_buffers) do
+    if not cap_used[name] then
+      wprinterr(g_fname, ":*: error: missing .dumpcapture ", name ,"\n")
+    end
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Sections names.
+local map_sections = {}
+
+-- Pseudo-opcode to define code sections.
+-- TODO: Data sections, BSS sections. Needs extra C code and API.
+map_coreop[".section_*"] = function(params)
+  if not params then return "name..." end
+  if #map_sections > 0 then werror("duplicate section definition") end
+  wflush()
+  for sn,name in ipairs(params) do
+    local opname = "."..name.."_0"
+    if not match(name, "^[%a][%w_]*$") or
+       map_op[opname] or map_op["."..name.."_*"] then
+      werror("bad section name `"..name.."'")
+    end
+    map_sections[#map_sections+1] = name
+    -- Emit a C define per section and register a .<name> pseudo-opcode.
+    wline(format("#define DASM_SECTION_%s\t%d", upper(name), sn-1))
+    map_op[opname] = function(params) g_arch.section(sn-1) end
+  end
+  wline(format("#define DASM_MAXSECTION\t\t%d", #map_sections))
+end
+
+-- Dump the names of all defined sections.
+local function dumpsections(out, lvl)
+  out:write("Sections:\n")
+  for _,secname in ipairs(map_sections) do
+    out:write("  ", secname, "\n")
+  end
+  out:write("\n")
+end
+
+------------------------------------------------------------------------------
+
+-- Load architecture-specific module ("dasm_"..arch).
+-- Returns an error message on failure, nil on success. On success the
+-- arch module merges into map_op/map_def and provides the flush callback.
+local function loadarch(arch)
+  if not match(arch, "^[%w_]+$") then return "bad arch name" end
+  local ok, m_arch = pcall(require, "dasm_"..arch)
+  if not ok then return "cannot load module: "..m_arch end
+  g_arch = m_arch
+  wflush = m_arch.passcb(wline, werror, wfatal, wwarn)
+  m_arch.setup(arch, g_opt)
+  map_op, map_def = m_arch.mergemaps(map_coreop, map_def)
+end
+
+-- Dump architecture description.
+function opt_map.dumparch(args)
+  local name = optparam(args)
+  if not g_arch then
+    local err = loadarch(name)
+    if err then opterror(err) end
+  end
+
+  local t = {}
+  for name in pairs(map_coreop) do t[#t+1] = name end
+  for name in pairs(map_op) do t[#t+1] = name end
+  sort(t)
+
+  local out = stdout
+  -- NOTE(review): '_arch' is unused; '_info' below is DynASM's own info.
+  local _arch = g_arch._info
+  out:write(format("%s version %s, released %s, %s\n",
+    _info.name, _info.version, _info.release, _info.url))
+  g_arch.dumparch(out)
+
+  local pseudo = true -- Still in the pseudo-opcode part of the sorted list.
+  out:write("Pseudo-Opcodes:\n")
+  for _,sname in ipairs(t) do
+    local name, nparam = match(sname, "^(.+)_([0-9%*])$")
+    if name then
+      if pseudo and sub(name, 1, 1) ~= "." then
+	out:write("\nOpcodes:\n")
+	pseudo = false
+      end
+      local f = map_op[sname]
+      local s
+      if nparam ~= "*" then nparam = nparam + 0 end
+      if nparam == 0 then
+	s = ""
+      elseif type(f) == "string" then
+	s = map_op[".template__"](nil, f, nparam)
+      else
+	s = f(nil, nparam)
+      end
+      if type(s) == "table" then
+	for _,s2 in ipairs(s) do
+	  out:write(format("  %-12s %s\n", name, s2))
+	end
+      else
+	out:write(format("  %-12s %s\n", name, s))
+      end
+    end
+  end
+  out:write("\n")
+  exit(0)
+end
+
+-- Pseudo-opcode to set the architecture.
+-- Only initially available (map_op is replaced when called).
+map_op[".arch_1"] = function(params)
+  if not params then return "name" end
+  local err = loadarch(params[1])
+  if err then wfatal(err) end
+end
+
+-- Dummy .arch pseudo-opcode to improve the error report.
+map_coreop[".arch_1"] = function(params)
+  if not params then return "name" end
+  wfatal("duplicate .arch statement")
+end
+
+------------------------------------------------------------------------------
+
+-- Dummy pseudo-opcode. Don't confuse '.nop' with 'nop'.
+map_coreop[".nop_*"] = function(params)
+  if not params then return "[ignored...]" end
+end
+
+-- Pseudo-opcodes to raise errors.
+map_coreop[".error_1"] = function(params)
+  if not params then return "message" end
+  werror(params[1]) -- Non-fatal: processing continues.
+end
+
+map_coreop[".fatal_1"] = function(params)
+  if not params then return "message" end
+  wfatal(params[1]) -- Fatal: aborts processing of the input file.
+end
+
+-- Dump all user defined elements.
+local function dumpdef(out)
+  local lvl = g_opt.dumpdef -- Verbosity: number of -P options given.
+  if lvl == 0 then return end
+  dumpsections(out, lvl)
+  dumpdefines(out, lvl)
+  if g_arch then g_arch.dumpdef(out, lvl) end
+  dumpmacros(out, lvl)
+  dumpcaptures(out, lvl)
+end
+
+------------------------------------------------------------------------------
+
+-- Helper for splitstmt.
+local splitlvl -- String stack of expected closing delimiters.
+
+local function splitstmt_one(c)
+  if c == "(" then
+    splitlvl = ")"..splitlvl
+  elseif c == "[" then
+    splitlvl = "]"..splitlvl
+  elseif c == ")" or c == "]" then
+    if sub(splitlvl, 1, 1) ~= c then werror("unbalanced () or []") end
+    splitlvl = sub(splitlvl, 2)
+  elseif splitlvl == "" then
+    return " \0 " -- Top-level comma: replace with a \0 separator.
+  end
+  return c
+end
+
+-- Split statement into (pseudo-)opcode and params.
+local function splitstmt(stmt)
+  -- Convert label with trailing-colon into .label statement.
+  local label = match(stmt, "^%s*(.+):%s*$")
+  if label then return ".label", {label} end
+
+  -- Split at commas, but obey parentheses and brackets.
+  splitlvl = ""
+  stmt = gsub(stmt, "[,%(%)%[%]]", splitstmt_one)
+  if splitlvl ~= "" then werror("unbalanced () or []") end
+
+  -- Split off opcode.
+  local op, other = match(stmt, "^%s*([^%s%z]+)%s*(.*)$")
+  if not op then werror("bad statement syntax") end
+
+  -- Split parameters at the \0 markers inserted above.
+  local params = {}
+  for p in gmatch(other, "%s*(%Z+)%z?") do
+    params[#params+1] = gsub(p, "%s+$", "")
+  end
+  if #params > 16 then werror("too many parameters") end
+
+  params.op = op
+  return op, params
+end
+
+-- Process a single statement.
+-- Assigned to a forward-declared upvalue so stmtskip/macros can hook it.
+dostmt = function(stmt)
+  -- Ignore empty statements.
+  if match(stmt, "^%s*$") then return end
+
+  -- Capture macro defs before substitution.
+  if mac_capture then return mac_capture(stmt) end
+  stmt = definesubst(stmt)
+
+  -- Emit C code without parsing the line.
+  if sub(stmt, 1, 1) == "|" then
+    local tail = sub(stmt, 2)
+    wflush()
+    if sub(tail, 1, 2) == "//" then wcomment(tail) else wline(tail, true) end
+    return
+  end
+
+  -- Split into (pseudo-)opcode and params.
+  local op, params = splitstmt(stmt)
+
+  -- Get opcode handler (matching # of parameters or generic handler).
+  local f = map_op[op.."_"..#params] or map_op[op.."_*"]
+  if not f then
+    if not g_arch then wfatal("first statement must be .arch") end
+    -- Improve error report.
+    for i=0,16 do
+      if map_op[op.."_"..i] then
+	werror("wrong number of parameters for `"..op.."'")
+      end
+    end
+    werror("unknown statement `"..op.."'")
+  end
+
+  -- Call opcode handler or special handler for template strings.
+  if type(f) == "string" then
+    map_op[".template__"](params, f)
+  else
+    f(params)
+  end
+end
+
+-- Process a single line.
+-- Dispatches plain C lines vs. '|'-prefixed assembler lines.
+local function doline(line)
+  if g_opt.flushline then wflush() end
+
+  -- Assembler line?
+  local indent, aline = match(line, "^(%s*)%|(.*)$")
+  if not aline then
+    -- No, plain C code line, need to flush first.
+    wflush()
+    wsync()
+    wline(line, false)
+    return
+  end
+
+  g_indent = indent -- Remember current line indentation.
+
+  -- Emit C code (even from macros). Avoids echo and line parsing.
+  if sub(aline, 1, 1) == "|" then
+    if not mac_capture then
+      wsync()
+    elseif g_opt.comment then
+      wsync()
+      wcomment(aline)
+    end
+    dostmt(aline)
+    return
+  end
+
+  -- Echo assembler line as a comment.
+  if g_opt.comment then
+    wsync()
+    wcomment(aline)
+  end
+
+  -- Strip assembler comments.
+  aline = gsub(aline, "//.*$", "")
+
+  -- Split line into statements at semicolons.
+  if match(aline, ";") then
+    for stmt in gmatch(aline, "[^;]+") do dostmt(stmt) end
+  else
+    dostmt(aline)
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Write DynASM header.
+local function dasmhead(out)
+  out:write(format([[
+/*
+** This file has been pre-processed with DynASM.
+** %s
+** DynASM version %s, DynASM %s version %s
+** DO NOT EDIT! The original file is in "%s".
+*/
+
+#if DASM_VERSION != %d
+#error "Version mismatch between DynASM and included encoding engine"
+#endif
+
+]], _info.url,
+    _info.version, g_arch._info.arch, g_arch._info.version,
+    g_fname, _info.vernum))
+end
+
+-- Read input file.
+-- Returns true if a fatal error aborted processing, nil otherwise.
+readfile = function(fin)
+  g_indent = ""
+  g_lineno = 0
+  g_synclineno = -1
+
+  -- Process all lines.
+  for line in fin:lines() do
+    g_lineno = g_lineno + 1
+    g_curline = line
+    local ok, err = pcall(doline, line)
+    if not ok and wprinterr(err, "\n") then return true end
+  end
+  wflush()
+
+  -- Close input file.
+  assert(fin == stdin or fin:close())
+end
+
+-- Write output file.
+local function writefile(outfile)
+  local fout
+
+  -- Open output file. nil or "-" selects stdout.
+  if outfile == nil or outfile == "-" then
+    fout = stdout
+  else
+    fout = assert(io.open(outfile, "w"))
+  end
+
+  -- Write all buffered lines
+  wdumplines(fout, g_wbuffer)
+
+  -- Close output file.
+  assert(fout == stdout or fout:close())
+
+  -- Optionally dump definitions. Use the stream not taken by the output.
+  dumpdef(fout == stdout and stderr or stdout)
+end
+
+-- Translate an input file to an output file.
+local function translate(infile, outfile)
+  g_wbuffer = {}
+  g_indent = ""
+  g_lineno = 0
+  g_synclineno = -1
+
+  -- Put header.
+  wline(dasmhead)
+
+  -- Read input file. "-" selects stdin.
+  local fin
+  if infile == "-" then
+    g_fname = "(stdin)"
+    fin = stdin
+  else
+    g_fname = infile
+    fin = assert(io.open(infile, "r"))
+  end
+  readfile(fin)
+
+  -- Check for errors.
+  if not g_arch then
+    wprinterr(g_fname, ":*: error: missing .arch directive\n")
+  end
+  checkconds()
+  checkmacros()
+  checkcaptures()
+
+  -- g_errcount is a number or "fatal"; either way nonzero means failure.
+  if g_errcount ~= 0 then
+    stderr:write(g_fname, ":*: info: ", g_errcount, " error",
+      (type(g_errcount) == "number" and g_errcount > 1) and "s" or "",
+      " in input file -- no output file generated.\n")
+    dumpdef(stderr)
+    exit(1)
+  end
+
+  -- Write output file.
+  writefile(outfile)
+end
+
+------------------------------------------------------------------------------
+
+-- Print help text.
+function opt_map.help()
+  stdout:write("DynASM -- ", _info.description, ".\n")
+  stdout:write("DynASM ", _info.version, " ", _info.release, "  ", _info.url, "\n")
+  stdout:write[[
+
+Usage: dynasm [OPTION]... INFILE.dasc|-
+
+  -h, --help           Display this help text.
+  -V, --version        Display version and copyright information.
+
+  -o, --outfile FILE   Output file name (default is stdout).
+  -I, --include DIR    Add directory to the include search path.
+
+  -c, --ccomment       Use /* */ comments for assembler lines.
+  -C, --cppcomment     Use // comments for assembler lines (default).
+  -N, --nocomment      Suppress assembler lines in output.
+  -M, --maccomment     Show macro expansions as comments (default off).
+
+  -L, --nolineno       Suppress CPP line number information in output.
+  -F, --flushline      Flush action list for every line.
+
+  -D NAME[=SUBST]      Define a substitution.
+  -U NAME              Undefine a substitution.
+
+  -P, --dumpdef        Dump defines, macros, etc. Repeat for more output.
+  -A, --dumparch ARCH  Load architecture ARCH and dump description.
+]]
+  exit(0)
+end
+
+-- Print version information.
+function opt_map.version()
+  stdout:write(format("%s version %s, released %s\n%s\n\n%s",
+    _info.name, _info.version, _info.release, _info.url, _info.copyright))
+  exit(0)
+end
+
+-- Misc. options. Each simply records a flag or value in g_opt.
+function opt_map.outfile(args) g_opt.outfile = optparam(args) end
+function opt_map.include(args) insert(g_opt.include, 1, optparam(args)) end
+function opt_map.ccomment() g_opt.comment = "/*|"; g_opt.endcomment = " */" end
+function opt_map.cppcomment() g_opt.comment = "//|"; g_opt.endcomment = "" end
+function opt_map.nocomment() g_opt.comment = false end
+function opt_map.maccomment() g_opt.maccomment = true end
+function opt_map.nolineno() g_opt.cpp = false end
+function opt_map.flushline() g_opt.flushline = true end
+function opt_map.dumpdef() g_opt.dumpdef = g_opt.dumpdef + 1 end
+
+------------------------------------------------------------------------------
+
+-- Short aliases for long options.
+local opt_alias = {
+  h = "help", ["?"] = "help", V = "version",
+  o = "outfile", I = "include",
+  c = "ccomment", C = "cppcomment", N = "nocomment", M = "maccomment",
+  L = "nolineno", F = "flushline",
+  P = "dumpdef", A = "dumparch",
+}
+
+-- Parse single option. opt is the name without leading dashes; args is
+-- the argument vector (consumed via optparam for parameterized options).
+local function parseopt(opt, args)
+  opt_current = #opt == 1 and "-"..opt or "--"..opt
+  local f = opt_map[opt] or opt_map[opt_alias[opt]]
+  if not f then
+    opterror("unrecognized option `", opt_current, "'. Try `--help'.\n")
+  end
+  f(args)
+end
+
+-- Parse arguments.
+local function parseargs(args)
+  -- Default options.
+  g_opt.comment = "//|"
+  g_opt.endcomment = ""
+  g_opt.cpp = true
+  g_opt.dumpdef = 0
+  g_opt.include = { "" }
+
+  -- Process all option arguments.
+  args.argn = 1
+  repeat
+    local a = args[args.argn]
+    if not a then break end
+    local lopt, opt = match(a, "^%-(%-?)(.+)")
+    if not opt then break end
+    args.argn = args.argn + 1
+    if lopt == "" then
+      -- Loop through short options, e.g. -cPP.
+      for o in gmatch(opt, ".") do parseopt(o, args) end
+    else
+      -- Long option.
+      parseopt(opt, args)
+    end
+  until false
+
+  -- Check for proper number of arguments.
+  local nargs = #args - args.argn + 1
+  if nargs ~= 1 then
+    -- With -P and no input file, just dump the defines and exit.
+    if nargs == 0 then
+      if g_opt.dumpdef > 0 then return dumpdef(stdout) end
+    end
+    opt_map.help()
+  end
+
+  -- Translate a single input file to a single output file
+  -- TODO: Handle multiple files?
+  translate(args[args.argn], g_opt.outfile)
+end
+
+------------------------------------------------------------------------------
+
+-- Add the directory dynasm.lua resides in to the Lua module search path,
+-- so the dasm_<arch> modules can be found next to this script.
+local arg = arg
+if arg and arg[0] then
+  local prefix = match(arg[0], "^(.*[/\\])")
+  if prefix then package.path = prefix.."?.lua;"..package.path end
+end
+
+-- Start DynASM.
+parseargs{...}
+
+------------------------------------------------------------------------------
+

+ 41 - 0
etc/strict.lua

@@ -0,0 +1,41 @@
+--
+-- strict.lua
+-- checks uses of undeclared global variables
+-- All global variables must be 'declared' through a regular assignment
+-- (even assigning nil will do) in a main chunk before being used
+-- anywhere or assigned to inside a function.
+--
+
+local getinfo, error, rawset, rawget = debug.getinfo, error, rawset, rawget
+
+-- Reuse an existing metatable on _G if one is already set.
+local mt = getmetatable(_G)
+if mt == nil then
+  mt = {}
+  setmetatable(_G, mt)
+end
+
+mt.__declared = {} -- Set of global names assigned from a main chunk.
+
+-- Classify the caller two stack levels up: "main", "C" or "Lua".
+local function what ()
+  local d = getinfo(3, "S")
+  return d and d.what or "C"
+end
+
+-- Trap assignments to undeclared globals made from inside functions.
+mt.__newindex = function (t, n, v)
+  if not mt.__declared[n] then
+    local w = what()
+    if w ~= "main" and w ~= "C" then
+      error("assign to undeclared variable '"..n.."'", 2)
+    end
+    mt.__declared[n] = true
+  end
+  rawset(t, n, v)
+end
+  
+-- Trap reads of undeclared globals (C code is exempt).
+mt.__index = function (t, n)
+  if not mt.__declared[n] and what() ~= "C" then
+    error("variable '"..n.."' is not declared", 2)
+  end
+  return rawget(t, n)
+end

+ 1 - 0
lib/.gitignore

@@ -0,0 +1 @@
+vmdef.lua

+ 182 - 0
lib/bc.lua

@@ -0,0 +1,182 @@
+----------------------------------------------------------------------------
+-- LuaJIT bytecode listing module.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- Released under the MIT/X license. See Copyright Notice in luajit.h
+----------------------------------------------------------------------------
+--
+-- This module lists the bytecode of a Lua function. If it's loaded by -jbc
+-- it hooks into the parser and lists all functions of a chunk as they
+-- are parsed.
+--
+-- Example usage:
+--
+--   luajit -jbc -e 'local x=0; for i=1,1e6 do x=x+i end; print(x)'
+--   luajit -jbc=- foo.lua
+--   luajit -jbc=foo.list foo.lua
+--
+-- Default output is to stderr. To redirect the output to a file, pass a
+-- filename as an argument (use '-' for stdout) or set the environment
+-- variable LUAJIT_LISTFILE. The file is overwritten every time the module
+-- is started.
+--
+-- This module can also be used programmatically:
+--
+--   local bc = require("jit.bc")
+--
+--   local function foo() print("hello") end
+--
+--   bc.dump(foo)           --> -- BYTECODE -- [...]
+--   print(bc.line(foo, 2)) --> 0002    KSTR     1   1      ; "hello"
+--
+--   local out = {
+--     -- Do something with each line:
+--     write = function(t, ...) io.write(...) end,
+--     close = function(t) end,
+--     flush = function(t) end,
+--   }
+--   bc.dump(foo, out)
+--
+------------------------------------------------------------------------------
+
+-- Cache some library functions and objects.
+local jit = require("jit")
+assert(jit.version_num == 20000, "LuaJIT core/library version mismatch")
+local jutil = require("jit.util")
+local vmdef = require("jit.vmdef")
+local bit = require("bit")
+local sub, gsub, format = string.sub, string.gsub, string.format
+local byte, band, shr = string.byte, bit.band, bit.rshift
+local funcinfo, funcbc, funck = jutil.funcinfo, jutil.funcbc, jutil.funck
+local funcuvname = jutil.funcuvname
+local bcnames = vmdef.bcnames
+local stdout, stderr = io.stdout, io.stderr
+
+------------------------------------------------------------------------------
+
+-- Escape a control character for printable listing output.
+local function ctlsub(c)
+  if c == "\n" then return "\\n"
+  elseif c == "\r" then return "\\r"
+  elseif c == "\t" then return "\\t"
+  else return format("\\%03d", byte(c))
+  end
+end
+
+-- Return one bytecode line as a formatted string, or nil if pc is past
+-- the end of the function. prefix (e.g. "=>") marks branch targets.
+local function bcline(func, pc, prefix)
+  local ins, m = funcbc(func, pc)
+  if not ins then return end
+  -- Operand modes: A in bits 0-2, B in bits 3-6, C/D in bits 7-10 of m.
+  local ma, mb, mc = band(m, 7), band(m, 15*8), band(m, 15*128)
+  local a = band(shr(ins, 8), 0xff)
+  local oidx = 6*band(ins, 0xff) -- Opcode names are 6 chars each in bcnames.
+  local s = format("%04d %s %-6s %3s ",
+    pc, prefix or "  ", sub(bcnames, oidx+1, oidx+6), ma == 0 and "" or a)
+  local d = shr(ins, 16)
+  if mc == 13*128 then -- BCMjump
+    if ma == 0 then
+      return format("%s=> %04d\n", sub(s, 1, -3), pc+d-0x7fff)
+    end
+    return format("%s=> %04d\n", s, pc+d-0x7fff)
+  end
+  if mb ~= 0 then d = band(d, 0xff) end
+  local kc
+  if mc == 10*128 then -- BCMstr
+    kc = funck(func, -d-1)
+    kc = format(#kc > 40 and '"%.40s"~' or '"%s"', gsub(kc, "%c", ctlsub))
+  elseif mc == 9*128 then -- BCMnum
+    kc = funck(func, d)
+  elseif mc == 12*128 then -- BCMfunc
+    local fi = funcinfo(funck(func, -d-1))
+    if fi.ffid then
+      kc = vmdef.ffnames[fi.ffid]
+    else
+      kc = fi.loc
+    end
+  elseif mc == 5*128 then -- BCMuv
+    kc = funcuvname(func, d)
+  end
+  if ma == 5 then -- BCMuv
+    local ka = funcuvname(func, a)
+    if kc then kc = ka.." ; "..kc else kc = ka end
+  end
+  if mb ~= 0 then
+    local b = shr(ins, 24)
+    if kc then return format("%s%3d %3d  ; %s\n", s, b, d, kc) end
+    return format("%s%3d %3d\n", s, b, d)
+  end
+  if kc then return format("%s%3d      ; %s\n", s, d, kc) end
+  if mc == 7*128 and d > 32767 then d = d - 65536 end -- BCMlits
+  return format("%s%3d\n", s, d)
+end
+
+-- Collect the branch targets of a function.
+-- Returns a table mapping target pc -> true.
+local function bctargets(func)
+  local target = {}
+  local pc = 1
+  while true do
+    local ins, m = funcbc(func, pc)
+    if not ins then break end
+    if band(m, 15*128) == 13*128 then -- BCMjump
+      target[pc+shr(ins, 16)-0x7fff] = true
+    end
+    pc = pc + 1
+  end
+  return target
+end
+
+-- Dump bytecode instructions of a function.
+-- out defaults to stdout; any object with write/flush methods works.
+local function bcdump(func, out)
+  if not out then out = stdout end
+  local fi = funcinfo(func)
+  out:write(format("-- BYTECODE -- %s-%d\n", fi.loc, fi.lastlinedefined))
+  local target = bctargets(func)
+  for pc=1,1000000000 do
+    local s = bcline(func, pc, target[pc] and "=>")
+    if not s then break end
+    out:write(s)
+  end
+  out:write("\n")
+  out:flush()
+end
+
+------------------------------------------------------------------------------
+
+-- Active flag and output file handle.
+local active, out
+
+-- List handler.
+local function h_list(func)
+  return bcdump(func, out)
+end
+
+-- Detach list handler.
+local function bclistoff()
+  if active then
+    active = false
+    jit.attach(h_list) -- No event name: detaches the handler.
+    if out and out ~= stdout and out ~= stderr then out:close() end
+    out = nil
+  end
+end
+
+-- Open the output file and attach list handler.
+-- outfile defaults to $LUAJIT_LISTFILE, then stderr; "-" means stdout.
+local function bcliston(outfile)
+  if active then bclistoff() end
+  if not outfile then outfile = os.getenv("LUAJIT_LISTFILE") end
+  if outfile then
+    out = outfile == "-" and stdout or assert(io.open(outfile, "w"))
+  else
+    out = stderr
+  end
+  jit.attach(h_list, "bc")
+  active = true
+end
+
+-- Public module functions.
+module(...)
+
+line = bcline
+dump = bcdump
+targets = bctargets
+
+on = bcliston
+off = bclistoff
+start = bcliston -- For -j command line option.
+

+ 19 - 0
lib/dis_x64.lua

@@ -0,0 +1,19 @@
+----------------------------------------------------------------------------
+-- LuaJIT x64 disassembler wrapper module.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- Released under the MIT/X license. See Copyright Notice in luajit.h
+----------------------------------------------------------------------------
+-- This module just exports the 64 bit functions from the combined
+-- x86/x64 disassembler module. All the interesting stuff is there.
+------------------------------------------------------------------------------
+
+local require = require
+
+module(...)
+
+-- Pull in the combined x86/x64 module and re-export its 64 bit entry points.
+local dis_x86 = require(_PACKAGE.."dis_x86")
+
+create = dis_x86.create64
+disass = dis_x86.disass64

+ 824 - 0
lib/dis_x86.lua

@@ -0,0 +1,824 @@
+----------------------------------------------------------------------------
+-- LuaJIT x86/x64 disassembler module.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- Released under the MIT/X license. See Copyright Notice in luajit.h
+----------------------------------------------------------------------------
+-- This is a helper module used by the LuaJIT machine code dumper module.
+--
+-- Sending small code snippets to an external disassembler and mixing the
+-- output with our own stuff was too fragile. So I had to bite the bullet
+-- and write yet another x86 disassembler. Oh well ...
+--
+-- The output format is very similar to what ndisasm generates. But it has
+-- been developed independently by looking at the opcode tables from the
+-- Intel and AMD manuals. The supported instruction set is quite extensive
+-- and reflects what a current generation Intel or AMD CPU implements in
+-- 32 bit and 64 bit mode. Yes, this includes MMX, SSE, SSE2, SSE3, SSSE3,
+-- SSE4.1, SSE4.2, SSE4a and even privileged and hypervisor (VMX/SVM)
+-- instructions.
+--
+-- Notes:
+-- * The (useless) a16 prefix, 3DNow and pre-586 opcodes are unsupported.
+-- * No attempt at optimization has been made -- it's fast enough for my needs.
+-- * The public API may change when more architectures are added.
+------------------------------------------------------------------------------
+
+local type = type
+local sub, byte, format = string.sub, string.byte, string.format
+local match, gmatch, gsub = string.match, string.gmatch, string.gsub
+local lower, rep = string.lower, string.rep
+
+-- Map for 1st opcode byte in 32 bit mode. Ugly? Well ... read on.
+-- Entry format: lowercase opcode name followed by an operand-pattern
+-- string (decoded by putpat below). Special chars: "*" chains to a
+-- named handler in map_act, "!" selects a sub-opcode group from
+-- map_opcgroup, ":" collects a prefix, "$" and "|" mark reg/mem and
+-- prefix-dependent variants (resolved in dispatch).
+local map_opc1_32 = {
+--0x
+[0]="addBmr","addVmr","addBrm","addVrm","addBai","addVai","push es","pop es",
+"orBmr","orVmr","orBrm","orVrm","orBai","orVai","push cs","opc2*",
+--1x
+"adcBmr","adcVmr","adcBrm","adcVrm","adcBai","adcVai","push ss","pop ss",
+"sbbBmr","sbbVmr","sbbBrm","sbbVrm","sbbBai","sbbVai","push ds","pop ds",
+--2x
+"andBmr","andVmr","andBrm","andVrm","andBai","andVai","es:seg","daa",
+"subBmr","subVmr","subBrm","subVrm","subBai","subVai","cs:seg","das",
+--3x
+"xorBmr","xorVmr","xorBrm","xorVrm","xorBai","xorVai","ss:seg","aaa",
+"cmpBmr","cmpVmr","cmpBrm","cmpVrm","cmpBai","cmpVai","ds:seg","aas",
+--4x
+"incVR","incVR","incVR","incVR","incVR","incVR","incVR","incVR",
+"decVR","decVR","decVR","decVR","decVR","decVR","decVR","decVR",
+--5x
+"pushUR","pushUR","pushUR","pushUR","pushUR","pushUR","pushUR","pushUR",
+"popUR","popUR","popUR","popUR","popUR","popUR","popUR","popUR",
+--6x
+"sz*pushaw,pusha","sz*popaw,popa","boundVrm","arplWmr",
+"fs:seg","gs:seg","o16:","a16",
+"pushUi","imulVrmi","pushBs","imulVrms",
+"insb","insVS","outsb","outsVS",
+--7x
+"joBj","jnoBj","jbBj","jnbBj","jzBj","jnzBj","jbeBj","jaBj",
+"jsBj","jnsBj","jpeBj","jpoBj","jlBj","jgeBj","jleBj","jgBj",
+--8x
+"arith!Bmi","arith!Vmi","arith!Bmi","arith!Vms",
+"testBmr","testVmr","xchgBrm","xchgVrm",
+"movBmr","movVmr","movBrm","movVrm",
+"movVmg","leaVrm","movWgm","popUm",
+--9x
+"nop*xchgVaR|pause|xchgWaR|repne nop","xchgVaR","xchgVaR","xchgVaR",
+"xchgVaR","xchgVaR","xchgVaR","xchgVaR",
+"sz*cbw,cwde,cdqe","sz*cwd,cdq,cqo","call farViw","wait",
+"sz*pushfw,pushf","sz*popfw,popf","sahf","lahf",
+--Ax
+"movBao","movVao","movBoa","movVoa",
+"movsb","movsVS","cmpsb","cmpsVS",
+"testBai","testVai","stosb","stosVS",
+"lodsb","lodsVS","scasb","scasVS",
+--Bx
+"movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi",
+"movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI",
+--Cx
+"shift!Bmu","shift!Vmu","retBw","ret","$lesVrm","$ldsVrm","movBmi","movVmi",
+"enterBwu","leave","retfBw","retf","int3","intBu","into","iretVS",
+--Dx
+"shift!Bm1","shift!Vm1","shift!Bmc","shift!Vmc","aamBu","aadBu","salc","xlatb",
+"fp*0","fp*1","fp*2","fp*3","fp*4","fp*5","fp*6","fp*7",
+--Ex
+"loopneBj","loopeBj","loopBj","sz*jcxzBj,jecxzBj,jrcxzBj",
+"inBau","inVau","outBua","outVua",
+"callVj","jmpVj","jmp farViw","jmpBj","inBad","inVad","outBda","outVda",
+--Fx
+"lock:","int1","repne:rep","rep:","hlt","cmc","testb!Bm","testv!Vm",
+"clc","stc","cli","sti","cld","std","incb!Bm","incd!Vm",
+}
+assert(#map_opc1_32 == 255)
+
+-- Map for 1st opcode byte in 64 bit mode (overrides only).
+-- false marks opcodes that are invalid in 64 bit mode; everything
+-- else falls through to map_opc1_32 via the metatable.
+local map_opc1_64 = setmetatable({
+  [0x06]=false, [0x07]=false, [0x0e]=false,
+  [0x16]=false, [0x17]=false, [0x1e]=false, [0x1f]=false,
+  [0x27]=false, [0x2f]=false, [0x37]=false, [0x3f]=false,
+  [0x60]=false, [0x61]=false, [0x62]=false, [0x63]="movsxdVrDmt", [0x67]="a32:",
+  [0x40]="rex*",   [0x41]="rex*b",   [0x42]="rex*x",   [0x43]="rex*xb",
+  [0x44]="rex*r",  [0x45]="rex*rb",  [0x46]="rex*rx",  [0x47]="rex*rxb",
+  [0x48]="rex*w",  [0x49]="rex*wb",  [0x4a]="rex*wx",  [0x4b]="rex*wxb",
+  [0x4c]="rex*wr", [0x4d]="rex*wrb", [0x4e]="rex*wrx", [0x4f]="rex*wrxb",
+  [0x82]=false, [0x9a]=false, [0xc4]=false, [0xc5]=false, [0xce]=false,
+  [0xd4]=false, [0xd5]=false, [0xd6]=false, [0xea]=false,
+}, { __index = map_opc1_32 })
+
+-- Map for 2nd opcode byte (0F xx). True CISC hell. Hey, I told you.
+-- Prefix dependent MMX/SSE opcodes: (none)|rep|o16|repne, -|F3|66|F2
+-- "|" separates these variants (selected in dispatch); an empty slot
+-- between "|" means the prefix combination is invalid ("(unknown)").
+local map_opc2 = {
+--0x
+[0]="sldt!Dmp","sgdt!Ump","larVrm","lslVrm",nil,"syscall","clts","sysret",
+"invd","wbinvd",nil,"ud1",nil,"$prefetch!Bm","femms","3dnowMrmu",
+--1x
+"movupsXrm|movssXrm|movupdXrm|movsdXrm",
+"movupsXmr|movssXmr|movupdXmr|movsdXmr",
+"movhlpsXrm$movlpsXrm|movsldupXrm|movlpdXrm|movddupXrm",
+"movlpsXmr||movlpdXmr",
+"unpcklpsXrm||unpcklpdXrm",
+"unpckhpsXrm||unpckhpdXrm",
+"movlhpsXrm$movhpsXrm|movshdupXrm|movhpdXrm",
+"movhpsXmr||movhpdXmr",
+"$prefetcht!Bm","hintnopVm","hintnopVm","hintnopVm",
+"hintnopVm","hintnopVm","hintnopVm","hintnopVm",
+--2x
+"movUmx$","movUmy$","movUxm$","movUym$","movUmz$",nil,"movUzm$",nil,
+"movapsXrm||movapdXrm",
+"movapsXmr||movapdXmr",
+"cvtpi2psXrMm|cvtsi2ssXrVm|cvtpi2pdXrMm|cvtsi2sdXrVm",
+"movntpsXmr|movntssXmr|movntpdXmr|movntsdXmr",
+"cvttps2piMrXm|cvttss2siVrXm|cvttpd2piMrXm|cvttsd2siVrXm",
+"cvtps2piMrXm|cvtss2siVrXm|cvtpd2piMrXm|cvtsd2siVrXm",
+"ucomissXrm||ucomisdXrm",
+"comissXrm||comisdXrm",
+--3x
+"wrmsr","rdtsc","rdmsr","rdpmc","sysenter","sysexit",nil,"getsec",
+"opc3*38",nil,"opc3*3a",nil,nil,nil,nil,nil,
+--4x
+"cmovoVrm","cmovnoVrm","cmovbVrm","cmovnbVrm",
+"cmovzVrm","cmovnzVrm","cmovbeVrm","cmovaVrm",
+"cmovsVrm","cmovnsVrm","cmovpeVrm","cmovpoVrm",
+"cmovlVrm","cmovgeVrm","cmovleVrm","cmovgVrm",
+--5x
+"movmskpsVrXm$||movmskpdVrXm$","sqrtpsXrm|sqrtssXrm|sqrtpdXrm|sqrtsdXrm",
+"rsqrtpsXrm|rsqrtssXrm","rcppsXrm|rcpssXrm",
+"andpsXrm||andpdXrm","andnpsXrm||andnpdXrm",
+"orpsXrm||orpdXrm","xorpsXrm||xorpdXrm",
+"addpsXrm|addssXrm|addpdXrm|addsdXrm","mulpsXrm|mulssXrm|mulpdXrm|mulsdXrm",
+"cvtps2pdXrm|cvtss2sdXrm|cvtpd2psXrm|cvtsd2ssXrm",
+"cvtdq2psXrm|cvttps2dqXrm|cvtps2dqXrm",
+"subpsXrm|subssXrm|subpdXrm|subsdXrm","minpsXrm|minssXrm|minpdXrm|minsdXrm",
+"divpsXrm|divssXrm|divpdXrm|divsdXrm","maxpsXrm|maxssXrm|maxpdXrm|maxsdXrm",
+--6x
+"punpcklbwPrm","punpcklwdPrm","punpckldqPrm","packsswbPrm",
+"pcmpgtbPrm","pcmpgtwPrm","pcmpgtdPrm","packuswbPrm",
+"punpckhbwPrm","punpckhwdPrm","punpckhdqPrm","packssdwPrm",
+"||punpcklqdqXrm","||punpckhqdqXrm",
+"movPrVSm","movqMrm|movdquXrm|movdqaXrm",
+--7x
+"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pmu",
+"pshiftd!Pmu","pshiftq!Mmu||pshiftdq!Xmu",
+"pcmpeqbPrm","pcmpeqwPrm","pcmpeqdPrm","emms|",
+"vmreadUmr||extrqXmuu$|insertqXrmuu$","vmwriteUrm||extrqXrm$|insertqXrm$",
+nil,nil,
+"||haddpdXrm|haddpsXrm","||hsubpdXrm|hsubpsXrm",
+"movVSmMr|movqXrm|movVSmXr","movqMmr|movdquXmr|movdqaXmr",
+--8x
+"joVj","jnoVj","jbVj","jnbVj","jzVj","jnzVj","jbeVj","jaVj",
+"jsVj","jnsVj","jpeVj","jpoVj","jlVj","jgeVj","jleVj","jgVj",
+--9x
+"setoBm","setnoBm","setbBm","setnbBm","setzBm","setnzBm","setbeBm","setaBm",
+"setsBm","setnsBm","setpeBm","setpoBm","setlBm","setgeBm","setleBm","setgBm",
+--Ax
+"push fs","pop fs","cpuid","btVmr","shldVmru","shldVmrc",nil,nil,
+"push gs","pop gs","rsm","btsVmr","shrdVmru","shrdVmrc","fxsave!Dmp","imulVrm",
+--Bx
+"cmpxchgBmr","cmpxchgVmr","$lssVrm","btrVmr",
+"$lfsVrm","$lgsVrm","movzxVrBmt","movzxVrWmt",
+"|popcntVrm","ud2Dp","bt!Vmu","btcVmr",
+"bsfVrm","bsrVrm|lzcntVrm|bsrWrm","movsxVrBmt","movsxVrWmt",
+--Cx
+"xaddBmr","xaddVmr",
+"cmppsXrmu|cmpssXrmu|cmppdXrmu|cmpsdXrmu","$movntiVmr|",
+"pinsrwPrWmu","pextrwDrPmu",
+"shufpsXrmu||shufpdXrmu","$cmpxchg!Qmp",
+"bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR",
+--Dx
+"||addsubpdXrm|addsubpsXrm","psrlwPrm","psrldPrm","psrlqPrm",
+"paddqPrm","pmullwPrm",
+"|movq2dqXrMm|movqXmr|movdq2qMrXm$","pmovmskbVrMm||pmovmskbVrXm",
+"psubusbPrm","psubuswPrm","pminubPrm","pandPrm",
+"paddusbPrm","padduswPrm","pmaxubPrm","pandnPrm",
+--Ex
+"pavgbPrm","psrawPrm","psradPrm","pavgwPrm",
+"pmulhuwPrm","pmulhwPrm",
+"|cvtdq2pdXrm|cvttpd2dqXrm|cvtpd2dqXrm","$movntqMmr||$movntdqXmr",
+"psubsbPrm","psubswPrm","pminswPrm","porPrm",
+"paddsbPrm","paddswPrm","pmaxswPrm","pxorPrm",
+--Fx
+"|||lddquXrm","psllwPrm","pslldPrm","psllqPrm",
+"pmuludqPrm","pmaddwdPrm","psadbwPrm","maskmovqMrm||maskmovdquXrm$",
+"psubbPrm","psubwPrm","psubdPrm","psubqPrm",
+"paddbPrm","paddwPrm","padddPrm","ud",
+}
+assert(map_opc2[255] == "ud")
+
+-- Map for three-byte opcodes. Can't wait for their next invention.
+-- Outer key is the 2nd opcode byte ("38" or "3a"); the subtables are
+-- indexed by the 3rd opcode byte (see the "opc3" handler in map_act).
+local map_opc3 = {
+["38"] = { -- [66] 0f 38 xx
+--0x
+[0]="pshufbPrm","phaddwPrm","phadddPrm","phaddswPrm",
+"pmaddubswPrm","phsubwPrm","phsubdPrm","phsubswPrm",
+"psignbPrm","psignwPrm","psigndPrm","pmulhrswPrm",
+nil,nil,nil,nil,
+--1x
+"||pblendvbXrma",nil,nil,nil,
+"||blendvpsXrma","||blendvpdXrma",nil,"||ptestXrm",
+nil,nil,nil,nil,
+"pabsbPrm","pabswPrm","pabsdPrm",nil,
+--2x
+"||pmovsxbwXrm","||pmovsxbdXrm","||pmovsxbqXrm","||pmovsxwdXrm",
+"||pmovsxwqXrm","||pmovsxdqXrm",nil,nil,
+"||pmuldqXrm","||pcmpeqqXrm","||$movntdqaXrm","||packusdwXrm",
+nil,nil,nil,nil,
+--3x
+"||pmovzxbwXrm","||pmovzxbdXrm","||pmovzxbqXrm","||pmovzxwdXrm",
+"||pmovzxwqXrm","||pmovzxdqXrm",nil,"||pcmpgtqXrm",
+"||pminsbXrm","||pminsdXrm","||pminuwXrm","||pminudXrm",
+"||pmaxsbXrm","||pmaxsdXrm","||pmaxuwXrm","||pmaxudXrm",
+--4x
+"||pmulddXrm","||phminposuwXrm",
+--Fx
+[0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt",
+},
+
+["3a"] = { -- [66] 0f 3a xx
+--0x
+[0x00]=nil,nil,nil,nil,nil,nil,nil,nil,
+"||roundpsXrmu","||roundpdXrmu","||roundssXrmu","||roundsdXrmu",
+"||blendpsXrmu","||blendpdXrmu","||pblendwXrmu","palignrPrmu",
+--1x
+nil,nil,nil,nil,
+"||pextrbVmXru","||pextrwVmXru","||pextrVmSXru","||extractpsVmXru",
+nil,nil,nil,nil,nil,nil,nil,nil,
+--2x
+"||pinsrbXrVmu","||insertpsXrmu","||pinsrXrVmuS",nil,
+--4x
+[0x40] = "||dppsXrmu",
+[0x41] = "||dppdXrmu",
+[0x42] = "||mpsadbwXrmu",
+--6x
+[0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu",
+[0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu",
+},
+}
+
+-- Map for VMX/SVM opcodes 0F 01 C0-FF (sgdt group with register operands).
+-- Indexed by the cached ModRM byte (see the "vm" handler in map_act).
+local map_opcvm = {
+[0xc1]="vmcall",[0xc2]="vmlaunch",[0xc3]="vmresume",[0xc4]="vmxoff",
+[0xc8]="monitor",[0xc9]="mwait",
+[0xd8]="vmrun",[0xd9]="vmmcall",[0xda]="vmload",[0xdb]="vmsave",
+[0xdc]="stgi",[0xdd]="clgi",[0xde]="skinit",[0xdf]="invlpga",
+[0xf8]="swapgs",[0xf9]="rdtscp",
+}
+
+-- Map for FP opcodes. And you thought stack machines are simple?
+-- Indexed by (opcode byte - 0xD8)*8 + sp, plus 64 when the ModRM byte
+-- is >= 0xC0 (register operand). Table-valued entries are further
+-- indexed by rm+1. See the "fp" handler in map_act.
+local map_opcfp = {
+-- D8-DF 00-BF: opcodes with a memory operand.
+-- D8
+[0]="faddFm","fmulFm","fcomFm","fcompFm","fsubFm","fsubrFm","fdivFm","fdivrFm",
+"fldFm",nil,"fstFm","fstpFm","fldenvVm","fldcwWm","fnstenvVm","fnstcwWm",
+-- DA
+"fiaddDm","fimulDm","ficomDm","ficompDm",
+"fisubDm","fisubrDm","fidivDm","fidivrDm",
+-- DB
+"fildDm","fisttpDm","fistDm","fistpDm",nil,"fld twordFmp",nil,"fstp twordFmp",
+-- DC
+"faddGm","fmulGm","fcomGm","fcompGm","fsubGm","fsubrGm","fdivGm","fdivrGm",
+-- DD
+"fldGm","fisttpQm","fstGm","fstpGm","frstorDmp",nil,"fnsaveDmp","fnstswWm",
+-- DE
+"fiaddWm","fimulWm","ficomWm","ficompWm",
+"fisubWm","fisubrWm","fidivWm","fidivrWm",
+-- DF
+"fildWm","fisttpWm","fistWm","fistpWm",
+"fbld twordFmp","fildQm","fbstp twordFmp","fistpQm",
+-- xx C0-FF: opcodes with a pseudo-register operand.
+-- D8
+"faddFf","fmulFf","fcomFf","fcompFf","fsubFf","fsubrFf","fdivFf","fdivrFf",
+-- D9
+"fldFf","fxchFf",{"fnop"},nil,
+{"fchs","fabs",nil,nil,"ftst","fxam"},
+{"fld1","fldl2t","fldl2e","fldpi","fldlg2","fldln2","fldz"},
+{"f2xm1","fyl2x","fptan","fpatan","fxtract","fprem1","fdecstp","fincstp"},
+{"fprem","fyl2xp1","fsqrt","fsincos","frndint","fscale","fsin","fcos"},
+-- DA
+"fcmovbFf","fcmoveFf","fcmovbeFf","fcmovuFf",nil,{nil,"fucompp"},nil,nil,
+-- DB
+"fcmovnbFf","fcmovneFf","fcmovnbeFf","fcmovnuFf",
+{nil,nil,"fnclex","fninit"},"fucomiFf","fcomiFf",nil,
+-- DC
+"fadd toFf","fmul toFf",nil,nil,
+"fsub toFf","fsubr toFf","fdivr toFf","fdiv toFf",
+-- DD
+"ffreeFf",nil,"fstFf","fstpFf","fucomFf","fucompFf",nil,nil,
+-- DE
+"faddpFf","fmulpFf",nil,{nil,"fcompp"},
+"fsubrpFf","fsubpFf","fdivrpFf","fdivpFf",
+-- DF
+nil,nil,nil,nil,{"fnstsw ax"},"fucomipFf","fcomipFf",nil,
+}
+assert(map_opcfp[126] == "fcomipFf")
+
+-- Map for opcode groups. The subkey is sp from the ModRM byte.
+-- nil entries are invalid encodings (reported as "(unknown)").
+local map_opcgroup = {
+  arith = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" },
+  shift = { "rol", "ror", "rcl", "rcr", "shl", "shr", "sal", "sar" },
+  testb = { "testBmi", "testBmi", "not", "neg", "mul", "imul", "div", "idiv" },
+  testv = { "testVmi", "testVmi", "not", "neg", "mul", "imul", "div", "idiv" },
+  incb = { "inc", "dec" },
+  incd = { "inc", "dec", "callDmp", "$call farDmp",
+	   "jmpDmp", "$jmp farDmp", "pushUm" },
+  sldt = { "sldt", "str", "lldt", "ltr", "verr", "verw" },
+  sgdt = { "vm*$sgdt", "vm*$sidt", "$lgdt", "vm*$lidt",
+	   "smsw", nil, "lmsw", "vm*$invlpg" },
+  bt = { nil, nil, nil, nil, "bt", "bts", "btr", "btc" },
+  cmpxchg = { nil, "sz*,cmpxchg8bQmp,cmpxchg16bXmp", nil, nil,
+	      nil, nil, "vmptrld|vmxon|vmclear", "vmptrst" },
+  pshiftw = { nil, nil, "psrlw", nil, "psraw", nil, "psllw" },
+  pshiftd = { nil, nil, "psrld", nil, "psrad", nil, "pslld" },
+  pshiftq = { nil, nil, "psrlq", nil, nil, nil, "psllq" },
+  pshiftdq = { nil, nil, "psrlq", "psrldq", nil, nil, "psllq", "pslldq" },
+  fxsave = { "$fxsave", "$fxrstor", "$ldmxcsr", "$stmxcsr",
+	     nil, "lfenceDp$", "mfenceDp$", "sfenceDp$clflush" },
+  prefetch = { "prefetch", "prefetchw" },
+  prefetcht = { "prefetchnta", "prefetcht0", "prefetcht1", "prefetcht2" },
+}
+
+------------------------------------------------------------------------------
+
+-- Maps for register names, indexed by register number + 1.
+-- B64 replaces B whenever a REX prefix is present (spl..dil instead
+-- of ah..bh); see the "B" pattern handling in putpat.
+local map_regs = {
+  B = { "al", "cl", "dl", "bl", "ah", "ch", "dh", "bh",
+	"r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b" },
+  B64 = { "al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil",
+	  "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b" },
+  W = { "ax", "cx", "dx", "bx", "sp", "bp", "si", "di",
+	"r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w" },
+  D = { "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
+	"r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" },
+  Q = { "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
+	"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" },
+  M = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+	"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }, -- No x64 ext!
+  X = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+	"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" },
+}
+local map_segregs = { "es", "cs", "ss", "ds", "fs", "gs", "segr6", "segr7" }
+
+-- Maps for size names.
+-- map_sz2n: operand size in bytes (for immediates);
+-- map_sz2prefix: size prefix used for memory operands.
+local map_sz2n = {
+  B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16,
+}
+local map_sz2prefix = {
+  B = "byte", W = "word", D = "dword",
+  Q = "qword",
+  M = "qword", X = "xword",
+  F = "dword", G = "qword", -- No need for sizes/register names for these two.
+}
+
+------------------------------------------------------------------------------
+
+-- Output a nicely formatted line with an opcode and operands.
+-- Consumes all pending prefix flags on the context (they are one-shot)
+-- and annotates the last immediate with a symbol table entry, if any.
+local function putop(ctx, text, operands)
+  local code, pos, hex = ctx.code, ctx.pos, ""
+  local hmax = ctx.hexdump
+  if hmax > 0 then
+    -- Hex dump of the instruction bytes, truncated or padded to hmax.
+    for i=ctx.start,pos-1 do
+      hex = hex..format("%02X", byte(code, i, i))
+    end
+    if #hex > hmax then hex = sub(hex, 1, hmax)..". "
+    else hex = hex..rep(" ", hmax-#hex+2) end
+  end
+  if operands then text = text.." "..operands end
+  -- Prepend pending prefixes and clear them.
+  if ctx.o16 then text = "o16 "..text; ctx.o16 = false end
+  if ctx.a32 then text = "a32 "..text; ctx.a32 = false end
+  if ctx.rep then text = ctx.rep.." "..text; ctx.rep = false end
+  if ctx.rex then
+    local t = (ctx.rexw and "w" or "")..(ctx.rexr and "r" or "")..
+	      (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")
+    if t ~= "" then text = "rex."..t.." "..text end
+    ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
+    ctx.rex = false
+  end
+  if ctx.seg then
+    -- Splice the segment override into the address expression, if any.
+    local text2, n = gsub(text, "%[", "["..ctx.seg..":")
+    if n == 0 then text = ctx.seg.." "..text else text = text2 end
+    ctx.seg = false
+  end
+  if ctx.lock then text = "lock "..text; ctx.lock = false end
+  local imm = ctx.imm
+  if imm then
+    local sym = ctx.symtab[imm]
+    if sym then text = text.."\t->"..sym end
+  end
+  ctx.out(format("%08x  %s%s\n", ctx.addr+ctx.start, hex, text))
+  -- Advance to the start of the next instruction.
+  ctx.mrm = false
+  ctx.start = pos
+  ctx.imm = nil
+end
+
+-- Reset every pending instruction prefix flag on the context.
+local function clearprefixes(ctx)
+  ctx.o16 = false; ctx.a32 = false
+  ctx.seg = false; ctx.lock = false; ctx.rep = false
+  ctx.rex = false
+  ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
+end
+
+-- Fallback for incomplete opcodes at the end.
+-- Consumes the remaining bytes and emits one "(incomplete)" line.
+local function incomplete(ctx)
+  ctx.pos = ctx.stop+1
+  clearprefixes(ctx)
+  return putop(ctx, "(incomplete)")
+end
+
+-- Fallback for unknown opcodes.
+-- Emits an "(unknown)" line; any collected prefixes are discarded.
+local function unknown(ctx)
+  clearprefixes(ctx)
+  return putop(ctx, "(unknown)")
+end
+
+-- Read a little-endian immediate of n (1, 2 or 4) bytes at pos.
+-- A 4 byte immediate is additionally stored in ctx.imm (used by putop
+-- for the symbol table lookup). Returns nil after emitting an
+-- "(incomplete)" line if the code ends too early.
+local function getimm(ctx, pos, n)
+  if pos+n-1 > ctx.stop then return incomplete(ctx) end
+  local code = ctx.code
+  if n == 1 then
+    return byte(code, pos, pos)
+  elseif n == 2 then
+    local lo, hi = byte(code, pos, pos+1)
+    return lo + hi*256
+  else
+    local b0, b1, b2, b3 = byte(code, pos, pos+3)
+    local imm = ((b3*256 + b2)*256 + b1)*256 + b0
+    ctx.imm = imm
+    return imm
+  end
+end
+
+-- Process pattern string and generate the operands.
+-- Each char of pat either selects the operand size (uppercase), emits
+-- one operand (immediates, registers, ModRM-encoded operands), or
+-- tweaks the opcode name. Decoded operands are joined with ", " and
+-- the finished line is emitted via putop.
+local function putpat(ctx, name, pat)
+  local operands, regs, sz, mode, sp, rm, sc, rx, sdisp
+  local code, pos, stop = ctx.code, ctx.pos, ctx.stop
+
+  -- Chars used: 1DFGIMPQRSTUVWXacdfgijmoprstuwxyz
+  for p in gmatch(pat, ".") do
+    local x = nil
+    -- Operand size selection: V/U = word/dword/qword by prefix.
+    if p == "V" or p == "U" then
+      if ctx.rexw then sz = "Q"; ctx.rexw = false
+      elseif ctx.o16 then sz = "W"; ctx.o16 = false
+      elseif p == "U" and ctx.x64 then sz = "Q"
+      else sz = "D" end
+      regs = map_regs[sz]
+    elseif p == "T" then
+      if ctx.rexw then sz = "Q"; ctx.rexw = false else sz = "D" end
+      regs = map_regs[sz]
+    elseif p == "B" then
+      sz = "B"
+      -- Any REX prefix switches to the spl..dil byte register names.
+      regs = ctx.rex and map_regs.B64 or map_regs.B
+    elseif match(p, "[WDQMXFG]") then
+      sz = p
+      regs = map_regs[sz]
+    elseif p == "P" then
+      -- MMX register, or SSE register with an o16 (66) prefix.
+      sz = ctx.o16 and "X" or "M"; ctx.o16 = false
+      regs = map_regs[sz]
+    elseif p == "S" then
+      -- Append the size letter to the opcode name (e.g. movsd).
+      name = name..lower(sz)
+    elseif p == "s" then
+      -- Byte immediate, printed with its sign.
+      local imm = getimm(ctx, pos, 1); if not imm then return end
+      x = imm <= 127 and format("+0x%02x", imm)
+		     or format("-0x%02x", 256-imm)
+      pos = pos+1
+    elseif p == "u" then
+      -- Unsigned byte immediate.
+      local imm = getimm(ctx, pos, 1); if not imm then return end
+      x = format("0x%02x", imm)
+      pos = pos+1
+    elseif p == "w" then
+      -- 16 bit immediate.
+      local imm = getimm(ctx, pos, 2); if not imm then return end
+      x = format("0x%x", imm)
+      pos = pos+2
+    elseif p == "o" then -- [offset]
+      if ctx.x64 then
+	local imm1 = getimm(ctx, pos, 4); if not imm1 then return end
+	local imm2 = getimm(ctx, pos+4, 4); if not imm2 then return end
+	x = format("[0x%08x%08x]", imm2, imm1)
+	pos = pos+8
+      else
+	local imm = getimm(ctx, pos, 4); if not imm then return end
+	x = format("[0x%08x]", imm)
+	pos = pos+4
+      end
+    elseif p == "i" or p == "I" then
+      -- Immediate of the current operand size ("I" allows imm64).
+      local n = map_sz2n[sz]
+      if n == 8 and ctx.x64 and p == "I" then
+	local imm1 = getimm(ctx, pos, 4); if not imm1 then return end
+	local imm2 = getimm(ctx, pos+4, 4); if not imm2 then return end
+	x = format("0x%08x%08x", imm2, imm1)
+      else
+	if n == 8 then n = 4 end
+	local imm = getimm(ctx, pos, n); if not imm then return end
+	if sz == "Q" and (imm < 0 or imm > 0x7fffffff) then
+	  imm = (0xffffffff+1)-imm
+	  x = format(imm > 65535 and "-0x%08x" or "-0x%x", imm)
+	else
+	  x = format(imm > 65535 and "0x%08x" or "0x%x", imm)
+	end
+      end
+      pos = pos+n
+    elseif p == "j" then
+      -- Relative branch target, shown as an absolute address.
+      local n = map_sz2n[sz]
+      if n == 8 then n = 4 end
+      local imm = getimm(ctx, pos, n); if not imm then return end
+      if sz == "B" and imm > 127 then imm = imm-256
+      elseif imm > 2147483647 then imm = imm-4294967296 end
+      pos = pos+n
+      imm = imm + pos + ctx.addr
+      if imm > 4294967295 and not ctx.x64 then imm = imm-4294967296 end
+      ctx.imm = imm
+      if sz == "W" then
+	x = format("word 0x%04x", imm%65536)
+      elseif ctx.x64 then
+	local lo = imm % 0x1000000
+	x = format("0x%02x%06x", (imm-lo) / 0x1000000, lo)
+      else
+	x = format("0x%08x", imm)
+      end
+    elseif p == "R" then
+      -- Register encoded in the low 3 bits of the opcode byte.
+      local r = byte(code, pos-1, pos-1)%8
+      if ctx.rexb then r = r + 8; ctx.rexb = false end
+      x = regs[r+1]
+    elseif p == "a" then x = regs[1]
+    elseif p == "c" then x = "cl"
+    elseif p == "d" then x = "dx"
+    elseif p == "1" then x = "1"
+    else
+      -- All remaining pattern chars need the ModRM byte: decode it
+      -- (and a possible SIB byte plus displacement) once on demand.
+      if not mode then
+	mode = ctx.mrm
+	if not mode then
+	  if pos > stop then return incomplete(ctx) end
+	  mode = byte(code, pos, pos)
+	  pos = pos+1
+	end
+	rm = mode%8; mode = (mode-rm)/8
+	sp = mode%8; mode = (mode-sp)/8
+	sdisp = ""
+	if mode < 3 then
+	  -- rm == 4: a SIB byte follows.
+	  if rm == 4 then
+	    if pos > stop then return incomplete(ctx) end
+	    sc = byte(code, pos, pos)
+	    pos = pos+1
+	    rm = sc%8; sc = (sc-rm)/8
+	    rx = sc%8; sc = (sc-rx)/8
+	    if ctx.rexx then rx = rx + 8; ctx.rexx = false end
+	    if rx == 4 then rx = nil end
+	  end
+	  -- Displacement: disp8 for mode 1, otherwise disp32.
+	  if mode > 0 or rm == 5 then
+	    local dsz = mode
+	    if dsz ~= 1 then dsz = 4 end
+	    local disp = getimm(ctx, pos, dsz); if not disp then return end
+	    if mode == 0 then rm = nil end
+	    if rm or rx or (not sc and ctx.x64 and not ctx.a32) then
+	      if dsz == 1 and disp > 127 then
+		sdisp = format("-0x%x", 256-disp)
+	      elseif disp >= 0 and disp <= 0x7fffffff then
+		sdisp = format("+0x%x", disp)
+	      else
+		sdisp = format("-0x%x", (0xffffffff+1)-disp)
+	      end
+	    else
+	      sdisp = format(ctx.x64 and not ctx.a32 and
+		not (disp >= 0 and disp <= 0x7fffffff)
+		and "0xffffffff%08x" or "0x%08x", disp)
+	    end
+	    pos = pos+dsz
+	  end
+	end
+	if rm and ctx.rexb then rm = rm + 8; ctx.rexb = false end
+	if ctx.rexr then sp = sp + 8; ctx.rexr = false end
+      end
+      -- Effective address (or plain register for mode 3).
+      if p == "m" then
+	if mode == 3 then x = regs[rm+1]
+	else
+	  local aregs = ctx.a32 and map_regs.D or ctx.aregs
+	  local srm, srx = "", ""
+	  if rm then srm = aregs[rm+1]
+	  elseif not sc and ctx.x64 and not ctx.a32 then srm = "rip" end
+	  ctx.a32 = false
+	  if rx then
+	    if rm then srm = srm.."+" end
+	    srx = aregs[rx+1]
+	    if sc > 0 then srx = srx.."*"..(2^sc) end
+	  end
+	  x = format("[%s%s%s]", srm, srx, sdisp)
+	end
+	if mode < 3 and
+	   (not match(pat, "[aRrgp]") or match(pat, "t")) then -- Yuck.
+	  x = map_sz2prefix[sz].." "..x
+	end
+      elseif p == "r" then x = regs[sp+1]
+      elseif p == "g" then x = map_segregs[sp+1]
+      elseif p == "p" then -- Suppress prefix.
+      elseif p == "f" then x = "st"..rm
+      elseif p == "x" then
+	if sp == 0 and ctx.lock and not ctx.x64 then
+	  x = "CR8"; ctx.lock = false
+	else
+	  x = "CR"..sp
+	end
+      elseif p == "y" then x = "DR"..sp
+      elseif p == "z" then x = "TR"..sp
+      elseif p == "t" then
+      else
+	error("bad pattern `"..pat.."'")
+      end
+    end
+    if x then operands = operands and operands..", "..x or x end
+  end
+  ctx.pos = pos
+  return putop(ctx, name, operands)
+end
+
+-- Forward declaration.
+local map_act
+
+-- Fetch the ModRM byte and cache it in ctx.mrm.
+-- Returns nil if the code ends before the ModRM byte.
+local function getmrm(ctx)
+  if ctx.mrm then return ctx.mrm end
+  local pos = ctx.pos
+  if pos > ctx.stop then return nil end
+  local mrm = byte(ctx.code, pos, pos)
+  ctx.mrm = mrm
+  ctx.pos = pos+1
+  return mrm
+end
+
+-- Dispatch to handler depending on pattern.
+-- Resolves "|"-separated MMX/SSE variants (selected by the rep/repne/
+-- o16 prefixes) and "$"-separated reg/mem variants (selected by the
+-- ModRM mode), then hands off to the action handler in map_act.
+local function dispatch(ctx, opat, patgrp)
+  if not opat then return unknown(ctx) end
+  if match(opat, "%|") then -- MMX/SSE variants depending on prefix.
+    local p
+    if ctx.rep then
+      p = ctx.rep=="rep" and "%|([^%|]*)" or "%|[^%|]*%|[^%|]*%|([^%|]*)"
+      ctx.rep = false
+    elseif ctx.o16 then p = "%|[^%|]*%|([^%|]*)"; ctx.o16 = false
+    else p = "^[^%|]*" end
+    opat = match(opat, p)
+    if not opat then return unknown(ctx) end
+--    ctx.rep = false; ctx.o16 = false
+    --XXX fails for 66 f2 0f 38 f1 06  crc32 eax,WORD PTR [esi]
+    --XXX remove in branches?
+  end
+  if match(opat, "%$") then -- reg$mem variants.
+    local mrm = getmrm(ctx); if not mrm then return incomplete(ctx) end
+    opat = match(opat, mrm >= 192 and "^[^%$]*" or "%$(.*)")
+    if opat == "" then return unknown(ctx) end
+  end
+  if opat == "" then return unknown(ctx) end
+  -- Split into opcode name and operand pattern; a group pattern
+  -- (patgrp) applies when the group member has no pattern of its own.
+  local name, pat = match(opat, "^([a-z0-9 ]*)(.*)")
+  if pat == "" and patgrp then pat = patgrp end
+  return map_act[sub(pat, 1, 1)](ctx, name, pat)
+end
+
+-- Look up the pattern for the next opcode byte and dispatch it.
+local function dispatchmap(ctx, opcmap)
+  local pos = ctx.pos
+  ctx.pos = pos + 1
+  return dispatch(ctx, opcmap[byte(ctx.code, pos, pos)])
+end
+
+-- Map for action codes. The key is the first char after the name.
+map_act = {
+  -- Simple opcodes without operands.
+  [""] = function(ctx, name, pat)
+    return putop(ctx, name)
+  end,
+
+  -- Operand size chars fall right through.
+  B = putpat, W = putpat, D = putpat, Q = putpat,
+  V = putpat, U = putpat, T = putpat,
+  M = putpat, X = putpat, P = putpat,
+  F = putpat, G = putpat,
+
+  -- Collect prefixes. The prefix name is stored on the context under
+  -- the key following the ":" (or under the name itself for "name:").
+  [":"] = function(ctx, name, pat)
+    ctx[pat == ":" and name or sub(pat, 2)] = name
+    if ctx.pos - ctx.start > 5 then return unknown(ctx) end -- Limit #prefixes.
+  end,
+
+  -- Chain to special handler specified by name.
+  ["*"] = function(ctx, name, pat)
+    return map_act[name](ctx, name, sub(pat, 2))
+  end,
+
+  -- Use named subtable for opcode group.
+  -- sp (bits 3..5 of the ModRM byte) selects the group member.
+  ["!"] = function(ctx, name, pat)
+    local mrm = getmrm(ctx); if not mrm then return incomplete(ctx) end
+    return dispatch(ctx, map_opcgroup[name][((mrm-(mrm%8))/8)%8+1], sub(pat, 2))
+  end,
+
+  -- o16,o32[,o64] variants.
+  sz = function(ctx, name, pat)
+    if ctx.o16 then ctx.o16 = false
+    else
+      pat = match(pat, ",(.*)")
+      if ctx.rexw then
+	local p = match(pat, ",(.*)")
+	if p then pat = p; ctx.rexw = false end
+      end
+    end
+    pat = match(pat, "^[^,]*")
+    return dispatch(ctx, pat)
+  end,
+
+  -- Two-byte opcode dispatch.
+  opc2 = function(ctx, name, pat)
+    return dispatchmap(ctx, map_opc2)
+  end,
+
+  -- Three-byte opcode dispatch. pat names the map_opc3 subtable.
+  opc3 = function(ctx, name, pat)
+    return dispatchmap(ctx, map_opc3[pat])
+  end,
+
+  -- VMX/SVM dispatch on the cached ModRM byte.
+  vm = function(ctx, name, pat)
+    return dispatch(ctx, map_opcvm[ctx.mrm])
+  end,
+
+  -- Floating point opcode dispatch. See map_opcfp for the index layout.
+  fp = function(ctx, name, pat)
+    local mrm = getmrm(ctx); if not mrm then return incomplete(ctx) end
+    local rm = mrm%8
+    local idx = pat*8 + ((mrm-rm)/8)%8
+    if mrm >= 192 then idx = idx + 64 end
+    local opat = map_opcfp[idx]
+    if type(opat) == "table" then opat = opat[rm+1] end
+    return dispatch(ctx, opat)
+  end,
+
+  -- REX prefix. pat lists the flag chars (w/r/x/b) to set.
+  rex = function(ctx, name, pat)
+    if ctx.rex then return unknown(ctx) end -- Only 1 REX prefix allowed.
+    for p in gmatch(pat, ".") do ctx["rex"..p] = true end
+    ctx.rex = true
+  end,
+
+  -- Special case for nop with REX prefix.
+  nop = function(ctx, name, pat)
+    return dispatch(ctx, ctx.rex and pat or "nop")
+  end,
+}
+
+------------------------------------------------------------------------------
+
+-- Disassemble a block of code, starting at byte offset ofs (default 0)
+-- for len bytes (default: to the end of ctx.code).
+local function disass_block(ctx, ofs, len)
+  ofs = (ofs or 0) + 1
+  local stop = len and (ofs - 1 + len) or #ctx.code
+  ctx.start = ofs
+  ctx.pos = ofs
+  ctx.stop = stop
+  ctx.imm = nil
+  ctx.mrm = false
+  clearprefixes(ctx)
+  while ctx.pos <= stop do dispatchmap(ctx, ctx.map1) end
+  if ctx.pos ~= ctx.start then incomplete(ctx) end
+end
+
+-- Extended API: create a 32 bit disassembler context.
+-- Then call ctx:disass(ofs, len). Output defaults to io.write.
+local function create_(code, addr, out)
+  return {
+    code = code,
+    addr = (addr or 0) - 1,
+    out = out or io.write,
+    symtab = {},
+    disass = disass_block,
+    hexdump = 16,
+    x64 = false,
+    map1 = map_opc1_32,
+    aregs = map_regs.D,
+  }
+end
+
+-- Extended API: create a 64 bit disassembler context.
+local function create64_(code, addr, out)
+  local ctx = create_(code, addr, out)
+  ctx.x64, ctx.map1, ctx.aregs = true, map_opc1_64, map_regs.Q
+  return ctx
+end
+
+-- Simple API: disassemble 32 bit code (a string) at address, output via out.
+local function disass_(code, addr, out)
+  local ctx = create_(code, addr, out)
+  ctx:disass()
+end
+
+-- Simple API: disassemble 64 bit code (a string) at address, output via out.
+local function disass64_(code, addr, out)
+  local ctx = create64_(code, addr, out)
+  ctx:disass()
+end
+
+
+-- Public module functions.
+module(...)
+
+-- Extended API: create a context, then call ctx:disass(ofs, len).
+create = create_
+create64 = create64_
+-- Simple API: one-shot disassembly of a code string.
+disass = disass_
+disass64 = disass64_
+

+ 567 - 0
lib/dump.lua

@@ -0,0 +1,567 @@
+----------------------------------------------------------------------------
+-- LuaJIT compiler dump module.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- Released under the MIT/X license. See Copyright Notice in luajit.h
+----------------------------------------------------------------------------
+--
+-- This module can be used to debug the JIT compiler itself. It dumps the
+-- code representations and structures used in various compiler stages.
+--
+-- Example usage:
+--
+--   luajit -jdump -e "local x=0; for i=1,1e6 do x=x+i end; print(x)"
+--   luajit -jdump=im -e "for i=1,1000 do for j=1,1000 do end end" | less -R
+--   luajit -jdump=is myapp.lua | less -R
+--   luajit -jdump=-b myapp.lua
+--   luajit -jdump=+aH,myapp.html myapp.lua
+--   luajit -jdump=ixT,myapp.dump myapp.lua
+--
+-- The first argument specifies the dump mode. The second argument gives
+-- the output file name. Default output is to stdout, unless the environment
+-- variable LUAJIT_DUMPFILE is set. The file is overwritten every time the
+-- module is started.
+--
+-- Different features can be turned on or off with the dump mode. If the
+-- mode starts with a '+', the following features are added to the default
+-- set of features; a '-' removes them. Otherwise the features are replaced.
+--
+-- The following dump features are available (* marks the default):
+--
+--  * t  Print a line for each started, ended or aborted trace (see also -jv).
+--  * b  Dump the traced bytecode.
+--  * i  Dump the IR (intermediate representation).
+--    r  Augment the IR with register/stack slots.
+--    s  Dump the snapshot map.
+--  * m  Dump the generated machine code.
+--    x  Print each taken trace exit.
+--    X  Print each taken trace exit and the contents of all registers.
+--
+-- The output format can be set with the following characters:
+--
+--    T  Plain text output.
+--    A  ANSI-colored text output.
+--    H  Colorized HTML + CSS output.
+--
+-- The default output format is plain text. It's set to ANSI-colored text
+-- if the COLORTERM variable is set. Note: this is independent of any output
+-- redirection, which is actually considered a feature.
+--
+-- You probably want to use less -R to enjoy viewing ANSI-colored text from
+-- a pipe or a file. Add this to your ~/.bashrc: export LESS="-R"
+--
+------------------------------------------------------------------------------
+
+-- Cache some library functions and objects.
+local jit = require("jit")
+assert(jit.version_num == 20000, "LuaJIT core/library version mismatch")
+local jutil = require("jit.util")
+local vmdef = require("jit.vmdef")
+local funcinfo, funcbc = jutil.funcinfo, jutil.funcbc
+local traceinfo, traceir, tracek = jutil.traceinfo, jutil.traceir, jutil.tracek
+local tracemc, traceexitstub = jutil.tracemc, jutil.traceexitstub
+local tracesnap = jutil.tracesnap
+local bit = require("bit")
+local band, shl, shr = bit.band, bit.lshift, bit.rshift
+local sub, gsub, format = string.sub, string.gsub, string.format
+local byte, char, rep = string.byte, string.char, string.rep
+local type, tostring = type, tostring
+local stdout, stderr = io.stdout, io.stderr
+
+-- Load other modules on-demand.
+local bcline, discreate
+
+-- Active flag, output file handle and dump mode.
+local active, out, dumpmode
+
+------------------------------------------------------------------------------
+
+local symtab = {}
+local nexitsym = 0
+
+-- Fill symbol table with trace exit addresses.
+-- Adds stubs for exits [nexitsym, nexit) on demand; earlier entries are kept.
+local function fillsymtab(nexit)
+  local t = symtab
+  if nexit > nexitsym then
+    for i=nexitsym,nexit-1 do t[traceexitstub(i)] = tostring(i) end
+    nexitsym = nexit
+  end
+  return t
+end
+
+-- Output callback passed to the disassembler: writes to the dump file.
+local function dumpwrite(s)
+  out:write(s)
+end
+
+-- Disassemble the machine code of a trace.
+local function dump_mcode(tr)
+  local info = traceinfo(tr)
+  if not info then return end
+  local mcode, addr, loop = tracemc(tr)
+  if not mcode then return end
+  -- Lazily load the disassembler for the current architecture.
+  if not discreate then
+    discreate = require("jit.dis_"..jit.arch).create
+  end
+  out:write("---- TRACE ", tr, " mcode ", #mcode, "\n")
+  local ctx = discreate(mcode, addr, dumpwrite)
+  ctx.hexdump = 0
+  ctx.symtab = fillsymtab(info.nexit)
+  if loop ~= 0 then
+    -- Temporarily label the loop entry so branch targets show the symbol.
+    symtab[addr+loop] = "LOOP"
+    ctx:disass(0, loop)
+    out:write("->LOOP:\n")
+    ctx:disass(loop, #mcode-loop)
+    symtab[addr+loop] = nil
+  else
+    ctx:disass(0, #mcode)
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Short text names for the IR types, indexed by IR type number.
+local irtype_text = {
+  [0] = "nil",
+  "fal",
+  "tru",
+  "lud",
+  "str",
+  "ptr",
+  "thr",
+  "pro",
+  "fun",
+  "t09",
+  "tab",
+  "udt",
+  "num",
+  "int",
+  "i8 ",
+  "u8 ",
+  "i16",
+  "u16",
+}
+
+-- ANSI color escape format strings per IR type (parallel to irtype_text).
+local colortype_ansi = {
+  [0] = "%s",
+  "%s",
+  "%s",
+  "%s",
+  "\027[32m%s\027[m",
+  "%s",
+  "\027[1m%s\027[m",
+  "%s",
+  "\027[1m%s\027[m",
+  "%s",
+  "\027[31m%s\027[m",
+  "\027[36m%s\027[m",
+  "\027[34m%s\027[m",
+  "\027[35m%s\027[m",
+  "\027[35m%s\027[m",
+  "\027[35m%s\027[m",
+  "\027[35m%s\027[m",
+  "\027[35m%s\027[m",
+}
+
+-- Colorizer for plain text output: pass the string through unchanged.
+local function colorize_text(s, t)
+  return s
+end
+
+-- Colorizer for ANSI output: wrap s in the escape sequence for type t.
+local function colorize_ansi(s, t)
+  return format(colortype_ansi[t], s)
+end
+
+-- Memoized, colorized IR type names for ANSI output.
+local irtype_ansi = setmetatable({},
+  { __index = function(tab, t)
+      local s = colorize_ansi(irtype_text[t], t); tab[t] = s; return s; end })
+
+-- Characters that must be escaped in HTML output.
+local html_escape = { ["<"] = "&lt;", [">"] = "&gt;", ["&"] = "&amp;", }
+
+-- Colorizer for HTML output: escape s and wrap it in a per-type CSS span.
+local function colorize_html(s, t)
+  s = gsub(s, "[<>&]", html_escape)
+  return format('<span class="irt_%s">%s</span>', irtype_text[t], s)
+end
+
+-- Memoized, colorized IR type names for HTML output.
+local irtype_html = setmetatable({},
+  { __index = function(tab, t)
+      local s = colorize_html(irtype_text[t], t); tab[t] = s; return s; end })
+
+-- CSS/style header written once at the start of HTML dumps.
+-- NOTE(review): 'background' is not a standard CSS selector -- looks like
+-- it was meant to be 'body'; confirm before changing the emitted output.
+local header_html = [[
+<style type="text/css">
+background { background: #ffffff; color: #000000; }
+pre.ljdump {
+font-size: 10pt;
+background: #f0f4ff;
+color: #000000;
+border: 1px solid #bfcfff;
+padding: 0.5em;
+margin-left: 2em;
+margin-right: 2em;
+}
+span.irt_str { color: #00a000; }
+span.irt_thr, span.irt_fun { color: #404040; font-weight: bold; }
+span.irt_tab { color: #c00000; }
+span.irt_udt { color: #00c0c0; }
+span.irt_num { color: #0000c0; }
+span.irt_int { color: #c000c0; }
+</style>
+]]
+
+local colorize, irtype
+
+-- Lookup table to convert some literals into names.
+-- Keys are the 6-char padded IR opcode names; values map op2 literal values.
+local litname = {
+  ["SLOAD "] = { [0] = "", "I", "R", "RI", "P", "PI", "PR", "PRI", },
+  ["XLOAD "] = { [0] = "", "unaligned", },
+  ["TOINT "] = { [0] = "check", "index", "", },
+  ["FLOAD "] = vmdef.irfield,
+  ["FREF  "] = vmdef.irfield,
+  ["FPMATH"] = vmdef.irfpm,
+}
+
+-- Escape a control character for display inside a string constant.
+-- Common controls get their symbolic escape; everything else is \ddd.
+-- (Fixed: the original had a duplicate, unreachable `c == "\r"` branch.)
+local function ctlsub(c)
+  if c == "\n" then return "\\n"
+  elseif c == "\r" then return "\\r"
+  elseif c == "\t" then return "\\t"
+  else return format("\\%03d", byte(c))
+  end
+end
+
+-- Format an IR constant (negative IR reference) for display.
+local function formatk(tr, idx)
+  local k, t, slot = tracek(tr, idx)
+  local tn = type(k)
+  local s
+  if tn == "number" then
+    if k == 2^52+2^51 then
+      s = "bias"  -- Special-cased floating-point bias constant.
+    else
+      s = format("%+.14g", k)
+    end
+  elseif tn == "string" then
+    -- Truncate long strings and escape embedded control characters.
+    s = format(#k > 20 and '"%.20s"~' or '"%s"', gsub(k, "%c", ctlsub))
+  elseif tn == "function" then
+    local fi = funcinfo(k)
+    if fi.ffid then
+      s = vmdef.ffnames[fi.ffid]  -- Fast function name.
+    else
+      s = fi.loc  -- Lua function: source location.
+    end
+  elseif tn == "table" then
+    s = format("{%p}", k)
+  elseif tn == "userdata" then
+    if t == 11 then  -- IR type 11 is 'udt' (full userdata).
+      s = format("userdata:%p", k)
+    else
+      -- Light userdata pointer; an all-zero pointer prints as NULL.
+      s = format("[%p]", k)
+      if s == "[0x00000000]" then s = "NULL" end
+    end
+  else
+    s = tostring(k) -- For primitives.
+  end
+  s = colorize(format("%-4s", s), t)
+  if slot then
+    s = format("%s @%d", s, slot)  -- Constant bound to a stack slot.
+  end
+  return s
+end
+
+-- Print the slots of one snapshot map: constants, IR refs, frame separators.
+local function printsnap(tr, snap)
+  for i=1,#snap do
+    local ref = snap[i]
+    if not ref then
+      out:write("---- ")  -- Unused slot.
+    elseif ref < 0 then
+      out:write(formatk(tr, ref), " ")  -- Constant reference.
+    else
+      local m, ot, op1, op2 = traceir(tr, ref)
+      local t = band(ot, 15)
+      local sep = " "
+      if t == 8 then  -- IR type 8 is 'fun': check for a frame link.
+	local oidx = 6*shr(ot, 8)
+	local op = sub(vmdef.irnames, oidx+1, oidx+6)
+	if op == "FRAME " then
+	  sep = "|"  -- Separate stack frames visually.
+	end
+      end
+      out:write(colorize(format("%04d", ref), t), sep)
+    end
+  end
+  out:write("]\n")
+end
+
+-- Dump snapshots (not interleaved with IR).
+local function dump_snap(tr)
+  out:write("---- TRACE ", tr, " snapshots\n")
+  for i=0,1000000000 do  -- Until tracesnap() returns nil.
+    local snap = tracesnap(tr, i)
+    if not snap then break end
+    -- snap[0] is the IR reference where the snapshot applies.
+    out:write(format("#%-3d %04d [ ", i, snap[0]))
+    printsnap(tr, snap)
+  end
+end
+
+-- NYI: should really get the register map from the disassembler.
+-- x86 GPRs (ids 0-7) followed by SSE registers (ids 8-15).
+local reg_map = {
+  [0] = "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
+  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+}
+
+-- Return a register name or stack slot for a rid/sp location.
+-- Low byte is the register id; values > 255 encode a stack slot in
+-- the upper bits (slot index scaled by 4 bytes).
+local function ridsp_name(ridsp)
+  local rid = band(ridsp, 0xff)
+  if ridsp > 255 then return format("[%x]", shr(ridsp, 8)*4) end
+  if rid < 128 then return reg_map[rid] end
+  return ""  -- No register assigned.
+end
+
+-- Dump IR and interleaved snapshots.
+-- dumpsnap: also print snapshot maps. dumpreg: show register/slot column.
+local function dump_ir(tr, dumpsnap, dumpreg)
+  local info = traceinfo(tr)
+  if not info then return end
+  local nins = info.nins
+  out:write("---- TRACE ", tr, " IR\n")
+  local irnames = vmdef.irnames
+  local snapref = 65536  -- Sentinel beyond any valid IR reference.
+  local snap, snapno
+  if dumpsnap then
+    snap = tracesnap(tr, 0)
+    snapref = snap[0]  -- IR ref where the next snapshot applies.
+    snapno = 0
+  end
+  for ins=1,nins do
+    -- Emit any snapshot that applies before this instruction.
+    if ins >= snapref then
+      if dumpreg then
+	out:write(format("....              SNAP   #%-3d [ ", snapno))
+      else
+	out:write(format("....        SNAP   #%-3d [ ", snapno))
+      end
+      printsnap(tr, snap)
+      snapno = snapno + 1
+      snap = tracesnap(tr, snapno)
+      snapref = snap and snap[0] or 65536
+    end
+    local m, ot, op1, op2, ridsp = traceir(tr, ins)
+    -- ot packs the opcode index (high byte) and type/flag bits (low byte).
+    local oidx, t = 6*shr(ot, 8), band(ot, 31)
+    local op = sub(irnames, oidx+1, oidx+6)  -- 6-char padded opcode name.
+    if op == "LOOP  " then
+      if dumpreg then
+	out:write(format("%04d ------------ LOOP ------------\n", ins))
+      else
+	out:write(format("%04d ------ LOOP ------------\n", ins))
+      end
+    elseif op ~= "NOP   " and (dumpreg or op ~= "RENAME") then
+      if dumpreg then
+	out:write(format("%04d %-5s ", ins, ridsp_name(ridsp)))
+      else
+	out:write(format("%04d ", ins))
+      end
+      -- Bits 6/7 of ot select the '>' and '+' markers (guard/PHI flags,
+      -- presumably -- confirm against lj_ir.h).
+      out:write(format("%s%s %s %s ",
+		       band(ot, 64) == 0 and " " or ">",
+		       band(ot, 128) == 0 and " " or "+",
+		       irtype[t], op))
+      local m1 = band(m, 3)
+      if m1 ~= 3 then -- op1 != IRMnone
+	if op1 < 0 then
+	  out:write(formatk(tr, op1))  -- Negative ref: constant.
+	else
+	  out:write(format(m1 == 0 and "%04d" or "#%-3d", op1))
+	end
+	local m2 = band(m, 3*4)
+	if m2 ~= 3*4 then -- op2 != IRMnone
+	  if m2 == 1*4 then -- op2 == IRMlit
+	    local litn = litname[op]
+	    if litn and litn[op2] then
+	      out:write("  ", litn[op2])  -- Symbolic literal name.
+	    else
+	      out:write(format("  #%-3d", op2))
+	    end
+	  elseif op2 < 0 then
+	    out:write("  ", formatk(tr, op2))
+	  else
+	    out:write(format("  %04d", op2))
+	  end
+	end
+      end
+      out:write("\n")
+    end
+  end
+  -- Trailing snapshot after the last instruction.
+  if snap then
+    if dumpreg then
+      out:write(format("....              SNAP   #%-3d [ ", snapno))
+    else
+      out:write(format("....        SNAP   #%-3d [ ", snapno))
+    end
+    printsnap(tr, snap)
+  end
+end
+
+------------------------------------------------------------------------------
+
+local recprefix = ""
+local recdepth = 0
+
+-- Format trace error message.
+-- err: error code (number) or preformatted message (string). info: context
+-- value interpolated into the message; a function becomes its name/location.
+local function fmterr(err, info)
+  if type(err) == "number" then
+    if type(info) == "function" then
+      local fi = funcinfo(info)
+      if fi.ffid then
+	info = vmdef.ffnames[fi.ffid]
+      else
+	info = fi.loc
+      end
+    end
+    err = format(vmdef.traceerr[err], info)
+  end
+  return err
+end
+
+-- Dump trace states.
+-- Handler for the "trace" VM event: what is "start", "stop", "abort" or
+-- another state string; otr/oex carry the parent trace/exit on start and
+-- the error code/info on abort.
+-- (Fixed: "start" assigned the undeclared global `reclevel` instead of
+-- resetting the local `recdepth` used by dump_record.)
+local function dump_trace(what, tr, func, pc, otr, oex)
+  -- Dump the collected data first when a trace ends.
+  if what == "stop" or (what == "abort" and dumpmode.a) then
+    if dumpmode.i then dump_ir(tr, dumpmode.s, dumpmode.r and what == "stop")
+    elseif dumpmode.s then dump_snap(tr) end
+    if dumpmode.m then dump_mcode(tr) end
+  end
+  if what == "start" then
+    if dumpmode.H then out:write('<pre class="ljdump">\n') end
+    out:write("---- TRACE ", tr, " ", what)
+    if otr then out:write(" ", otr, "/", oex) end  -- Parent trace/exit no.
+    local fi = funcinfo(func, pc)
+    out:write(" ", fi.loc, "\n")
+    -- Reset the bytecode recording state used by dump_record.
+    recprefix = ""
+    recdepth = 0
+  elseif what == "stop" or what == "abort" then
+    out:write("---- TRACE ", tr, " ", what)
+    recprefix = nil
+    if what == "abort" then
+      local fi = funcinfo(func, pc)
+      out:write(" ", fi.loc, " -- ", fmterr(otr, oex), "\n")
+    else
+      local link = traceinfo(tr).link
+      if link == tr then
+	link = "loop"  -- Trace links back to itself.
+      elseif link == 0 then
+	link = "interpreter"
+      end
+      out:write(" -> ", link, "\n")
+    end
+    if dumpmode.H then out:write("</pre>\n\n") else out:write("\n") end
+  else
+    out:write("---- TRACE ", what, "\n\n")
+  end
+  out:flush()
+end
+
+-- Dump recorded bytecode.
+-- Handler for the "record" VM event, called once per recorded bytecode.
+local function dump_record(tr, func, pc, depth, callee)
+  if depth ~= recdepth then
+    recdepth = depth
+    recprefix = rep(" .", depth)  -- Indent by call depth.
+  end
+  local line = bcline(func, pc, recprefix)
+  if dumpmode.H then line = gsub(line, "[<>&]", html_escape) end
+  if type(callee) == "function" then
+    -- Annotate call instructions with the callee's name or location.
+    local fi = funcinfo(callee)
+    if fi.ffid then
+      out:write(sub(line, 1, -2), "  ; ", vmdef.ffnames[fi.ffid], "\n")
+    else
+      out:write(sub(line, 1, -2), "  ; ", fi.loc, "\n")
+    end
+  else
+    out:write(line)
+  end
+  if band(funcbc(func, pc), 0xff) < 16 then -- Write JMP for cond. ORDER BC
+    out:write(bcline(func, pc+1, recprefix))
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Dump taken trace exits.
+-- ngpr/nfpr: number of GPR/FPR register values passed in ...;
+-- register contents are only printed in 'X' mode.
+local function dump_texit(tr, ex, ngpr, nfpr, ...)
+  out:write("---- TRACE ", tr, " exit ", ex, "\n")
+  if dumpmode.X then
+    local regs = {...}
+    -- GPRs in hex, 8 per line.
+    for i=1,ngpr do
+      out:write(format(" %08x", regs[i]))
+      if i % 8 == 0 then out:write("\n") end
+    end
+    -- FPRs as numbers, 4 per line.
+    for i=1,nfpr do
+      out:write(format(" %+17.14g", regs[ngpr+i]))
+      if i % 4 == 0 then out:write("\n") end
+    end
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Detach dump handlers.
+local function dumpoff()
+  if active then
+    active = false
+    -- jit.attach without an event name detaches the handler.
+    jit.attach(dump_texit)
+    jit.attach(dump_record)
+    jit.attach(dump_trace)
+    -- Close the output file unless it is a standard stream.
+    if out and out ~= stdout and out ~= stderr then out:close() end
+    out = nil
+  end
+end
+
+-- Open the output file and attach dump handlers.
+-- opt: dump mode string (see module header comment). outfile: output file
+-- name, '-' for stdout; defaults to $LUAJIT_DUMPFILE, then stdout.
+local function dumpon(opt, outfile)
+  if active then dumpoff() end
+
+  -- Extract the output format (T/A/H); default depends on $COLORTERM.
+  local colormode = os.getenv("COLORTERM") and "A" or "T"
+  if opt then
+    opt = gsub(opt, "[TAH]", function(mode) colormode = mode; return ""; end)
+  end
+
+  -- Build the feature set: defaults replaced, or adjusted by a +/- prefix.
+  local m = { t=true, b=true, i=true, m=true, }
+  if opt and opt ~= "" then
+    local o = sub(opt, 1, 1)
+    if o ~= "+" and o ~= "-" then m = {} end
+    for i=1,#opt do m[sub(opt, i, i)] = (o ~= "-") end
+  end
+  dumpmode = m
+
+  -- Attach only the handlers required by the selected features.
+  if m.t or m.b or m.i or m.s or m.m then
+    jit.attach(dump_trace, "trace")
+  end
+  if m.b then
+    jit.attach(dump_record, "record")
+    if not bcline then bcline = require("jit.bc").line end
+  end
+  if m.x or m.X then
+    jit.attach(dump_texit, "texit")
+  end
+
+  if not outfile then outfile = os.getenv("LUAJIT_DUMPFILE") end
+  if outfile then
+    out = outfile == "-" and stdout or assert(io.open(outfile, "w"))
+  else
+    out = stdout
+  end
+
+  -- Record the format in dumpmode (dumpmode.H is checked elsewhere) and
+  -- select the matching colorizer.
+  m[colormode] = true
+  if colormode == "A" then
+    colorize = colorize_ansi
+    irtype = irtype_ansi
+  elseif colormode == "H" then
+    colorize = colorize_html
+    irtype = irtype_html
+    out:write(header_html)
+  else
+    colorize = colorize_text
+    irtype = irtype_text
+  end
+
+  active = true
+end
+
+-- Public module functions.
+module(...)
+
+on = dumpon   -- on(opt, outfile): start dumping.
+off = dumpoff -- off(): stop dumping and close the output file.
+start = dumpon -- For -j command line option.
+

+ 156 - 0
lib/v.lua

@@ -0,0 +1,156 @@
+----------------------------------------------------------------------------
+-- Verbose mode of the LuaJIT compiler.
+--
+-- Copyright (C) 2005-2009 Mike Pall. All rights reserved.
+-- Released under the MIT/X license. See Copyright Notice in luajit.h
+----------------------------------------------------------------------------
+--
+-- This module shows verbose information about the progress of the
+-- JIT compiler. It prints one line for each generated trace. This module
+-- is useful to see which code has been compiled or where the compiler
+-- punts and falls back to the interpreter.
+--
+-- Example usage:
+--
+--   luajit -jv -e "for i=1,1000 do for j=1,1000 do end end"
+--   luajit -jv=myapp.out myapp.lua
+--
+-- Default output is to stderr. To redirect the output to a file, pass a
+-- filename as an argument (use '-' for stdout) or set the environment
+-- variable LUAJIT_VERBOSEFILE. The file is overwritten every time the
+-- module is started.
+--
+-- The output from the first example should look like this:
+--
+-- [TRACE   1 (command line):1]
+-- [TRACE   2 (1/3) (command line):1 -> 1]
+--
+-- The first number in each line is the internal trace number. Next are
+-- the file name ('(command line)') and the line number (':1') where the
+-- trace has started. Side traces also show the parent trace number and
+-- the exit number where they are attached to in parentheses ('(1/3)').
+-- An arrow at the end shows where the trace links to ('-> 1'), unless
+-- it loops to itself.
+--
+-- In this case the inner loop gets hot and is traced first, generating
+-- a root trace. Then the last exit from the 1st trace gets hot, too,
+-- and triggers generation of the 2nd trace. The side trace follows the
+-- path along the outer loop and *around* the inner loop, back to its
+-- start, and then links to the 1st trace. Yes, this may seem unusual,
+-- if you know how traditional compilers work. Trace compilers are full
+-- of surprises like this -- have fun! :-)
+--
+-- Aborted traces are shown like this:
+--
+-- [TRACE --- foo.lua:44 -- leaving loop in root trace at foo.lua:50]
+--
+-- Don't worry -- trace aborts are quite common, even in programs which
+-- can be fully compiled. The compiler may retry several times until it
+-- finds a suitable trace.
+--
+-- Of course this doesn't work with features that are not-yet-implemented
+-- (NYI error messages). The VM simply falls back to the interpreter. This
+-- may not matter at all if the particular trace is not very high up in
+-- the CPU usage profile. Oh, and the interpreter is quite fast, too.
+--
+-- Also check out the -jdump module, which prints all the gory details.
+--
+------------------------------------------------------------------------------
+
+-- Cache some library functions and objects.
+local jit = require("jit")
+assert(jit.version_num == 20000, "LuaJIT core/library version mismatch")
+local jutil = require("jit.util")
+local vmdef = require("jit.vmdef")
+local funcinfo, traceinfo = jutil.funcinfo, jutil.traceinfo
+local type, format = type, string.format
+local stdout, stderr = io.stdout, io.stderr
+
+-- Active flag and output file handle.
+local active, out
+
+------------------------------------------------------------------------------
+
+local startloc, startex
+
+-- Format trace error message.
+-- err: error code (number) or preformatted message (string). info: context
+-- value interpolated into the message; a function becomes its name/location.
+local function fmterr(err, info)
+  if type(err) == "number" then
+    if type(info) == "function" then
+      local fi = funcinfo(info)
+      if fi.ffid then
+	info = vmdef.ffnames[fi.ffid]
+      else
+	info = fi.loc
+      end
+    end
+    err = format(vmdef.traceerr[err], info)
+  end
+  return err
+end
+
+-- Dump trace states.
+-- Prints one line per finished or aborted trace. The start location and
+-- parent/exit prefix are remembered on "start" and used by the later
+-- "stop"/"abort" event for the same trace.
+local function dump_trace(what, tr, func, pc, otr, oex)
+  if what == "start" then
+    startloc = funcinfo(func, pc).loc
+    startex = otr and "("..otr.."/"..oex..") " or ""  -- Parent/exit prefix.
+  else
+    if what == "abort" then
+      local loc = funcinfo(func, pc).loc
+      -- Show the abort location only when it differs from the start.
+      if loc ~= startloc then
+	out:write(format("[TRACE --- %s%s -- %s at %s]\n",
+	  startex, startloc, fmterr(otr, oex), loc))
+      else
+	out:write(format("[TRACE --- %s%s -- %s]\n",
+	  startex, startloc, fmterr(otr, oex)))
+      end
+    elseif what == "stop" then
+      local link = traceinfo(tr).link
+      if link == 0 then
+	out:write(format("[TRACE %3s %s%s -- fallback to interpreter]\n",
+	  tr, startex, startloc))
+      elseif link == tr then
+	-- Trace loops to itself: no arrow.
+	out:write(format("[TRACE %3s %s%s]\n", tr, startex, startloc))
+      else
+	out:write(format("[TRACE %3s %s%s -> %d]\n",
+	  tr, startex, startloc, link))
+      end
+    else
+      out:write(format("[TRACE %s]\n", what))  -- Other states, e.g. flush.
+    end
+    out:flush()
+  end
+end
+
+------------------------------------------------------------------------------
+
+-- Detach dump handlers.
+local function dumpoff()
+  if active then
+    active = false
+    -- jit.attach without an event name detaches the handler.
+    jit.attach(dump_trace)
+    -- Close the output file unless it is a standard stream.
+    if out and out ~= stdout and out ~= stderr then out:close() end
+    out = nil
+  end
+end
+
+-- Open the output file and attach dump handlers.
+-- outfile: output file name, '-' for stdout; defaults to
+-- $LUAJIT_VERBOSEFILE, then stderr.
+local function dumpon(outfile)
+  if active then dumpoff() end
+  if not outfile then outfile = os.getenv("LUAJIT_VERBOSEFILE") end
+  if outfile then
+    out = outfile == "-" and stdout or assert(io.open(outfile, "w"))
+  else
+    out = stderr
+  end
+  jit.attach(dump_trace, "trace")
+  active = true
+end
+
+-- Public module functions.
+module(...)
+
+on = dumpon   -- on(outfile): start verbose trace output.
+off = dumpoff -- off(): stop verbose output and close the output file.
+start = dumpon -- For -j command line option.
+

+ 8 - 0
src/.gitignore

@@ -0,0 +1,8 @@
+luajit
+buildvm
+buildvm_*.h
+lj_ffdef.h
+lj_libdef.h
+lj_recdef.h
+lj_folddef.h
+lj_vm.s

+ 326 - 0
src/Makefile

@@ -0,0 +1,326 @@
+##############################################################################
+# LuaJIT Makefile. Requires GNU Make.
+#
+# Suitable for POSIX platforms (Linux, *BSD, OSX etc.).
+# Also works with MinGW and Cygwin on Windows.
+# Please check msvcbuild.bat for building with MSVC on Windows.
+#
+# Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+##############################################################################
+
+##############################################################################
+# Compiler options: change them as needed. This mainly affects the speed of
+# the JIT compiler itself, not the speed of the JIT compiled code.
+# Turn any of the optional settings on by removing the '#' in front of them.
+#
+# Note: LuaJIT can only be compiled for x86, and not for x64 (yet)!
+#       In the meantime, the x86 binary runs fine under an x64 OS.
+#
+# It's recommended to compile at least for i686. By default the assembler part
+# of the interpreter makes use of CMOV/FCOMI*/FUCOMI* instructions, anyway.
+CC= gcc -m32 -march=i686
+# Use this for GCC 4.2 or higher if you don't intend to distribute the
+# binaries to a different machine:
+#CC= gcc -m32 -march=native
+#
+# Since the assembler part does NOT maintain a frame pointer, it's pointless
+# to slow down the C part by not omitting it. Debugging and tracebacks are
+# not affected -- the assembler part has frame unwind information and GCC
+# emits it with -g (see CCDEBUG below).
+CCOPT= -O2 -fomit-frame-pointer
+# Use this if you want to generate a smaller binary (but it's slower):
+#CCOPT= -Os -fomit-frame-pointer
+# Note: it's no longer recommended to use -O3 with GCC 4.x.
+# The I-Cache bloat usually outweighs the benefits from aggressive inlining.
+#
+CCDEBUG=
+# Uncomment the next line to generate debug information:
+#CCDEBUG= -g
+#
+CCWARN= -Wall
+# Uncomment the next line to enable more warnings:
+#CCWARN+= -Wextra -Wdeclaration-after-statement -Wredundant-decls -Wshadow -Wpointer-arith
+#
+##############################################################################
+
+##############################################################################
+# Compile time definitions: change them as needed, but make sure you force
+# a full recompile with "make clean", followed by "make".
+# Note that most of these are NOT suitable for benchmarking or release mode!
+XCFLAGS=
+#
+# Disable the use of CMOV and FCOMI*/FUCOMI* instructions in the interpreter.
+# This is only necessary if you intend to run the code on REALLY ANCIENT CPUs
+# (before Pentium Pro, or on the VIA C3). This generally slows down the
+# interpreter. Don't bother if your OS wouldn't run on them, anyway.
+#XCFLAGS+= -DLUAJIT_CPU_NOCMOV
+#
+# Disable the JIT compiler, i.e. turn LuaJIT into a pure interpreter:
+#XCFLAGS+= -DLUAJIT_DISABLE_JIT
+#
+# Use the system provided memory allocator (realloc) instead of the
+# bundled memory allocator. This is slower, but sometimes helpful for
+# debugging. It's mandatory for Valgrind's memcheck tool, too.
+#XCFLAGS+= -DLUAJIT_USE_SYSMALLOC
+#
+# This define is required to run LuaJIT under Valgrind. The Valgrind
+# header files must be installed. You should enable debug information, too.
+#XCFLAGS+= -DLUAJIT_USE_VALGRIND
+#
+# This is the client for the GDB JIT API. GDB 7.0 or higher is required
+# to make use of it. See lj_gdbjit.c for details. Enabling this causes
+# a non-negligible overhead, even when not running under GDB.
+#XCFLAGS+= -DLUAJIT_USE_GDBJIT
+#
+# Turn on assertions for the Lua/C API to debug problems with lua_* calls.
+# This is rather slow -- use only while developing C libraries/embeddings.
+#XCFLAGS+= -DLUA_USE_APICHECK
+#
+# Turn on assertions for the whole LuaJIT VM. This significantly slows down
+# everything. Use only if you suspect a problem with LuaJIT itself.
+#XCFLAGS+= -DLUA_USE_ASSERT
+#
+##############################################################################
+# You probably don't need to change anything below this line.
+##############################################################################
+
+CCOPTIONS= $(CCDEBUG) $(CCOPT) $(CCWARN) $(CFLAGS) $(XCFLAGS)
+LDOPTIONS= $(CCDEBUG) $(LDFLAGS)
+
+HOST_CC= $(CC)
+HOST_RM= rm -f
+HOST_XCFLAGS=
+HOST_XLDFLAGS=
+HOST_XLIBS=
+
+TARGET_CC= $(CC)
+TARGET_STRIP= strip
+TARGET_XCFLAGS= -D_FILE_OFFSET_BITS=64
+TARGET_XLDFLAGS=
+TARGET_XSHLDFLAGS= -shared
+TARGET_XLIBS=
+TARGET_ARCH= $(patsubst %,-DLUAJIT_TARGET=LUAJIT_ARCH_%,$(TARGET))
+TARGET_DISABLE= -U_FORTIFY_SOURCE
+ifneq (,$(findstring stack-protector,$(shell $(CC) -dumpspecs)))
+  TARGET_DISABLE+= -fno-stack-protector
+endif
+
+ifneq (,$(findstring Windows,$(OS)))
+  TARGET_SYS= Windows
+else
+  TARGET_SYS:= $(shell uname -s)
+  ifneq (,$(findstring CYGWIN,$(TARGET_SYS)))
+    TARGET_SYS= Windows
+  endif
+endif
+
+ifeq (Linux,$(TARGET_SYS))
+  TARGET_XLIBS= -ldl
+  TARGET_XLDFLAGS= -Wl,-E
+else
+ifeq (Windows,$(TARGET_SYS))
+  HOST_RM= del
+  TARGET_STRIP= strip --strip-unneeded
+else
+ifeq (Darwin,$(TARGET_SYS))
+  TARGET_XSHLDFLAGS= -dynamiclib -single_module -undefined dynamic_lookup
+  TARGET_STRIP= strip -x
+  export MACOSX_DEPLOYMENT_TARGET=10.3
+else
+  TARGET_XLDFLAGS= -Wl,-E
+endif
+endif
+endif
+
+# NOTE: The LuaJIT distribution comes with a pre-generated buildvm_*.h.
+# You DO NOT NEED an installed copy of (plain) Lua 5.1 to run DynASM unless
+# you want to MODIFY the corresponding *.dasc file. You can also use LuaJIT
+# itself (bootstrapped from the pre-generated file) to run DynASM of course.
+DASM_LUA= lua
+
+Q= @
+E= @echo
+#Q=
+#E= @:
+
+##############################################################################
+
+TARGET_CFLAGS= $(CCOPTIONS) $(TARGET_DISABLE) $(TARGET_XCFLAGS)
+TARGET_LDFLAGS= $(LDOPTIONS) $(TARGET_XLDFLAGS)
+TARGET_SHLDFLAGS= $(LDOPTIONS) $(TARGET_XSHLDFLAGS)
+TARGET_LIBS= -lm $(TARGET_XLIBS)
+ifneq (,$(CCDEBUG))
+  TARGET_STRIP= @:
+endif
+
+HOST_CFLAGS= $(CCOPTIONS) $(HOST_XCFLAGS) $(TARGET_ARCH)
+HOST_LDFLAGS= $(LDOPTIONS) $(HOST_XLDFLAGS)
+HOST_LIBS= $(HOST_XLIBS)
+
+DASM_DIR= ../dynasm
+DASM= $(DASM_LUA) $(DASM_DIR)/dynasm.lua
+DASM_FLAGS=
+DASM_DISTFLAGS= -LN
+
+BUILDVM_O= buildvm.o buildvm_asm.o buildvm_peobj.o buildvm_lib.o buildvm_fold.o
+BUILDVM_T= buildvm
+
+HOST_O= $(BUILDVM_O)
+HOST_T= $(BUILDVM_T)
+
+LJVM_S= lj_vm.s
+LJVM_O= lj_vm.o
+LJVM_BOUT= $(LJVM_S)
+LJVM_MODE= asm
+
+LJLIB_O= lib_base.o lib_math.o lib_bit.o lib_string.o lib_table.o \
+	 lib_io.o lib_os.o lib_package.o lib_debug.o lib_jit.o
+LJLIB_C= $(LJLIB_O:.o=.c)
+
+LJCORE_O= lj_gc.o lj_err.o lj_ctype.o lj_bc.o lj_obj.o \
+	  lj_str.o lj_tab.o lj_func.o lj_udata.o lj_meta.o \
+	  lj_state.o lj_dispatch.o lj_vmevent.o lj_api.o \
+	  lj_lex.o lj_parse.o \
+	  lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \
+	  lj_opt_dce.o lj_opt_loop.o \
+	  lj_mcode.o lj_snap.o lj_record.o lj_asm.o lj_trace.o lj_gdbjit.o \
+	  lj_lib.o lj_alloc.o lib_aux.o \
+	  $(LJLIB_O) lib_init.o
+
+LJVMCORE_O= $(LJVM_O) $(LJCORE_O)
+
+# NYI: Need complete support for building as a shared library on POSIX.
+#      This is currently *only* suitable for MinGW and Cygwin, see below.
+LUAJIT_O= luajit.o
+LUAJIT_SO= luajit.so
+LUAJIT_T= luajit
+
+LIB_VMDEF= ../lib/vmdef.lua
+
+TARGET_DEP= $(LIB_VMDEF)
+TARGET_O= $(LJVMCORE_O) $(LUAJIT_O)
+TARGET_T= $(LUAJIT_T)
+
+ALL_GEN= $(LJVM_S) lj_ffdef.h lj_libdef.h lj_recdef.h $(LIB_VMDEF) lj_folddef.h
+ALL_DYNGEN= buildvm_x86.h
+WIN_RM= *.obj *.lib *.exp *.dll *.exe *.manifest
+ALL_RM= $(LUAJIT_T) $(LUAJIT_SO) $(HOST_T) $(ALL_GEN) *.o $(WIN_RM)
+
+ifeq (Windows,$(TARGET_SYS))
+  LJVM_BOUT= $(LJVM_O)
+  LJVM_MODE= peobj
+  LIB_VMDEF= ..\lib\vmdef.lua
+  # Imported symbols are bound to a specific DLL name under Windows.
+  LUAJIT_SO= lua51.dll
+  LUAJIT_T= luajit.exe
+  BUILDVM_T= buildvm.exe
+  #
+  # You can comment out the following two lines to build a static executable.
+  # But then you won't be able to dynamically load any C modules, because
+  # they bind to lua51.dll.
+  #
+  TARGET_XCFLAGS+= -DLUA_BUILD_AS_DLL
+  TARGET_O= $(LUAJIT_SO) $(LUAJIT_O)
+endif
+
+##############################################################################
+
+default: $(TARGET_T)
+
+all:	$(TARGET_T)
+
+amalg:
+	@grep "^[+|]" ljamalg.c
+	$(MAKE) all "LJCORE_O=ljamalg.o"
+
+MAKE_TARGETS= amalg
+
+##############################################################################
+
+buildvm_x86.h: buildvm_x86.dasc
+	$(E) "DYNASM    $@"
+	$(Q)$(DASM) $(DASM_FLAGS) -o $@ buildvm_x86.dasc
+
+$(BUILDVM_T): $(BUILDVM_O)
+	$(E) "HOSTLINK  $@"
+	$(Q)$(HOST_CC) $(HOST_LDFLAGS) -o $@ $(BUILDVM_O) $(HOST_LIBS)
+
+$(LJVM_BOUT): $(BUILDVM_T)
+	$(E) "BUILDVM   $@"
+	$(Q)./$(BUILDVM_T) -m $(LJVM_MODE) -o $@
+
+lj_ffdef.h: $(BUILDVM_T) $(LJLIB_C)
+	$(E) "BUILDVM   $@"
+	$(Q)./$(BUILDVM_T) -m ffdef -o $@ $(LJLIB_C)
+
+lj_libdef.h: $(BUILDVM_T) $(LJLIB_C)
+	$(E) "BUILDVM   $@"
+	$(Q)./$(BUILDVM_T) -m libdef -o $@ $(LJLIB_C)
+
+lj_recdef.h: $(BUILDVM_T) $(LJLIB_C)
+	$(E) "BUILDVM   $@"
+	$(Q)./$(BUILDVM_T) -m recdef -o $@ $(LJLIB_C)
+
+$(LIB_VMDEF): $(BUILDVM_T) $(LJLIB_C)
+	$(E) "BUILDVM   $@"
+	$(Q)./$(BUILDVM_T) -m vmdef -o $@ $(LJLIB_C)
+
+lj_folddef.h: $(BUILDVM_T) lj_opt_fold.c
+	$(E) "BUILDVM   $@"
+	$(Q)./$(BUILDVM_T) -m folddef -o $@ lj_opt_fold.c
+
+$(LUAJIT_SO): $(LJVMCORE_O)
+	$(E) "LINK      $@"
+	$(Q)$(TARGET_CC) $(TARGET_SHLDFLAGS) -o $@ $(LJVMCORE_O) $(TARGET_LIBS)
+	$(Q)$(TARGET_STRIP) $@
+
+$(LUAJIT_T): $(TARGET_O) $(TARGET_DEP)
+	$(E) "LINK      $@"
+	$(Q)$(TARGET_CC) $(TARGET_LDFLAGS) -o $@ $(TARGET_O) $(TARGET_LIBS)
+	$(Q)$(TARGET_STRIP) $@
+	$(E) "OK        Successfully built LuaJIT"
+
+##############################################################################
+
+%.o: %.c
+	$(E) "CC        $@"
+	$(Q)$(TARGET_CC) $(TARGET_CFLAGS) -c -o $@ $<
+
+%.o: %.s
+	$(E) "ASM       $@"
+	$(Q)$(TARGET_CC) $(TARGET_CFLAGS) -c -o $@ $<
+
+$(HOST_O): %.o: %.c
+	$(E) "HOSTCC    $@"
+	$(Q)$(HOST_CC) $(HOST_CFLAGS) -c -o $@ $<
+
+include Makefile.dep
+
+##############################################################################
+
+clean:
+	$(HOST_RM) $(ALL_RM)
+
+cleaner:	clean
+	$(HOST_RM) $(ALL_DYNGEN)
+
+distclean:	clean
+	$(E) "DYNASM    $@"
+	$(Q)$(DASM) $(DASM_DISTFLAGS) -o buildvm_x86.h buildvm_x86.dasc
+
+depend:
+	@test -f lj_ffdef.h || touch lj_ffdef.h
+	@test -f lj_libdef.h || touch lj_libdef.h
+	@test -f lj_recdef.h || touch lj_recdef.h
+	@test -f lj_folddef.h || touch lj_folddef.h
+	@test -f buildvm_x86.h || touch buildvm_x86.h
+	@$(HOST_CC) $(HOST_CFLAGS) -MM *.c | sed "s|$(DASM_DIR)|\$$(DASM_DIR)|g" >Makefile.dep
+	@test -s lj_ffdef.h || $(HOST_RM) lj_ffdef.h
+	@test -s lj_libdef.h || $(HOST_RM) lj_libdef.h
+	@test -s lj_recdef.h || $(HOST_RM) lj_recdef.h
+	@test -s lj_folddef.h || $(HOST_RM) lj_folddef.h
+	@test -s buildvm_x86.h || $(HOST_RM) buildvm_x86.h
+
+.PHONY: default all $(MAKE_TARGETS) clean cleaner distclean depend
+
+##############################################################################

+ 139 - 0
src/Makefile.dep

@@ -0,0 +1,139 @@
+buildvm.o: buildvm.c lua.h luaconf.h luajit.h lj_obj.h lj_def.h lj_arch.h \
+  lj_gc.h lj_bc.h lj_ir.h lj_frame.h lj_dispatch.h lj_jit.h lj_target.h \
+  lj_target_x86.h buildvm.h $(DASM_DIR)/dasm_proto.h $(DASM_DIR)/dasm_x86.h \
+  buildvm_x86.h lj_traceerr.h
+buildvm_asm.o: buildvm_asm.c buildvm.h lj_def.h lua.h luaconf.h lj_arch.h \
+  lj_bc.h
+buildvm_fold.o: buildvm_fold.c lj_obj.h lua.h luaconf.h lj_def.h \
+  lj_arch.h lj_ir.h buildvm.h
+buildvm_lib.o: buildvm_lib.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_lib.h buildvm.h
+buildvm_peobj.o: buildvm_peobj.c buildvm.h lj_def.h lua.h luaconf.h \
+  lj_arch.h lj_bc.h
+lib_aux.o: lib_aux.c lua.h luaconf.h lauxlib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_alloc.h
+lib_base.o: lib_base.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h \
+  lj_meta.h lj_state.h lj_ff.h lj_ffdef.h lj_ctype.h lj_lib.h lj_libdef.h
+lib_bit.o: lib_bit.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_err.h lj_errmsg.h lj_str.h lj_lib.h lj_libdef.h
+lib_debug.o: lib_debug.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_libdef.h
+lib_init.o: lib_init.c lua.h luaconf.h lauxlib.h lualib.h
+lib_io.o: lib_io.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_err.h lj_errmsg.h lj_gc.h lj_ff.h lj_ffdef.h lj_lib.h \
+  lj_libdef.h
+lib_jit.o: lib_jit.c lua.h luaconf.h lauxlib.h lualib.h lj_arch.h \
+  lj_obj.h lj_def.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_ir.h \
+  lj_jit.h lj_iropt.h lj_dispatch.h lj_bc.h lj_vm.h lj_vmevent.h lj_lib.h \
+  luajit.h lj_libdef.h
+lib_math.o: lib_math.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_lib.h lj_libdef.h
+lib_os.o: lib_os.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
+  lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_libdef.h
+lib_package.o: lib_package.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_err.h lj_errmsg.h lj_lib.h
+lib_string.o: lib_string.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_state.h \
+  lj_ff.h lj_ffdef.h lj_ctype.h lj_lib.h lj_libdef.h
+lib_table.o: lib_table.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+  lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_tab.h lj_lib.h \
+  lj_libdef.h
+lj_alloc.o: lj_alloc.c lj_def.h lua.h luaconf.h lj_arch.h lj_alloc.h
+lj_api.o: lj_api.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_udata.h lj_meta.h \
+  lj_state.h lj_frame.h lj_bc.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h \
+  lj_traceerr.h lj_vm.h lj_lex.h lj_parse.h
+lj_asm.o: lj_asm.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_str.h lj_tab.h lj_ir.h lj_jit.h lj_iropt.h lj_mcode.h lj_trace.h \
+  lj_dispatch.h lj_bc.h lj_traceerr.h lj_snap.h lj_asm.h lj_vm.h \
+  lj_target.h lj_target_x86.h
+lj_bc.o: lj_bc.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_bc.h
+lj_ctype.o: lj_ctype.c lj_ctype.h lj_def.h lua.h luaconf.h
+lj_dispatch.o: lj_dispatch.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_err.h lj_errmsg.h lj_state.h lj_frame.h lj_bc.h lj_jit.h lj_ir.h \
+  lj_trace.h lj_dispatch.h lj_traceerr.h lj_vm.h luajit.h
+lj_err.o: lj_err.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_err.h \
+  lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h lj_frame.h lj_bc.h \
+  lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h lj_traceerr.h lj_vm.h
+lj_func.o: lj_func.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_func.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h lj_bc.h \
+  lj_traceerr.h lj_vm.h
+lj_gc.o: lj_gc.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_udata.h lj_meta.h \
+  lj_state.h lj_frame.h lj_bc.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h \
+  lj_traceerr.h lj_vm.h
+lj_gdbjit.o: lj_gdbjit.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_frame.h lj_bc.h lj_jit.h \
+  lj_ir.h lj_dispatch.h
+lj_ir.o: lj_ir.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_str.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h lj_dispatch.h lj_bc.h \
+  lj_traceerr.h
+lj_lex.o: lj_lex.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_str.h lj_lex.h lj_parse.h lj_ctype.h
+lj_lib.o: lj_lib.c lauxlib.h lua.h luaconf.h lj_obj.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_vm.h \
+  lj_lib.h
+lj_mcode.o: lj_mcode.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_jit.h lj_ir.h lj_mcode.h lj_trace.h lj_dispatch.h lj_bc.h \
+  lj_traceerr.h
+lj_meta.o: lj_meta.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_meta.h lj_bc.h lj_vm.h
+lj_obj.o: lj_obj.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h
+lj_opt_dce.o: lj_opt_dce.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_ir.h lj_jit.h lj_iropt.h
+lj_opt_fold.o: lj_opt_fold.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_str.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h lj_dispatch.h lj_bc.h \
+  lj_traceerr.h lj_vm.h lj_folddef.h
+lj_opt_loop.o: lj_opt_loop.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_ir.h lj_jit.h lj_iropt.h \
+  lj_trace.h lj_dispatch.h lj_bc.h lj_traceerr.h lj_snap.h lj_vm.h
+lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_tab.h lj_ir.h lj_jit.h lj_iropt.h
+lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \
+  lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
+  lj_dispatch.h lj_traceerr.h
+lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \
+  lj_bc.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h
+lj_record.o: lj_record.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_state.h lj_frame.h \
+  lj_bc.h lj_ff.h lj_ffdef.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
+  lj_dispatch.h lj_traceerr.h lj_record.h lj_snap.h lj_asm.h lj_vm.h \
+  lj_recdef.h
+lj_snap.o: lj_snap.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
+  lj_dispatch.h lj_traceerr.h lj_snap.h lj_target.h lj_target_x86.h
+lj_state.o: lj_state.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_meta.h \
+  lj_state.h lj_frame.h lj_bc.h lj_trace.h lj_jit.h lj_ir.h lj_dispatch.h \
+  lj_traceerr.h lj_vm.h lj_lex.h lj_alloc.h
+lj_str.o: lj_str.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_str.h lj_state.h lj_ctype.h
+lj_tab.o: lj_tab.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
+  lj_err.h lj_errmsg.h lj_tab.h
+lj_trace.o: lj_trace.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_frame.h lj_bc.h lj_state.h \
+  lj_ir.h lj_jit.h lj_iropt.h lj_mcode.h lj_trace.h lj_dispatch.h \
+  lj_traceerr.h lj_snap.h lj_gdbjit.h lj_record.h lj_asm.h lj_vm.h \
+  lj_vmevent.h lj_target.h lj_target_x86.h
+lj_udata.o: lj_udata.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_gc.h lj_udata.h
+lj_vmevent.o: lj_vmevent.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
+  lj_str.h lj_tab.h lj_state.h lj_dispatch.h lj_bc.h lj_jit.h lj_ir.h \
+  lj_vm.h lj_vmevent.h
+ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \
+  lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h \
+  lj_udata.h lj_meta.h lj_state.h lj_frame.h lj_bc.h lj_trace.h lj_jit.h \
+  lj_ir.h lj_dispatch.h lj_traceerr.h lj_vm.h lj_err.c lj_ctype.c \
+  lj_ctype.h lj_bc.c lj_obj.c lj_str.c lj_tab.c lj_func.c lj_udata.c \
+  lj_meta.c lj_state.c lj_lex.h lj_alloc.h lj_dispatch.c luajit.h \
+  lj_vmevent.c lj_vmevent.h lj_api.c lj_parse.h lj_lex.c lj_parse.c \
+  lj_lib.c lj_lib.h lj_ir.c lj_iropt.h lj_opt_mem.c lj_opt_fold.c \
+  lj_folddef.h lj_opt_narrow.c lj_opt_dce.c lj_opt_loop.c lj_snap.h \
+  lj_mcode.c lj_mcode.h lj_snap.c lj_target.h lj_target_x86.h lj_record.c \
+  lj_ff.h lj_ffdef.h lj_record.h lj_asm.h lj_recdef.h lj_asm.c lj_trace.c \
+  lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c lualib.h \
+  lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \
+  lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_init.c
+luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h

+ 438 - 0
src/buildvm.c

@@ -0,0 +1,438 @@
+/*
+** LuaJIT VM builder.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** This is a tool to build the hand-tuned assembler code required for
+** LuaJIT's bytecode interpreter. It supports a variety of output formats
+** to feed different toolchains (see usage() below).
+**
+** This tool is not particularly optimized because it's only used while
+** _building_ LuaJIT. There's no point in distributing or installing it.
+** Only the object code generated by this tool is linked into LuaJIT.
+**
+** Caveat: some memory is not free'd, error handling is lazy.
+** It's a one-shot tool -- any effort fixing this would be wasted.
+*/
+
+#include "lua.h"
+#include "luajit.h"
+
+#ifdef LUA_USE_WIN
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_bc.h"
+#include "lj_ir.h"
+#include "lj_frame.h"
+#include "lj_dispatch.h"
+#include "lj_target.h"
+
+#include "buildvm.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* DynASM glue definitions. */
+#define Dst		ctx
+#define Dst_DECL	BuildCtx *ctx
+#define Dst_REF		(ctx->D)
+
+#include "../dynasm/dasm_proto.h"
+
+/* Glue macros for DynASM. */
+/* Grow buffer p (element type t) to at least 'need' bytes.
+** Doubles the size each time; aborts the whole build on OOM.
+*/
+#define DASM_M_GROW(ctx, t, p, sz, need) \
+  do { \
+    size_t _sz = (sz), _need = (need); \
+    if (_sz < _need) { \
+      if (_sz < 16) _sz = 16; \
+      while (_sz < _need) _sz += _sz; \
+      (p) = (t *)realloc((p), _sz); \
+      if ((p) == NULL) exit(1); \
+      (sz) = _sz; \
+    } \
+  } while(0)
+
+#define DASM_M_FREE(ctx, p, sz)	free(p)
+
+static int collect_reloc(BuildCtx *ctx, uint8_t *addr, int idx, int type);
+
+/* Route DynASM extern references to the relocation collector below. */
+#define DASM_EXTERN(ctx, addr, idx, type) \
+  collect_reloc(ctx, addr, idx, type)
+
+/* ------------------------------------------------------------------------ */
+
+/* Avoid trouble if cross-compiling for an x86 target. Speed doesn't matter. */
+#define DASM_ALIGNED_WRITES	1
+
+/* Embed architecture-specific DynASM encoder and backend. */
+#if LJ_TARGET_X86
+#include "../dynasm/dasm_x86.h"
+#include "buildvm_x86.h"
+#else
+#error "No support for this architecture (yet)"
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+/* Write sz bytes to the output file; abort the build on a short write. */
+void owrite(BuildCtx *ctx, const void *ptr, size_t sz)
+{
+  if (fwrite(ptr, 1, sz, ctx->fp) != sz) {
+    fprintf(stderr, "Error: cannot write to output file: %s\n",
+	    strerror(errno));
+    exit(1);
+  }
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Emit code as raw bytes. Only used for DynASM debugging. */
+static void emit_raw(BuildCtx *ctx)
+{
+  owrite(ctx, ctx->code, ctx->codesz);
+}
+
+/* -- Build machine code -------------------------------------------------- */
+
+/* Collect external relocations. Called from DASM_EXTERN during encoding.
+** Records offset, symbol index and relocation type in ctx->reloc[] for
+** the output emitters; always returns 0 so a zero offset is encoded.
+*/
+static int collect_reloc(BuildCtx *ctx, uint8_t *addr, int idx, int type)
+{
+  if (ctx->nreloc >= BUILD_MAX_RELOC) {
+    fprintf(stderr, "Error: too many relocations, increase BUILD_MAX_RELOC.\n");
+    exit(1);
+  }
+  ctx->reloc[ctx->nreloc].ofs = (int32_t)(addr - ctx->code);
+  ctx->reloc[ctx->nreloc].sym = idx;
+  ctx->reloc[ctx->nreloc].type = type;
+  ctx->nreloc++;
+  return 0;  /* Encode symbol offset of 0. */
+}
+
+/* Naive insertion sort. Performance doesn't matter here.
+** Inserts index i into perm[0..i], keeping perm ordered by ofs[].
+*/
+static void perm_insert(int *perm, int32_t *ofs, int i)
+{
+  perm[i] = i;
+  while (i > 0) {
+    int a = perm[i-1];
+    int b = perm[i];
+    if (ofs[a] <= ofs[b]) break;
+    perm[i] = a;
+    perm[i-1] = b;
+    i--;
+  }
+}
+
+/* Build the machine code.
+** Runs the DynASM pipeline (init/setup, arch backend, link, encode) and
+** then collects all symbol offsets: first the ctx->npc bytecode PC labels,
+** then the named globals, plus one sentinel entry for the code end.
+** ctx->perm[] holds the symbol indices sorted by ascending offset.
+** Returns 0 on success or a nonzero DynASM/label status code.
+*/
+static int build_code(BuildCtx *ctx)
+{
+  int status;
+  int i, j;
+
+  /* Initialize DynASM structures. */
+  ctx->nglob = GLOB__MAX;
+  ctx->glob = (void **)malloc(ctx->nglob*sizeof(void *));
+  memset(ctx->glob, 0, ctx->nglob*sizeof(void *));
+  ctx->nreloc = 0;
+
+  ctx->extnames = extnames;
+  ctx->globnames = globnames;
+
+  ctx->dasm_ident = DASM_IDENT;
+  ctx->dasm_arch = DASM_ARCH;
+
+  dasm_init(Dst, DASM_MAXSECTION);
+  dasm_setupglobal(Dst, ctx->glob, ctx->nglob);
+  dasm_setup(Dst, build_actionlist);
+
+  /* Call arch-specific backend to emit the code. */
+  ctx->npc = build_backend(ctx);
+
+  /* Finalize the code. */
+  (void)dasm_checkstep(Dst, DASM_SECTION_CODE);
+  if ((status = dasm_link(Dst, &ctx->codesz))) return status;
+  ctx->code = (uint8_t *)malloc(ctx->codesz);
+  if ((status = dasm_encode(Dst, (void *)ctx->code))) return status;
+
+  /* Allocate the symbol offset and permutation tables.
+  ** The +1 slot holds the end-of-code sentinel. Note: perm[] stores ints,
+  ** so sizeof(int) is the correct element size (was sizeof(int *)).
+  */
+  ctx->nsym = ctx->npc + ctx->nglob;
+  ctx->perm = (int *)malloc((ctx->nsym+1)*sizeof(int));
+  ctx->sym_ofs = (int32_t *)malloc((ctx->nsym+1)*sizeof(int32_t));
+
+  /* Collect the opcodes (PC labels). */
+  for (i = 0; i < ctx->npc; i++) {
+    int32_t n = dasm_getpclabel(Dst, i);
+    if (n < 0) return 0x22000000|i;  /* Missing PC label i. */
+    ctx->sym_ofs[i] = n;
+    perm_insert(ctx->perm, ctx->sym_ofs, i);
+  }
+
+  /* Collect the globals (named labels). */
+  for (j = 0; j < ctx->nglob; j++, i++) {
+    const char *gl = globnames[j];
+    int len = (int)strlen(gl);
+    if (!ctx->glob[j]) {
+      fprintf(stderr, "Error: undefined global %s\n", gl);
+      exit(2);
+    }
+    /* Names ending in "_Z" are internal: marked with ofs -1 and skipped
+    ** by the emitters.
+    */
+    if (len >= 2 && gl[len-2] == '_' && gl[len-1] == 'Z')
+      ctx->sym_ofs[i] = -1;  /* Skip the _Z symbols. */
+    else
+      ctx->sym_ofs[i] = (int32_t)((uint8_t *)(ctx->glob[j]) - ctx->code);
+    perm_insert(ctx->perm, ctx->sym_ofs, i);
+  }
+
+  /* Close the address range. */
+  ctx->sym_ofs[i] = (int32_t)ctx->codesz;
+  perm_insert(ctx->perm, ctx->sym_ofs, i);
+
+  dasm_free(Dst);
+
+  return 0;
+}
+
+/* -- Generate VM enums --------------------------------------------------- */
+
+/* Bytecode opcode names, expanded from BCDEF; NULL-terminated. */
+const char *const bc_names[] = {
+#define BCNAME(name, ma, mb, mc, mt)       #name,
+BCDEF(BCNAME)
+#undef BCNAME
+  NULL
+};
+
+/* IR opcode names, expanded from IRDEF; NULL-terminated. */
+const char *const ir_names[] = {
+#define IRNAME(name, m, m1, m2)	#name,
+IRDEF(IRNAME)
+#undef IRNAME
+  NULL
+};
+
+/* IR floating-point math operation names; NULL-terminated. */
+const char *const irfpm_names[] = {
+#define FPMNAME(name)		#name,
+IRFPMDEF(FPMNAME)
+#undef FPMNAME
+  NULL
+};
+
+/* IR field load names; NULL-terminated. */
+const char *const irfield_names[] = {
+#define FLNAME(name, type, field)	#name,
+IRFLDEF(FLNAME)
+#undef FLNAME
+  NULL
+};
+
+/* Trace error messages from lj_traceerr.h; NULL-terminated. */
+static const char *const trace_errors[] = {
+#define TREDEF(name, msg)	msg,
+#include "lj_traceerr.h"
+  NULL
+};
+
+/* ASCII-lowercase s into buf and return buf.
+** No bounds check: callers must ensure buf is large enough for s.
+*/
+static const char *lower(char *buf, const char *s)
+{
+  char *p = buf;
+  while (*s) {
+    *p++ = (*s >= 'A' && *s <= 'Z') ? *s+0x20 : *s;
+    s++;
+  }
+  *p = '\0';
+  return buf;
+}
+
+/* Emit VM definitions as Lua code for debug modules.
+** Writes bcnames/irnames as fixed-width (6 chars per entry) strings and
+** irfpm/irfield/traceerr as 0-based Lua arrays.
+*/
+static void emit_vmdef(BuildCtx *ctx)
+{
+  char buf[80];
+  int i;
+  fprintf(ctx->fp, "-- This is a generated file. DO NOT EDIT!\n\n");
+  fprintf(ctx->fp, "module(...)\n\n");
+
+  fprintf(ctx->fp, "bcnames = \"");
+  for (i = 0; bc_names[i]; i++) fprintf(ctx->fp, "%-6s", bc_names[i]);
+  fprintf(ctx->fp, "\"\n\n");
+
+  fprintf(ctx->fp, "irnames = \"");
+  for (i = 0; ir_names[i]; i++) fprintf(ctx->fp, "%-6s", ir_names[i]);
+  fprintf(ctx->fp, "\"\n\n");
+
+  fprintf(ctx->fp, "irfpm = { [0]=");
+  for (i = 0; irfpm_names[i]; i++)
+    fprintf(ctx->fp, "\"%s\", ", lower(buf, irfpm_names[i]));
+  fprintf(ctx->fp, "}\n\n");
+
+  fprintf(ctx->fp, "irfield = { [0]=");
+  for (i = 0; irfield_names[i]; i++) {
+    char *p;
+    lower(buf, irfield_names[i]);
+    /* Turn the first '_' into '.', e.g. TAB_ASIZE -> "tab.asize". */
+    p = strchr(buf, '_');
+    if (p) *p = '.';
+    fprintf(ctx->fp, "\"%s\", ", buf);
+  }
+  fprintf(ctx->fp, "}\n\n");
+
+  fprintf(ctx->fp, "traceerr = {\n[0]=");
+  for (i = 0; trace_errors[i]; i++)
+    fprintf(ctx->fp, "\"%s\",\n", trace_errors[i]);
+  fprintf(ctx->fp, "}\n\n");
+}
+
+/* -- Argument parsing ---------------------------------------------------- */
+
+/* Build mode names, expanded from BUILDDEF; NULL-terminated and parallel
+** to the BuildMode enum.
+*/
+static const char *const modenames[] = {
+#define BUILDNAME(name)		#name,
+BUILDDEF(BUILDNAME)
+#undef BUILDNAME
+  NULL
+};
+
+/* Print usage information and exit. */
+static void usage(void)
+{
+  int i;
+  fprintf(stderr, LUAJIT_VERSION " VM builder.\n");
+  fprintf(stderr, LUAJIT_COPYRIGHT ", " LUAJIT_URL "\n");
+  fprintf(stderr, "Target architecture: " LJ_ARCH_NAME "\n\n");
+  fprintf(stderr, "Usage: buildvm -m mode [-o outfile] [infiles...]\n\n");
+  fprintf(stderr, "Available modes:\n");
+  for (i = 0; i < BUILD__MAX; i++)
+    fprintf(stderr, "  %s\n", modenames[i]);
+  exit(1);
+}
+
+/* Parse the output mode name. Exits via usage() on an unknown mode. */
+static BuildMode parsemode(const char *mode)
+{
+  int i;
+  for (i = 0; modenames[i]; i++)
+    if (!strcmp(mode, modenames[i]))
+      return (BuildMode)i;
+  usage();
+  return (BuildMode)-1;  /* Not reached: usage() exits. */
+}
+
+/* Parse arguments.
+** Recognizes -m mode, -o outfile, "-" (stdin/stdout) and "--" (end of
+** options). Remaining arguments are stored in ctx->args. A missing -m
+** jumps back to the err label inside the switch, i.e. usage().
+*/
+static void parseargs(BuildCtx *ctx, char **argv)
+{
+  const char *a;
+  int i;
+  ctx->mode = (BuildMode)-1;
+  ctx->outname = "-";
+  for (i = 1; (a = argv[i]) != NULL; i++) {
+    if (a[0] != '-')
+      break;
+    switch (a[1]) {
+    case '-':
+      if (a[2]) goto err;
+      i++;
+      goto ok;
+    case '\0':
+      goto ok;
+    case 'm':
+      i++;
+      if (a[2] || argv[i] == NULL) goto err;
+      ctx->mode = parsemode(argv[i]);
+      break;
+    case 'o':
+      i++;
+      if (a[2] || argv[i] == NULL) goto err;
+      ctx->outname = argv[i];
+      break;
+    default: err:
+      usage();
+      break;
+    }
+  }
+ok:
+  ctx->args = argv+i;
+  if (ctx->mode == (BuildMode)-1) goto err;
+}
+
+/* Tool entry point: build the VM code, then dispatch to the emitter
+** selected by -m. Output goes to stdout when outname is "-".
+*/
+int main(int argc, char **argv)
+{
+  BuildCtx ctx_;
+  BuildCtx *ctx = &ctx_;
+  int status, binmode;
+
+  UNUSED(argc);
+  parseargs(ctx, argv);
+
+  if ((status = build_code(ctx))) {
+    fprintf(stderr,"Error: DASM error %08x\n", status);
+    return 1;
+  }
+
+  /* peobj and raw write binary data; everything else is text. */
+  switch (ctx->mode) {
+#if LJ_TARGET_X86ORX64
+  case BUILD_peobj:
+#endif
+  case BUILD_raw:
+    binmode = 1;
+    break;
+  default:
+    binmode = 0;
+    break;
+  }
+
+  if (ctx->outname[0] == '-' && ctx->outname[1] == '\0') {
+    ctx->fp = stdout;
+#ifdef LUA_USE_WIN
+    if (binmode)
+      _setmode(_fileno(stdout), _O_BINARY);  /* Yuck. */
+#endif
+  } else if (!(ctx->fp = fopen(ctx->outname, binmode ? "wb" : "w"))) {
+    fprintf(stderr, "Error: cannot open output file '%s': %s\n",
+	    ctx->outname, strerror(errno));
+    exit(1);
+  }
+
+  switch (ctx->mode) {
+  /* BUILD_asm auto-selects the assembler dialect from the host ABI. */
+  case BUILD_asm:
+#if defined(__ELF__)
+    ctx->mode = BUILD_elfasm;
+#elif defined(__MACH__)
+    ctx->mode = BUILD_machasm;
+#else
+    fprintf(stderr,"Error: auto-guessing the system assembler failed\n");
+    return 1;
+#endif
+    /* fallthrough */
+  case BUILD_elfasm:
+  case BUILD_coffasm:
+  case BUILD_machasm:
+    emit_asm(ctx);
+    emit_asm_debug(ctx);
+    break;
+#if LJ_TARGET_X86ORX64
+  case BUILD_peobj:
+    emit_peobj(ctx);
+    break;
+#endif
+  case BUILD_raw:
+    emit_raw(ctx);
+    break;
+  case BUILD_vmdef:
+    emit_vmdef(ctx);
+    /* fallthrough */
+  case BUILD_ffdef:
+  case BUILD_libdef:
+  case BUILD_recdef:
+    emit_lib(ctx);
+    break;
+  case BUILD_folddef:
+    emit_fold(ctx);
+    break;
+  default:
+    break;
+  }
+
+  fflush(ctx->fp);
+  if (ferror(ctx->fp)) {
+    fprintf(stderr, "Error: cannot write to output file: %s\n",
+	    strerror(errno));
+    exit(1);
+  }
+  fclose(ctx->fp);
+
+  return 0;
+}
+

+ 106 - 0
src/buildvm.h

@@ -0,0 +1,106 @@
+/*
+** LuaJIT VM builder.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+/* NOTE(review): identifiers starting with '_' + uppercase are reserved
+** by the C standard; a non-underscored guard name would be cleaner.
+*/
+#ifndef _BUILDVM_H
+#define _BUILDVM_H
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "lj_def.h"
+#include "lj_arch.h"
+
+/* Hardcoded limits. Increase as needed. */
+#define BUILD_MAX_RELOC		100	/* Max. number of relocations. */
+#define BUILD_MAX_FOLD		4096	/* Max. number of fold rules. */
+
+/* Prefix for scanned library definitions. */
+#define LIBDEF_PREFIX		"LJLIB_"
+
+/* Prefix for scanned fold definitions. */
+#define FOLDDEF_PREFIX		"LJFOLD"
+
+/* Prefixes for generated labels. */
+#define LABEL_PREFIX		"lj_"
+#define LABEL_PREFIX_BC		LABEL_PREFIX "BC_"
+#define LABEL_PREFIX_FF		LABEL_PREFIX "ff_"
+#define LABEL_PREFIX_CF		LABEL_PREFIX "cf_"
+#define LABEL_PREFIX_FFH	LABEL_PREFIX "ffh_"
+#define LABEL_PREFIX_LIBCF	LABEL_PREFIX "lib_cf_"
+#define LABEL_PREFIX_LIBINIT	LABEL_PREFIX "lib_init_"
+
+/* Extra labels. */
+#define LABEL_ASM_BEGIN		LABEL_PREFIX "vm_asm_begin"
+#define LABEL_OP_OFS		LABEL_PREFIX "vm_op_ofs"
+
+/* Forward declaration. */
+struct dasm_State;
+
+/* Build modes. BUILDDEF expands to one entry per supported -m mode;
+** peobj is only available on x86/x64 targets.
+*/
+#if LJ_TARGET_X86ORX64
+#define BUILDDEFX(_)	_(peobj)
+#else
+#define BUILDDEFX(_)
+#endif
+
+#define BUILDDEF(_) \
+  _(asm) _(elfasm) _(coffasm) _(machasm) BUILDDEFX(_) _(raw) \
+  _(ffdef) _(libdef) _(recdef) _(vmdef) \
+  _(folddef)
+
+typedef enum {
+#define BUILDENUM(name)		BUILD_##name,
+BUILDDEF(BUILDENUM)
+#undef BUILDENUM
+  BUILD__MAX
+} BuildMode;
+
+/* Code relocation. */
+typedef struct BuildReloc {
+  int32_t ofs;	/* Offset of the relocated operand in the code. */
+  int sym;	/* Index into extnames[]. */
+  int type;	/* Reloc type from DASM_EXTERN; nonzero = PC-relative. */
+} BuildReloc;
+
+/* Build context structure. */
+typedef struct BuildCtx {
+  /* DynASM state pointer. Should be first member. */
+  struct dasm_State *D;
+  /* Parsed command line. */
+  BuildMode mode;
+  FILE *fp;
+  const char *outname;
+  char **args;		/* Remaining non-option arguments (input files). */
+  /* Code and symbols generated by DynASM. */
+  uint8_t *code;
+  size_t codesz;
+  int npc, nglob, nsym, nreloc;
+  void **glob;
+  int *perm;		/* Symbol indices sorted by ascending offset. */
+  int32_t *sym_ofs;	/* Symbol offsets; -1 marks skipped _Z symbols. */
+  /* Strings generated by DynASM. */
+  const char *const *extnames;
+  const char *const *globnames;
+  const char *dasm_ident;
+  const char *dasm_arch;
+  /* Relocations. */
+  BuildReloc reloc[BUILD_MAX_RELOC];
+} BuildCtx;
+
+extern void owrite(BuildCtx *ctx, const void *ptr, size_t sz);
+extern void emit_asm(BuildCtx *ctx);
+extern void emit_peobj(BuildCtx *ctx);
+extern void emit_lib(BuildCtx *ctx);
+extern void emit_fold(BuildCtx *ctx);
+
+extern const char *const bc_names[];
+extern const char *const ir_names[];
+extern const char *const irfpm_names[];
+extern const char *const irfield_names[];
+
+#endif

+ 220 - 0
src/buildvm_asm.c

@@ -0,0 +1,220 @@
+/*
+** LuaJIT VM builder: Assembler source code emitter.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#include "buildvm.h"
+#include "lj_bc.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* Emit bytes piecewise as assembler text, 16 bytes per .byte line. */
+static void emit_asm_bytes(BuildCtx *ctx, uint8_t *p, int n)
+{
+  int i;
+  for (i = 0; i < n; i++) {
+    if ((i & 15) == 0)
+      fprintf(ctx->fp, "\t.byte %d", p[i]);
+    else
+      fprintf(ctx->fp, ",%d", p[i]);
+    if ((i & 15) == 15) putc('\n', ctx->fp);
+  }
+  if ((n & 15) != 0) putc('\n', ctx->fp);
+}
+
+/* Emit relocation as a 4-byte .long.
+** r->type nonzero means PC-relative ("sym-.-4"), otherwise absolute.
+** COFF and Mach-O symbols get the platform's leading underscore.
+*/
+static void emit_asm_reloc(BuildCtx *ctx, BuildReloc *r)
+{
+  const char *sym = ctx->extnames[r->sym];
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+    if (r->type)
+      fprintf(ctx->fp, "\t.long %s-.-4\n", sym);
+    else
+      fprintf(ctx->fp, "\t.long %s\n", sym);
+    break;
+  case BUILD_coffasm:
+    fprintf(ctx->fp, "\t.def _%s; .scl 3; .type 32; .endef\n", sym);
+    if (r->type)
+      fprintf(ctx->fp, "\t.long _%s-.-4\n", sym);
+    else
+      fprintf(ctx->fp, "\t.long _%s\n", sym);
+    break;
+  default:  /* BUILD_machasm for relative relocations handled below. */
+    fprintf(ctx->fp, "\t.long _%s\n", sym);
+    break;
+  }
+}
+
+/* Mnemonics for the x86 conditional jumps 0x0f 0x80..0x8f, in order. */
+static const char *const jccnames[] = {
+  "jo", "jno", "jb", "jnb", "jz", "jnz", "jbe", "ja",
+  "js", "jns", "jpe", "jpo", "jl", "jge", "jle", "jg"
+};
+
+/* Emit relocation for the incredibly stupid OSX assembler.
+** It cannot express "sym-.-4", so the branch opcode preceding the
+** relocated operand is decoded (call/jmp/jcc), the bytes before it are
+** emitted verbatim and the branch is re-emitted as a mnemonic + symbol.
+** Non-lj_ symbols are routed through generated wrapper labels.
+*/
+static void emit_asm_reloc_mach(BuildCtx *ctx, uint8_t *cp, int n,
+				const char *sym)
+{
+  const char *opname = NULL;
+  if (--n < 0) goto err;
+  if (cp[n] == 0xe8) {
+    opname = "call";
+  } else if (cp[n] == 0xe9) {
+    opname = "jmp";
+  } else if (cp[n] >= 0x80 && cp[n] <= 0x8f && n > 0 && cp[n-1] == 0x0f) {
+    opname = jccnames[cp[n]-0x80];
+    n--;
+  } else {
+err:
+    fprintf(stderr, "Error: unsupported opcode for %s symbol relocation.\n",
+	    sym);
+    exit(1);
+  }
+  emit_asm_bytes(ctx, cp, n);
+  if (!strncmp(sym, LABEL_PREFIX, sizeof(LABEL_PREFIX)-1))
+    fprintf(ctx->fp, "\t%s _%s\n", opname, sym);
+  else
+    fprintf(ctx->fp, "\t%s _" LABEL_PREFIX "wrapper_%s\n", opname, sym);
+}
+
+/* Emit an assembler label with the dialect-specific visibility/type
+** directives (size is only used by the ELF .size directive).
+*/
+static void emit_asm_label(BuildCtx *ctx, const char *name, int size, int isfunc)
+{
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+    fprintf(ctx->fp,
+      "\n\t.globl %s\n"
+      "\t.hidden %s\n"
+      "\t.type %s, @%s\n"
+      "\t.size %s, %d\n"
+      "%s:\n",
+      name, name, name, isfunc ? "function" : "object", name, size, name);
+    break;
+  case BUILD_coffasm:
+    fprintf(ctx->fp, "\n\t.globl _%s\n", name);
+    if (isfunc)
+      fprintf(ctx->fp, "\t.def _%s; .scl 3; .type 32; .endef\n", name);
+    fprintf(ctx->fp, "_%s:\n", name);
+    break;
+  case BUILD_machasm:
+    fprintf(ctx->fp,
+      "\n\t.private_extern _%s\n"
+      "_%s:\n", name, name);
+    break;
+  default:
+    break;
+  }
+}
+
+/* Emit alignment to 2^bits bytes.
+** Mach-O .align already takes a power of two, like .p2align elsewhere.
+*/
+static void emit_asm_align(BuildCtx *ctx, int bits)
+{
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+  case BUILD_coffasm:
+    fprintf(ctx->fp, "\t.p2align %d\n", bits);
+    break;
+  case BUILD_machasm:
+    fprintf(ctx->fp, "\t.align %d\n", bits);
+    break;
+  default:
+    break;
+  }
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Emit assembler source code.
+** Walks the symbols in ascending offset order (via ctx->perm), emitting
+** a label for each, then the code bytes up to the next symbol, with any
+** external relocations in that range spliced in. Finally emits the
+** 16-bit opcode offset table and toolchain-specific trailers.
+*/
+void emit_asm(BuildCtx *ctx)
+{
+  char name[80];
+  int32_t prev;
+  int i, pi, rel;
+
+  fprintf(ctx->fp, "\t.file \"buildvm_%s.dasc\"\n", ctx->dasm_arch);
+  fprintf(ctx->fp, "\t.text\n");
+  emit_asm_align(ctx, 4);
+
+  emit_asm_label(ctx, LABEL_ASM_BEGIN, 0, 1);
+  if (ctx->mode == BUILD_elfasm)
+    fprintf(ctx->fp, ".Lbegin:\n");
+
+  i = 0;
+  do {
+    pi = ctx->perm[i++];
+    prev = ctx->sym_ofs[pi];
+  } while (prev < 0);  /* Skip the _Z symbols. */
+
+  for (rel = 0; i <= ctx->nsym; i++) {
+    int ni = ctx->perm[i];
+    int32_t next = ctx->sym_ofs[ni];
+    int size = (int)(next - prev);
+    int32_t stop = next;
+    if (pi >= ctx->npc) {
+      /* Named global label. */
+      sprintf(name, LABEL_PREFIX "%s", ctx->globnames[pi-ctx->npc]);
+      emit_asm_label(ctx, name, size, 1);
+#if LJ_HASJIT
+    } else {
+#else
+    /* Without the JIT, omit labels for JIT-only bytecode handlers. */
+    } else if (!(pi == BC_JFORI || pi == BC_JFORL || pi == BC_JITERL ||
+		 pi == BC_JLOOP || pi == BC_IFORL || pi == BC_IITERL ||
+		 pi == BC_ILOOP)) {
+#endif
+      sprintf(name, LABEL_PREFIX_BC "%s", bc_names[pi]);
+      emit_asm_label(ctx, name, size, 1);
+    }
+    /* Emit bytes up to each relocation in this range, then the reloc.
+    ** prev += n+4 skips the 4-byte relocated operand itself.
+    */
+    while (rel < ctx->nreloc && ctx->reloc[rel].ofs < stop) {
+      int n = ctx->reloc[rel].ofs - prev;
+      if (ctx->mode == BUILD_machasm && ctx->reloc[rel].type != 0) {
+	emit_asm_reloc_mach(ctx, ctx->code+prev, n,
+			    ctx->extnames[ctx->reloc[rel].sym]);
+      } else {
+	emit_asm_bytes(ctx, ctx->code+prev, n);
+	emit_asm_reloc(ctx, &ctx->reloc[rel]);
+      }
+      prev += n+4;
+      rel++;
+    }
+    emit_asm_bytes(ctx, ctx->code+prev, stop-prev);
+    prev = next;
+    pi = ni;
+  }
+
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+    fprintf(ctx->fp, "\n\t.section .rodata\n");
+    break;
+  case BUILD_coffasm:
+    fprintf(ctx->fp, "\n\t.section .rdata,\"dr\"\n");
+    break;
+  case BUILD_machasm:
+    fprintf(ctx->fp, "\n\t.const\n");
+    break;
+  default:
+    break;
+  }
+  emit_asm_align(ctx, 5);
+
+  /* Table of 16-bit offsets, one per bytecode PC label. */
+  emit_asm_label(ctx, LABEL_OP_OFS, 2*ctx->npc, 0);
+  for (i = 0; i < ctx->npc; i++)
+    fprintf(ctx->fp, "\t.short %d\n", ctx->sym_ofs[i]);
+
+  fprintf(ctx->fp, "\n");
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+    fprintf(ctx->fp, "\t.section .note.GNU-stack,\"\",@progbits\n");
+    /* fallthrough */
+  case BUILD_coffasm:
+    fprintf(ctx->fp, "\t.ident \"%s\"\n", ctx->dasm_ident);
+    break;
+  case BUILD_machasm:
+    fprintf(ctx->fp,
+      "\t.cstring\n"
+      "\t.ascii \"%s\\0\"\n", ctx->dasm_ident);
+    break;
+  default:
+    break;
+  }
+  fprintf(ctx->fp, "\n");
+}
+

+ 206 - 0
src/buildvm_fold.c

@@ -0,0 +1,206 @@
+/*
+** LuaJIT VM builder: IR folding hash table generator.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#include "lj_obj.h"
+#include "lj_ir.h"
+
+#include "buildvm.h"
+
+/* Context for the folding hash table generator. */
+static int lineno;			/* Current input line, for error messages. */
+static int funcidx;			/* Fold function index (key high byte). */
+static uint32_t foldkeys[BUILD_MAX_FOLD];  /* Keys, sorted by low 24 bits. */
+static uint32_t nkeys;			/* Number of collected keys. */
+
+/* Try to fill the hash table with keys using the hash parameters.
+** Semi-perfect hashing: each key must land in its primary slot h or the
+** secondary slot h+1 (the table has sz+1 entries so h+1 is always valid).
+** r packs two 5-bit shift/rotate amounts (r>>5 and r&31); dorol selects
+** rotate-based hashing. Returns 1 on success, 0 on any collision.
+*/
+static int tryhash(uint32_t *htab, uint32_t sz, uint32_t r, int dorol)
+{
+  uint32_t i;
+  if (dorol && ((r & 31) == 0 || (r>>5) == 0))
+    return 0;  /* Avoid zero rotates. */
+  memset(htab, 0xff, (sz+1)*sizeof(uint32_t));
+  for (i = 0; i < nkeys; i++) {
+    uint32_t key = foldkeys[i];
+    uint32_t k = key & 0xffffff;
+    uint32_t h = (dorol ? lj_rol(lj_rol(k, r>>5) - k, r&31) :
+			  (((k << (r>>5)) - k) << (r&31))) % sz;
+    if (htab[h] != 0xffffffff) {  /* Collision on primary slot. */
+      if (htab[h+1] != 0xffffffff) {  /* Collision on secondary slot. */
+	/* Try to move the colliding key, if possible. */
+	if (h < sz-1 && htab[h+2] == 0xffffffff) {
+	  uint32_t k2 = htab[h+1] & 0xffffff;
+	  uint32_t h2 = (dorol ? lj_rol(lj_rol(k2, r>>5) - k2, r&31) :
+				 (((k2 << (r>>5)) - k2) << (r&31))) % sz;
+	  if (h2 != h+1) return 0;  /* Cannot resolve collision. */
+	  htab[h+2] = htab[h+1];  /* Move colliding key to secondary slot. */
+	} else {
+	  return 0;  /* Collision. */
+	}
+      }
+      htab[h+1] = key;
+    } else {
+      htab[h] = key;
+    }
+  }
+  return 1;  /* Success, all keys could be stored. */
+}
+
+/* Print the generated hash table as a C array of sz+1 entries. */
+static void printhash(BuildCtx *ctx, uint32_t *htab, uint32_t sz)
+{
+  uint32_t i;
+  fprintf(ctx->fp, "static const uint32_t fold_hash[%d] = {\n0x%08x",
+	  sz+1, htab[0]);
+  for (i = 1; i < sz+1; i++)
+    fprintf(ctx->fp, ",\n0x%08x", htab[i]);
+  fprintf(ctx->fp, "\n};\n\n");
+}
+
+/* Exhaustive search for the shortest semi-perfect hash table.
+** Tries every odd table size from nkeys upward, and for each size all
+** 32*32 combinations of the two hash parameters, first with shift-based
+** then with rotate-based hashing. Emits the table plus a matching
+** fold_hashkey() macro, or aborts if no parameters work.
+*/
+static void makehash(BuildCtx *ctx)
+{
+  uint32_t htab[BUILD_MAX_FOLD*2+1];
+  uint32_t sz, r;
+  /* Search for the smallest hash table with an odd size. */
+  for (sz = (nkeys|1); sz < BUILD_MAX_FOLD*2; sz += 2) {
+    /* First try all shift hash combinations. */
+    for (r = 0; r < 32*32; r++) {
+      if (tryhash(htab, sz, r, 0)) {
+	printhash(ctx, htab, sz);
+	fprintf(ctx->fp,
+		"#define fold_hashkey(k)\t(((((k)<<%u)-(k))<<%u)%%%u)\n\n",
+		r>>5, r&31, sz);
+	return;
+      }
+    }
+    /* Then try all rotate hash combinations. */
+    for (r = 0; r < 32*32; r++) {
+      if (tryhash(htab, sz, r, 1)) {
+	printhash(ctx, htab, sz);
+	fprintf(ctx->fp,
+	  "#define fold_hashkey(k)\t(lj_rol(lj_rol((k),%u)-(k),%u)%%%u)\n\n",
+		r>>5, r&31, sz);
+	return;
+      }
+    }
+  }
+  fprintf(stderr, "Error: search for perfect hash failed\n");
+  exit(1);
+}
+
+/* Parse one token of a fold rule and return its numeric code.
+** Advances *pp past the token (space-separated). Recognizes IRFPM_*/
+/* and IRFL_* literals (if allowlit), "any" -> 0xff (if allowany), and
+** plain IR opcode names; aborts on an unknown token. Returns 0 when
+** there is no token left (*pp == NULL).
+*/
+static uint32_t nexttoken(char **pp, int allowlit, int allowany)
+{
+  char *p = *pp;
+  if (p) {
+    uint32_t i;
+    char *q = strchr(p, ' ');
+    if (q) *q++ = '\0';
+    *pp = q;
+    if (allowlit && !strncmp(p, "IRFPM_", 6)) {
+      for (i = 0; irfpm_names[i]; i++)
+	if (!strcmp(irfpm_names[i], p+6))
+	  return i;
+    } else if (allowlit && !strncmp(p, "IRFL_", 5)) {
+      for (i = 0; irfield_names[i]; i++)
+	if (!strcmp(irfield_names[i], p+5))
+	  return i;
+    } else if (allowany && !strcmp("any", p)) {
+      return 0xff;
+    } else {
+      for (i = 0; ir_names[i]; i++)
+	if (!strcmp(ir_names[i], p))
+	  return i;
+    }
+    fprintf(stderr, "Error: bad fold definition token \"%s\" at line %d\n", p, lineno);
+    exit(1);
+  }
+  return 0;
+}
+
+/* Parse a fold rule and insert its key into foldkeys[].
+** Key layout: funcidx<<24 | op<<16 | left<<8 | right. The keys are kept
+** sorted by their low 24 bits; a duplicate (op,left,right) triple is a
+** fatal error.
+*/
+static void foldrule(char *p)
+{
+  uint32_t op = nexttoken(&p, 0, 0);
+  uint32_t left = nexttoken(&p, 0, 1);
+  uint32_t right = nexttoken(&p, 1, 1);
+  uint32_t key = (funcidx << 24) | (op << 16) | (left << 8) | right;
+  uint32_t i;
+  if (nkeys >= BUILD_MAX_FOLD) {
+    fprintf(stderr, "Error: too many fold rules, increase BUILD_MAX_FOLD.\n");
+    exit(1);
+  }
+  /* Simple insertion sort to detect duplicates. */
+  for (i = nkeys; i > 0; i--) {
+    if ((foldkeys[i-1]&0xffffff) < (key & 0xffffff))
+      break;
+    if ((foldkeys[i-1]&0xffffff) == (key & 0xffffff)) {
+      fprintf(stderr, "Error: duplicate fold definition at line %d\n", lineno);
+      exit(1);
+    }
+    foldkeys[i] = foldkeys[i-1];
+  }
+  foldkeys[i] = key;
+  nkeys++;
+}
+
+/* Emit C source code for IR folding hash table.
+** Scans the input file (ctx->args[0], or stdin for "-") for LJFOLD(...)
+** rule tags and LJFOLDF(...)/LJFOLDX(...) function tags at line start,
+** emits the fold_func[] table and then the generated hash table.
+*/
+void emit_fold(BuildCtx *ctx)
+{
+  char buf[256];  /* We don't care about analyzing lines longer than that. */
+  const char *fname = ctx->args[0];
+  FILE *fp;
+
+  if (fname == NULL) {
+    fprintf(stderr, "Error: missing input filename\n");
+    exit(1);
+  }
+
+  if (fname[0] == '-' && fname[1] == '\0') {
+    fp = stdin;
+  } else {
+    fp = fopen(fname, "r");
+    if (!fp) {
+      fprintf(stderr, "Error: cannot open input file '%s': %s\n",
+	      fname, strerror(errno));
+      exit(1);
+    }
+  }
+
+  fprintf(ctx->fp, "/* This is a generated file. DO NOT EDIT! */\n\n");
+  fprintf(ctx->fp, "static const FoldFunc fold_func[] = {\n");
+
+  lineno = 0;
+  funcidx = 0;
+  nkeys = 0;
+  while (fgets(buf, sizeof(buf), fp) != NULL) {
+    lineno++;
+    /* The prefix must be at the start of a line, otherwise it's ignored. */
+    if (!strncmp(buf, FOLDDEF_PREFIX, sizeof(FOLDDEF_PREFIX)-1)) {
+      char *p = buf+sizeof(FOLDDEF_PREFIX)-1;
+      char *q = strchr(p, ')');
+      if (p[0] == '(' && q) {
+	p++;
+	*q = '\0';
+	foldrule(p);
+      } else if ((p[0] == 'F' || p[0] == 'X') && p[1] == '(' && q) {
+	p += 2;
+	*q = '\0';
+	fprintf(ctx->fp, funcidx ? ",\n  %s" : "  %s", p);
+	funcidx++;
+      } else {
+	/* Chop the trailing newline for the error message.
+	** NOTE(review): assumes the line is newline-terminated.
+	*/
+	buf[strlen(buf)-1] = '\0';
+	fprintf(stderr, "Error: unknown fold definition tag %s%s at line %d\n",
+		FOLDDEF_PREFIX, p, lineno);
+	exit(1);
+      }
+    }
+  }
+  /* NOTE(review): this also closes stdin when fname was "-". */
+  fclose(fp);
+  fprintf(ctx->fp, "\n};\n\n");
+
+  makehash(ctx);
+}
+

+ 365 - 0
src/buildvm_lib.c

@@ -0,0 +1,365 @@
+/*
+** LuaJIT VM builder: library definition compiler.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#include "lj_obj.h"
+#include "lj_lib.h"
+
+#include "buildvm.h"
+
/* Context for library definitions. */
static uint8_t obuf[8192];	/* Output buffer: libinit stream (libdef mode), reused as name pool (recdef mode). */
static uint8_t *optr;		/* Current write position in obuf. */
static char modname[80];	/* Name of the current module. */
static size_t modnamelen;	/* Length of modname. */
static char funcname[80];	/* Name of the most recent function definition (for REC(.)). */
static int modstate, regfunc;	/* Module emit state / registration mode for the next function. */
static int ffid, recffid;	/* Fast function id counter / last id handled by a recorder. */

/* Registration modes (regfunc) for the next function definition. */
enum {
  REGFUNC_OK,		/* Register the function normally. */
  REGFUNC_NOREG,	/* Don't register: only reserve its fast function id. */
  REGFUNC_NOREGUV	/* Emit it, but without a named hash table entry. */
};
+
/* Append a length-prefixed name to the output buffer.
** For non-string kinds the "modname_" prefix is stripped first.
** The kind bits are OR'ed into the length byte. Always leaves at least
** 2 spare bytes in obuf for the caller to append trailing data.
*/
static void libdef_name(char *p, int kind)
{
  size_t n = strlen(p);
  if (kind != LIBINIT_STRING) {
    /* Strip the module name prefix, e.g. "mod_func" -> "func". */
    if (n > modnamelen && p[modnamelen] == '_' &&
	!strncmp(p, modname, modnamelen)) {
      p += modnamelen+1;
      n -= modnamelen+1;
    }
  }
  if (n > LIBINIT_MAXSTR) {
    fprintf(stderr, "Error: string too long: '%s'\n",  p);
    exit(1);
  }
  if (optr+1+n+2 > obuf+sizeof(obuf)) {  /* +2 for caller. */
    fprintf(stderr, "Error: output buffer overflow\n");
    exit(1);
  }
  *optr++ = (uint8_t)(n | kind);  /* Length and kind packed into one byte. */
  memcpy(optr, p, n);
  optr += n;
}
+
/* Finish the definition of the current module (no-op if none is open).
** Closes the lua_CFunction table and dumps the accumulated init bytes
** from obuf as a C byte array.
*/
static void libdef_endmodule(BuildCtx *ctx)
{
  if (modstate != 0) {
    char line[80];
    const uint8_t *p;
    int n;
    if (modstate == 1)  /* Module without any emitted functions: dummy entry. */
      fprintf(ctx->fp, "  (lua_CFunction)0");
    fprintf(ctx->fp, "\n};\n");
    fprintf(ctx->fp, "static const uint8_t %s%s[] = {\n",
	    LABEL_PREFIX_LIBINIT, modname);
    line[0] = '\0';
    for (n = 0, p = obuf; p < optr; p++) {
      n += sprintf(line+n, "%d,", *p);
      if (n >= 75) {  /* Flush before line[] (80 chars) can overflow. */
	fprintf(ctx->fp, "%s\n", line);
	n = 0;
	line[0] = '\0';
      }
    }
    fprintf(ctx->fp, "%s%d\n};\n#endif\n\n", line, LIBINIT_END);
  }
}
+
/* Start the definition of a new module (MODULE_ tag).
** Implicitly closes any previous module, resets the output buffer and
** emits the opening of the module's lua_CFunction table.
*/
static void libdef_module(BuildCtx *ctx, char *p, int arg)
{
  UNUSED(arg);
  if (ctx->mode == BUILD_libdef) {
    libdef_endmodule(ctx);
    optr = obuf;
    *optr++ = (uint8_t)ffid;	/* First fast function id of this module. */
    *optr++ = 0;		/* Hash table size, bumped per registration. */
    modstate = 1;
    fprintf(ctx->fp, "#ifdef %sMODULE_%s\n", LIBDEF_PREFIX, p);
    fprintf(ctx->fp, "#undef %sMODULE_%s\n", LIBDEF_PREFIX, p);
    fprintf(ctx->fp, "static const lua_CFunction %s%s[] = {\n",
	    LABEL_PREFIX_LIBCF, p);
  }
  modnamelen = strlen(p);
  if (modnamelen > sizeof(modname)-1) {
    fprintf(stderr, "Error: module name too long: '%s'\n", p);
    exit(1);
  }
  strcpy(modname, p);
}
+
/* Find the machine code offset of a fast function by its "ff_<name>"
** global symbol. Exits with a fatal error if the symbol is undefined.
*/
static int find_ffofs(BuildCtx *ctx, const char *name)
{
  int i;
  for (i = 0; i < ctx->nglob; i++) {
    const char *gl = ctx->globnames[i];
    if (gl[0] == 'f' && gl[1] == 'f' && gl[2] == '_' && !strcmp(gl+3, name)) {
      return (int)((uint8_t *)ctx->glob[i] - ctx->code);
    }
  }
  fprintf(stderr, "Error: undefined fast function %s%s\n",
	  LABEL_PREFIX_FF, name);
  exit(1);  /* Does not return. */
}
+
/* Handle a function definition tag (CF/ASM/ASM_), per build mode.
** arg is LIBINIT_CF, LIBINIT_ASM or LIBINIT_ASM_ from the handler table.
*/
static void libdef_func(BuildCtx *ctx, char *p, int arg)
{
  if (ctx->mode == BUILD_libdef) {
    /* ASM variants need the machine code offset of the fast function. */
    int ofs = arg != LIBINIT_CF ? find_ffofs(ctx, p) : 0;
    if (modstate == 0) {
      fprintf(stderr, "Error: no module for function definition %s\n", p);
      exit(1);
    }
    if (regfunc == REGFUNC_NOREG) {
      /* Unregistered function: only reserve the fast function id. */
      if (optr+1 > obuf+sizeof(obuf)) {
	fprintf(stderr, "Error: output buffer overflow\n");
	exit(1);
      }
      *optr++ = LIBINIT_FFID;
    } else {
      if (arg != LIBINIT_ASM_) {
	if (modstate != 1) fprintf(ctx->fp, ",\n");  /* Table separator. */
	modstate = 2;
	fprintf(ctx->fp, "  %s%s", arg ? LABEL_PREFIX_FFH : LABEL_PREFIX_CF, p);
      }
      if (regfunc != REGFUNC_NOREGUV) obuf[1]++;  /* Bump hash table size. */
      libdef_name(regfunc == REGFUNC_NOREGUV ? "" : p, arg);
      if (arg) {
	/* Append 16 bit code offset, low byte first. */
	*optr++ = (uint8_t)ofs;
	*optr++ = (uint8_t)(ofs >> 8);
      }
    }
  } else if (ctx->mode == BUILD_ffdef) {
    fprintf(ctx->fp, "FFDEF(%s)\n", p);
  } else if (ctx->mode == BUILD_recdef) {
    /* Remember the name for a possible following REC(.) tag. */
    if (strlen(p) > sizeof(funcname)-1) {
      fprintf(stderr, "Error: function name too long: '%s'\n", p);
      exit(1);
    }
    strcpy(funcname, p);
  } else if (ctx->mode == BUILD_vmdef) {
    /* Turn "mod_func" into "mod.func" for the VM definition name list. */
    int i;
    for (i = 1; p[i] && modname[i-1]; i++)
      if (p[i] == '_') p[i] = '.';
    fprintf(ctx->fp, "\"%s\",\n", p);
  }
  ffid++;
  regfunc = REGFUNC_OK;  /* Registration mode applies to one function only. */
}
+
/* Find (or intern) the index of a recorder name in the obuf name pool.
** Indices 0 and 1 are reserved (recff_nyi, recff_c), so counting starts at 2.
*/
static uint32_t find_rec(char *name)
{
  char *p = (char *)obuf;
  uint32_t n;
  for (n = 2; *p; n++) {
    if (strcmp(p, name) == 0)
      return n;
    p += strlen(p)+1;  /* Skip to next NUL-terminated name. */
  }
  if (p+strlen(name)+1 >= (char *)obuf+sizeof(obuf)) {
    fprintf(stderr, "Error: output buffer overflow\n");
    exit(1);
  }
  strcpy(p, name);  /* Append new name; trailing NUL keeps pool terminated. */
  return n;
}
+
/* Handle a REC(...) tag: map the current fast function id to a recorder.
** "REC(.)" refers to the name of the preceding function definition.
** An optional argument after a space is emitted into the low byte.
*/
static void libdef_rec(BuildCtx *ctx, char *p, int arg)
{
  UNUSED(arg);
  if (ctx->mode == BUILD_recdef) {
    char *q;
    uint32_t n;
    /* Pad the id map with zeros for functions without a recorder. */
    for (; recffid+1 < ffid; recffid++)
      fprintf(ctx->fp, ",\n0");
    recffid = ffid;
    if (*p == '.') p = funcname;
    q = strchr(p, ' ');
    if (q) *q++ = '\0';
    n = find_rec(p);  /* High byte: recorder function index. */
    if (q)
      fprintf(ctx->fp, ",\n0x%02x00+(%s)", n, q);
    else
      fprintf(ctx->fp, ",\n0x%02x00", n);
  }
}
+
/* Copy n bytes from src to dst in target byte order: a plain copy when
** host and target endianness match, otherwise a byte-for-byte reversal.
** src and dst must not overlap.
*/
static void memcpy_endian(void *dst, void *src, size_t n)
{
  union { uint8_t b; uint32_t u; } host_endian;
  host_endian.u = 1;
  if (host_endian.b == LJ_ENDIAN_SELECT(1, 0)) {
    memcpy(dst, src, n);
  } else {
    size_t i;
    for (i = 0; i < n; i++)
      ((uint8_t *)dst)[i] = ((uint8_t *)src)[n-i-1];  /* Reverse bytes. */
  }
}
+
/* Handle a PUSH(...) tag: append one value to the init stream.
** Accepts a "string" literal, a number constant, the "lastcl" keyword
** or "top-N". Anything else is a fatal error.
*/
static void libdef_push(BuildCtx *ctx, char *p, int arg)
{
  UNUSED(arg);
  if (ctx->mode == BUILD_libdef) {
    int len = (int)strlen(p);
    if (*p == '"') {  /* Quoted string literal. */
      if (len > 1 && p[len-1] == '"') {
	p[len-1] = '\0';
	libdef_name(p+1, LIBINIT_STRING);
	return;
      }
    } else if (*p >= '0' && *p <= '9') {  /* Number constant. */
      char *ep;
      double d = strtod(p, &ep);
      if (*ep == '\0') {
	if (optr+1+sizeof(double) > obuf+sizeof(obuf)) {
	  fprintf(stderr, "Error: output buffer overflow\n");
	  exit(1);
	}
	*optr++ = LIBINIT_NUMBER;
	memcpy_endian(optr, &d, sizeof(double));  /* Target byte order. */
	optr += sizeof(double);
	return;
      }
    } else if (!strcmp(p, "lastcl")) {  /* "lastcl" keyword. */
      if (optr+1 > obuf+sizeof(obuf)) {
	fprintf(stderr, "Error: output buffer overflow\n");
	exit(1);
      }
      *optr++ = LIBINIT_LASTCL;
      return;
    } else if (len > 4 && !strncmp(p, "top-", 4)) {  /* "top-N" form. */
      if (optr+2 > obuf+sizeof(obuf)) {
	fprintf(stderr, "Error: output buffer overflow\n");
	exit(1);
      }
      *optr++ = LIBINIT_COPY;
      *optr++ = (uint8_t)atoi(p+4);
      return;
    }
    /* All valid forms return early above. */
    fprintf(stderr, "Error: bad value for %sPUSH(%s)\n", LIBDEF_PREFIX, p);
    exit(1);
  }
}
+
/* Handle a SET(...) tag: emit a name followed by LIBINIT_SET.
** "SET(!)" emits an empty name (set environment, see "Set env." below).
*/
static void libdef_set(BuildCtx *ctx, char *p, int arg)
{
  UNUSED(arg);
  if (ctx->mode == BUILD_libdef) {
    if (p[0] == '!' && p[1] == '\0') p[0] = '\0';  /* Set env. */
    libdef_name(p, LIBINIT_STRING);
    *optr++ = LIBINIT_SET;  /* Safe: libdef_name reserves 2 spare bytes. */
    obuf[1]++;  /* Bump hash table size. */
  }
}
+
/* Handle a NOREG/NOREGUV tag: set the registration mode (REGFUNC_*)
** for the immediately following function definition.
*/
static void libdef_regfunc(BuildCtx *ctx, char *p, int arg)
{
  UNUSED(ctx); UNUSED(p);
  regfunc = arg;
}
+
/* Handler function for one library definition tag. */
typedef void (*LibDefFunc)(BuildCtx *ctx, char *p, int arg);

/* One entry of the tag dispatch table. */
typedef struct LibDefHandler {
  const char *suffix;	/* Tag name following LIBDEF_PREFIX. */
  const char *stop;	/* Chars terminating the argument (NULL: no arg). */
  const LibDefFunc func;	/* Handler to invoke. */
  const int arg;	/* Extra argument passed to the handler. */
} LibDefHandler;

/* Dispatch table for all recognized library definition tags. */
static const LibDefHandler libdef_handlers[] = {
  { "MODULE_",	" \t\r\n",	libdef_module,		0 },
  { "CF(",	")",		libdef_func,		LIBINIT_CF },
  { "ASM(",	")",		libdef_func,		LIBINIT_ASM },
  { "ASM_(",	")",		libdef_func,		LIBINIT_ASM_ },
  { "REC(",	")",		libdef_rec,		0 },
  { "PUSH(",	")",		libdef_push,		0 },
  { "SET(",	")",		libdef_set,		0 },
  { "NOREGUV",	NULL,		libdef_regfunc,		REGFUNC_NOREGUV },
  { "NOREG",	NULL,		libdef_regfunc,		REGFUNC_NOREG },
  { NULL,	NULL,		(LibDefFunc)0,		0 }
};
+
/* Emit C source code for library function definitions.
** Scans every input file for LIBDEF_PREFIX tags and dispatches them via
** libdef_handlers[]. Output format depends on ctx->mode (libdef, ffdef,
** recdef or vmdef).
*/
void emit_lib(BuildCtx *ctx)
{
  const char *fname;

  /* Per-mode prologue. */
  if (ctx->mode == BUILD_ffdef || ctx->mode == BUILD_libdef ||
      ctx->mode == BUILD_recdef)
    fprintf(ctx->fp, "/* This is a generated file. DO NOT EDIT! */\n\n");
  else if (ctx->mode == BUILD_vmdef)
    fprintf(ctx->fp, "ffnames = {\n[0]=\"Lua\",\n\"C\",\n");
  if (ctx->mode == BUILD_recdef)
    fprintf(ctx->fp, "static const uint16_t recff_idmap[] = {\n0,\n0x0100");
  recffid = ffid = FF_C+1;

  /* Process each input file from the command line ('-' means stdin). */
  while ((fname = *ctx->args++)) {
    char buf[256];  /* We don't care about analyzing lines longer than that. */
    FILE *fp;
    if (fname[0] == '-' && fname[1] == '\0') {
      fp = stdin;
    } else {
      fp = fopen(fname, "r");
      if (!fp) {
	fprintf(stderr, "Error: cannot open input file '%s': %s\n",
		fname, strerror(errno));
	exit(1);
      }
    }
    modstate = 0;
    regfunc = REGFUNC_OK;
    while (fgets(buf, sizeof(buf), fp) != NULL) {
      char *p;
      /* A line may contain more than one definition tag. */
      for (p = buf; (p = strstr(p, LIBDEF_PREFIX)) != NULL; ) {
	const LibDefHandler *ldh;
	p += sizeof(LIBDEF_PREFIX)-1;
	for (ldh = libdef_handlers; ldh->suffix != NULL; ldh++) {
	  size_t n, len = strlen(ldh->suffix);
	  if (!strncmp(p, ldh->suffix, len)) {
	    p += len;
	    n = ldh->stop ? strcspn(p, ldh->stop) : 0;
	    if (!p[n]) break;  /* Argument not terminated on this line. */
	    p[n] = '\0';
	    ldh->func(ctx, p, ldh->arg);
	    p += n+1;
	    break;
	  }
	}
	if (ldh->suffix == NULL) {  /* No handler matched the tag. */
	  buf[strlen(buf)-1] = '\0';
	  fprintf(stderr, "Error: unknown library definition tag %s%s\n",
		  LIBDEF_PREFIX, p);
	  exit(1);
	}
      }
    }
    fclose(fp);
    if (ctx->mode == BUILD_libdef) {
      libdef_endmodule(ctx);
    }
  }

  /* Per-mode epilogue. */
  if (ctx->mode == BUILD_ffdef) {
    fprintf(ctx->fp, "\n#undef FFDEF\n\n");
  } else if (ctx->mode == BUILD_vmdef) {
    fprintf(ctx->fp, "}\n\n");
  } else if (ctx->mode == BUILD_recdef) {
    char *p = (char *)obuf;
    fprintf(ctx->fp, "\n};\n\n");
    /* Emit the recorder function table from the interned name pool. */
    fprintf(ctx->fp, "static const RecordFunc recff_func[] = {\n"
	    "recff_nyi,\n"
	    "recff_c");
    while (*p) {
      fprintf(ctx->fp, ",\nrecff_%s", p);
      p += strlen(p)+1;
    }
    fprintf(ctx->fp, "\n};\n\n");
  }
}
+

+ 303 - 0
src/buildvm_peobj.c

@@ -0,0 +1,303 @@
+/*
+** LuaJIT VM builder: PE object emitter.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Only used for building on Windows, since we cannot assume the presence
+** of a suitable assembler. The host and target byte order must match.
+*/
+
+#include "buildvm.h"
+#include "lj_bc.h"
+
+#if LJ_TARGET_X86ORX64
+
/* Context for PE object emitter. */
static char *strtab;		/* Long-name string table (NULL during pass 1). */
static size_t strtabofs;	/* Current offset into the string table. */

/* -- PE object definitions ----------------------------------------------- */

/* PE header. */
typedef struct PEheader {
  uint16_t arch;	/* Target machine id. */
  uint16_t nsects;	/* Number of sections. */
  uint32_t time;	/* Timestamp (optional). */
  uint32_t symtabofs;	/* File offset of the symbol table. */
  uint32_t nsyms;	/* Number of symbol table entries (incl. aux entries). */
  uint16_t opthdrsz;	/* Size of optional header (0 for object files). */
  uint16_t flags;
} PEheader;

/* PE section. */
typedef struct PEsection {
  char name[8];
  uint32_t vsize;
  uint32_t vaddr;
  uint32_t size;	/* Size of raw section data. */
  uint32_t ofs;		/* File offset of raw section data. */
  uint32_t relocofs;	/* File offset of relocation entries. */
  uint32_t lineofs;
  uint16_t nreloc;	/* Number of relocation entries. */
  uint16_t nline;
  uint32_t flags;	/* Section characteristics. */
} PEsection;

/* PE relocation. */
typedef struct PEreloc {
  uint32_t vaddr;	/* Offset of the relocated location. */
  uint32_t symidx;	/* Index into the symbol table. */
  uint16_t type;	/* Relocation type (PEOBJ_RELOC_*). */
} PEreloc;

/* Cannot use sizeof, because it pads up to the max. alignment. */
#define PEOBJ_RELOC_SIZE	(4+4+2)

/* PE symbol table entry. */
typedef struct PEsym {
  union {
    char name[8];	/* Short name, stored inline. */
    uint32_t nameref[2];	/* [0]=0 marks a string table ref in [1]. */
  } n;
  uint32_t value;
  int16_t sect;		/* 1-based section number (or PEOBJ_SECT_* + 1). */
  uint16_t type;
  uint8_t scl;		/* Storage class. */
  uint8_t naux;		/* Number of auxiliary entries that follow. */
} PEsym;

/* PE symbol table auxiliary entry for a section. */
typedef struct PEsymaux {
  uint32_t size;
  uint16_t nreloc;
  uint16_t nline;
  uint32_t cksum;
  uint16_t assoc;
  uint8_t comdatsel;
  uint8_t unused[3];
} PEsymaux;

/* Cannot use sizeof, because it pads up to the max. alignment. */
#define PEOBJ_SYM_SIZE	(8+4+2+2+1+1)

/* PE object CPU specific defines. */
#if LJ_TARGET_X86
#define PEOBJ_ARCH_TARGET	0x014c
#define PEOBJ_RELOC_REL32	0x14  /* MS: REL32, GNU: DISP32. */
#define PEOBJ_RELOC_DIR32	0x06
#define PEOBJ_SYM_PREFIX	"_"
#elif LJ_TARGET_X64
#define PEOBJ_ARCH_TARGET	0x8664
#define PEOBJ_RELOC_REL32	0x04  /* MS: REL32, GNU: DISP32. */
#define PEOBJ_RELOC_DIR32	0x02
#define PEOBJ_SYM_PREFIX	""
#endif

/* Section numbers (0-based). */
enum {
  PEOBJ_SECT_ABS = -2,
  PEOBJ_SECT_UNDEF = -1,
  PEOBJ_SECT_TEXT,
  /* TODO: add .pdata/.xdata for x64. */
  PEOBJ_SECT_RDATA,
  PEOBJ_SECT_RDATA_Z,
  PEOBJ_NSECTIONS
};

/* Symbol types. */
#define PEOBJ_TYPE_NULL		0
#define PEOBJ_TYPE_FUNC		0x20

/* Symbol storage class. */
#define PEOBJ_SCL_EXTERN	2
#define PEOBJ_SCL_STATIC	3
+
+/* -- PE object emitter --------------------------------------------------- */
+
/* Emit PE object symbol.
** Pass 1 (strtab == NULL) only accumulates the string table size.
** Pass 2 writes the entry and copies long names into the string table.
*/
static void emit_peobj_sym(BuildCtx *ctx, const char *name, uint32_t value,
			   int sect, int type, int scl)
{
  PEsym sym;
  size_t len = strlen(name);
  if (!strtab) {  /* Pass 1: only calculate string table length. */
    if (len > 8) strtabofs += len+1;
    return;
  }
  if (len <= 8) {  /* Short names are stored inline in the entry. */
    memcpy(sym.n.name, name, len);
    memset(sym.n.name+len, 0, 8-len);
  } else {  /* Long names go into the string table. */
    sym.n.nameref[0] = 0;  /* Zero marks a string table reference. */
    sym.n.nameref[1] = strtabofs;
    memcpy(strtab + strtabofs, name, len);
    strtab[strtabofs+len] = 0;
    strtabofs += len+1;
  }
  sym.value = value;
  sym.sect = (int16_t)(sect+1);  /* 1-based section number. */
  sym.type = (uint16_t)type;
  sym.scl = (uint8_t)scl;
  sym.naux = 0;
  owrite(ctx, &sym, PEOBJ_SYM_SIZE);
}
+
/* Emit PE object section symbol plus its auxiliary section-info entry. */
static void emit_peobj_sym_sect(BuildCtx *ctx, PEsection *pesect, int sect)
{
  PEsym sym;
  PEsymaux aux;
  if (!strtab) return;  /* Pass 1: no output (section names fit inline). */
  memcpy(sym.n.name, pesect[sect].name, 8);
  sym.value = 0;
  sym.sect = (int16_t)(sect+1);  /* 1-based section number. */
  sym.type = PEOBJ_TYPE_NULL;
  sym.scl = PEOBJ_SCL_STATIC;
  sym.naux = 1;  /* One auxiliary entry follows. */
  owrite(ctx, &sym, PEOBJ_SYM_SIZE);
  memset(&aux, 0, sizeof(PEsymaux));
  aux.size = pesect[sect].size;
  aux.nreloc = pesect[sect].nreloc;
  owrite(ctx, &aux, PEOBJ_SYM_SIZE);  /* Aux entries have the same size. */
}
+
/* Shorthands for emitting external .text (function) and .rdata symbols. */
#define emit_peobj_sym_func(ctx, name, ofs) \
  emit_peobj_sym(ctx, name, (uint32_t)(ofs), \
		 PEOBJ_SECT_TEXT, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN)
#define emit_peobj_sym_rdata(ctx, name, ofs) \
  emit_peobj_sym(ctx, name, (uint32_t)(ofs), \
		 PEOBJ_SECT_RDATA, PEOBJ_TYPE_NULL, PEOBJ_SCL_EXTERN)
+
+/* Emit Windows PE object file. */
+void emit_peobj(BuildCtx *ctx)
+{
+  PEheader pehdr;
+  PEsection pesect[PEOBJ_NSECTIONS];
+  int nzsym, relocsyms;
+  uint32_t sofs;
+  int i;
+  union { uint8_t b; uint32_t u; } host_endian;
+
+  host_endian.u = 1;
+  if (host_endian.b != LJ_ENDIAN_SELECT(1, 0)) {
+    fprintf(stderr, "Error: different byte order for host and target\n");
+    exit(1);
+  }
+
+  sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection);
+
+  /* Fill in PE sections. */
+  memset(&pesect, 0, PEOBJ_NSECTIONS*sizeof(PEsection));
+  memcpy(pesect[PEOBJ_SECT_TEXT].name, ".text", sizeof(".text")-1);
+  pesect[PEOBJ_SECT_TEXT].ofs = sofs;
+  sofs += (pesect[PEOBJ_SECT_TEXT].size = (uint32_t)ctx->codesz);
+  pesect[PEOBJ_SECT_TEXT].relocofs = sofs;
+  sofs += (pesect[PEOBJ_SECT_TEXT].nreloc = (uint16_t)ctx->nreloc) * PEOBJ_RELOC_SIZE;
+  /* Flags: 60 = read+execute, 50 = align16, 20 = code. */
+  pesect[PEOBJ_SECT_TEXT].flags = 0x60500020;
+
+  memcpy(pesect[PEOBJ_SECT_RDATA].name, ".rdata", sizeof(".rdata")-1);
+  pesect[PEOBJ_SECT_RDATA].ofs = sofs;
+  sofs += (pesect[PEOBJ_SECT_RDATA].size = ctx->npc*sizeof(uint16_t));
+  /* Flags: 40 = read, 30 = align4, 40 = initialized data. */
+  pesect[PEOBJ_SECT_RDATA].flags = 0x40300040;
+
+  memcpy(pesect[PEOBJ_SECT_RDATA_Z].name, ".rdata$Z", sizeof(".rdata$Z")-1);
+  pesect[PEOBJ_SECT_RDATA_Z].ofs = sofs;
+  sofs += (pesect[PEOBJ_SECT_RDATA_Z].size = (uint32_t)strlen(ctx->dasm_ident)+1);
+  /* Flags: 40 = read, 30 = align4, 40 = initialized data. */
+  pesect[PEOBJ_SECT_RDATA_Z].flags = 0x40300040;
+
+  /* Fill in PE header. */
+  pehdr.arch = PEOBJ_ARCH_TARGET;
+  pehdr.nsects = PEOBJ_NSECTIONS;
+  pehdr.time = 0;  /* Timestamp is optional. */
+  pehdr.symtabofs = sofs;
+  pehdr.opthdrsz = 0;
+  pehdr.flags = 0;
+
+  /* Compute the size of the symbol table:
+  ** @feat.00 + nsections*2
+  ** + asm_start + (nsyms-nzsym) + op_ofs
+  ** + relocsyms
+  */
+  /* Skip _Z syms. */
+  for (nzsym = 0; ctx->sym_ofs[ctx->perm[nzsym]] < 0; nzsym++) ;
+  for (relocsyms = 0; ctx->extnames[relocsyms]; relocsyms++) ;
+  pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+(ctx->nsym-nzsym)+1 + relocsyms;
+
+  /* Write PE object header and all sections. */
+  owrite(ctx, &pehdr, sizeof(PEheader));
+  owrite(ctx, &pesect, sizeof(PEsection)*PEOBJ_NSECTIONS);
+
+  /* Write .text section. */
+  owrite(ctx, ctx->code, ctx->codesz);
+  for (i = 0; i < ctx->nreloc; i++) {
+    PEreloc reloc;
+    reloc.vaddr = (uint32_t)ctx->reloc[i].ofs;
+    reloc.symidx = 1+2+ctx->reloc[i].sym;  /* Reloc syms are after .text sym. */
+    reloc.type = ctx->reloc[i].type ? PEOBJ_RELOC_REL32 : PEOBJ_RELOC_DIR32;
+    owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+  }
+
+  /* Write .rdata section. */
+  for (i = 0; i < ctx->npc; i++) {
+    uint16_t pcofs = (uint16_t)ctx->sym_ofs[i];
+    owrite(ctx, &pcofs, 2);
+  }
+
+  /* Write .rdata$Z section. */
+  owrite(ctx, ctx->dasm_ident, strlen(ctx->dasm_ident)+1);
+
+  /* Write symbol table. */
+  strtab = NULL;  /* 1st pass: collect string sizes. */
+  for (;;) {
+    char name[80];
+
+    strtabofs = 4;
+    /* Mark as SafeSEH compliant. */
+    emit_peobj_sym(ctx, "@feat.00", 1,
+		   PEOBJ_SECT_ABS, PEOBJ_TYPE_NULL, PEOBJ_SCL_STATIC);
+
+    emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_TEXT);
+    for (i = 0; ctx->extnames[i]; i++) {
+      sprintf(name, PEOBJ_SYM_PREFIX "%s", ctx->extnames[i]);
+      emit_peobj_sym(ctx, name, 0,
+		     PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN);
+    }
+    emit_peobj_sym_func(ctx, PEOBJ_SYM_PREFIX LABEL_ASM_BEGIN, 0);
+    for (i = nzsym; i < ctx->nsym; i++) {
+      int pi = ctx->perm[i];
+      if (pi >= ctx->npc) {
+	sprintf(name, PEOBJ_SYM_PREFIX LABEL_PREFIX "%s",
+		ctx->globnames[pi-ctx->npc]);
+	emit_peobj_sym_func(ctx, name, ctx->sym_ofs[pi]);
+#if LJ_HASJIT
+      } else {
+#else
+      } else if (!(pi == BC_JFORI || pi == BC_JFORL || pi == BC_JITERL ||
+		   pi == BC_JLOOP || pi == BC_IFORL || pi == BC_IITERL ||
+		   pi == BC_ILOOP)) {
+#endif
+	sprintf(name, PEOBJ_SYM_PREFIX LABEL_PREFIX_BC "%s",
+		bc_names[pi]);
+	emit_peobj_sym_func(ctx, name, ctx->sym_ofs[pi]);
+      }
+    }
+
+    emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_RDATA);
+    emit_peobj_sym_rdata(ctx, PEOBJ_SYM_PREFIX LABEL_OP_OFS, 0);
+
+    emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_RDATA_Z);
+
+    if (strtab)
+      break;
+    /* 2nd pass: alloc strtab, write syms and copy strings. */
+    strtab = (char *)malloc(strtabofs);
+    *(uint32_t *)strtab = strtabofs;
+  }
+
+  /* Write string table. */
+  owrite(ctx, strtab, strtabofs);
+}
+
+#endif

+ 3592 - 0
src/buildvm_x86.dasc

@@ -0,0 +1,3592 @@
+|// Low-level VM code for x86 CPUs.
+|// Bytecode interpreter, fast functions and helper functions.
+|// Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+|
+|.arch x86
+|.section code_op, code_sub
+|
+|.actionlist build_actionlist
+|.globals GLOB_
+|.globalnames globnames
+|.externnames extnames
+|
+|//-----------------------------------------------------------------------
+|
+|// Fixed register assignments for the interpreter.
+|// This is very fragile and has many dependencies. Caveat emptor.
+|.define BASE,		edx		// Not C callee-save, refetched anyway.
+|.define KBASE,		edi		// Must be C callee-save.
+|.define PC,		esi		// Must be C callee-save.
+|.define DISPATCH,	ebx		// Must be C callee-save.
+|
+|.define RA,		ecx
+|.define RAL,		cl
+|.define RB,		ebp		// Must be ebp (C callee-save).
+|.define RC,		eax		// Must be eax (fcomparepp and others).
+|.define RCW,		ax
+|.define RCH,		ah
+|.define RCL,		al
+|.define OP,		RB
+|.define RD,		RC
+|.define RDL,		RCL
+|
+|// Type definitions. Some of these are only used for documentation.
+|.type L,		lua_State
+|.type GL,		global_State
+|.type TVALUE,		TValue
+|.type GCOBJ,		GCobj
+|.type STR,		GCstr
+|.type TAB,		GCtab
+|.type LFUNC,		GCfuncL
+|.type CFUNC,		GCfuncC
+|.type PROTO,		GCproto
+|.type UPVAL,		GCupval
+|.type NODE,		Node
+|.type NARGS,		int
+|.type TRACE,		Trace
+|.type EXITINFO,	ExitInfo
+|
+|// Stack layout while in interpreter. Must match with lj_frame.h.
+|.macro saveregs
+|  push ebp; push edi; push esi; push ebx
+|.endmacro
+|.macro restoreregs
+|  pop ebx; pop esi; pop edi; pop ebp
+|.endmacro
+|.define CFRAME_SPACE,	aword*7			// Delta for esp (see <--).
+|
+|.define INARG_4,	aword [esp+aword*15]
+|.define INARG_3,	aword [esp+aword*14]
+|.define INARG_2,	aword [esp+aword*13]
+|.define INARG_1,	aword [esp+aword*12]
+|//----- 16 byte aligned, ^^^ arguments from C caller
+|.define SAVE_RET,	aword [esp+aword*11]	//<-- esp entering interpreter.
+|.define SAVE_R4,	aword [esp+aword*10]
+|.define SAVE_R3,	aword [esp+aword*9]
+|.define SAVE_R2,	aword [esp+aword*8]
+|//----- 16 byte aligned
+|.define SAVE_R1,	aword [esp+aword*7]	//<-- esp after register saves.
+|.define SAVE_PC,	aword [esp+aword*6]
+|.define ARG6,		aword [esp+aword*5]
+|.define ARG5,		aword [esp+aword*4]
+|//----- 16 byte aligned
+|.define ARG4,		aword [esp+aword*3]
+|.define ARG3,		aword [esp+aword*2]
+|.define ARG2,		aword [esp+aword*1]
+|.define ARG1,		aword [esp]		//<-- esp while in interpreter.
+|//----- 16 byte aligned, ^^^ arguments for C callee
+|
+|// FPARGx overlaps ARGx and ARG(x+1) on x86.
+|.define FPARG5,	qword [esp+qword*2]
+|.define FPARG3,	qword [esp+qword*1]
+|.define FPARG1,	qword [esp]
+|// NRESULTS overlaps ARG6 (and FPARG5)
+|.define NRESULTS,	ARG6
+|
+|// Arguments for vm_call and vm_pcall.
+|.define INARG_P_ERRF,	INARG_4			// vm_pcall only.
+|.define INARG_NRES,	INARG_3
+|.define INARG_BASE,	INARG_2
+|.define SAVE_L,	INARG_1
+|
+|.define SAVE_CFRAME,	INARG_BASE		// Overwrites INARG_BASE!
+|
+|// Arguments for vm_cpcall.
+|.define INARG_CP_UD,	INARG_4
+|.define INARG_CP_FUNC,	INARG_3
+|.define INARG_CP_CALL,	INARG_2
+|
+|//-----------------------------------------------------------------------
+|
+|// Instruction headers.
+|.macro ins_A; .endmacro
+|.macro ins_AD; .endmacro
+|.macro ins_AJ; .endmacro
+|.macro ins_ABC; movzx RB, RCH; movzx RC, RCL; .endmacro
+|.macro ins_AB_; movzx RB, RCH; .endmacro
+|.macro ins_A_C; movzx RC, RCL; .endmacro
+|.macro ins_AND; not RD; .endmacro
+|
+|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
+|.macro ins_NEXT
+|  mov RC, [PC]
+|  movzx RA, RCH
+|  movzx OP, RCL
+|  add PC, 4
+|  shr RC, 16
+|  jmp aword [DISPATCH+OP*4]
+|.endmacro
+|
+|// Instruction footer.
+|.if 1
+|  // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
+|  .define ins_next, ins_NEXT
+|  .define ins_next_, ins_NEXT
+|.else
+|  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
+|  // Affects only certain kinds of benchmarks (and only with -j off).
+|  // Around 10%-30% slower on Core2, a lot more slower on P4.
+|  .macro ins_next
+|    jmp ->ins_next
+|  .endmacro
+|  .macro ins_next_
+|  ->ins_next:
+|    ins_NEXT
+|  .endmacro
+|.endif
+|
+|//-----------------------------------------------------------------------
+|
+|// Macros to test operand types.
+|.macro checktp, reg, tp;  cmp dword [BASE+reg*8+4], tp; .endmacro
+|.macro checknum, reg, target; checktp reg, LJ_TISNUM; ja target; .endmacro
+|.macro checkstr, reg, target; checktp reg, LJ_TSTR; jne target; .endmacro
+|.macro checktab, reg, target; checktp reg, LJ_TTAB; jne target; .endmacro
+|
+|// These operands must be used with movzx.
+|.define PC_OP, byte [PC-4]
+|.define PC_RA, byte [PC-3]
+|.define PC_RB, byte [PC-1]
+|.define PC_RC, byte [PC-2]
+|.define PC_RD, word [PC-2]
+|
+|.macro branchPC, reg
+|  lea PC, [PC+reg*4-BCBIAS_J*4]
+|.endmacro
+|
+|// Assumes DISPATCH is relative to GL.
+#define DISPATCH_GL(field)	(GG_DISP2G + (int)offsetof(global_State, field))
+#define DISPATCH_J(field)	(GG_DISP2J + (int)offsetof(jit_State, field))
+|
+|// Decrement hashed hotcount and trigger trace recorder if zero.
+|.macro hotloop, reg
+|  mov reg, PC
+|  shr reg, 1
+|  and reg, HOTCOUNT_PCMASK
+|  sub word [DISPATCH+reg+GG_DISP2HOT], 1
+|  jz ->vm_hotloop
+|.endmacro
+|
+|.macro hotcall, reg
+|  mov reg, PC
+|  shr reg, 1
+|  and reg, HOTCOUNT_PCMASK
+|  sub word [DISPATCH+reg+GG_DISP2HOT], 1
+|  jz ->vm_hotcall
+|.endmacro
+|
+|// Set current VM state.
+|.macro set_vmstate, st
+|  mov dword [DISPATCH+DISPATCH_GL(vmstate)], ~LJ_VMST_..st
+|.endmacro
+|
+|// Annoying x87 stuff: support for two compare variants.
+|.macro fcomparepp			// Compare and pop st0 >< st1.
+||if (cmov) {
+|  fucomip st1
+|  fpop
+||} else {
+|  fucompp
+|  fnstsw ax				// eax modified!
+|  sahf
+||}
+|.endmacro
+|
+|.macro fdup; fld st0; .endmacro
+|.macro fpop1; fstp st1; .endmacro
+|
+|// Move table write barrier back. Overwrites reg.
+|.macro barrierback, tab, reg
+|  and byte tab->marked, cast_byte(~LJ_GC_BLACK)	// black2gray(tab)
+|  mov reg, [DISPATCH+DISPATCH_GL(gc.grayagain)]
+|  mov [DISPATCH+DISPATCH_GL(gc.grayagain)], tab
+|  mov tab->gclist, reg
+|.endmacro
+|
+|//-----------------------------------------------------------------------
+
+/* Generate subroutines used by opcodes and other parts of the VM. */
+/* The .code_sub section should be last to help static branch prediction. */
+static void build_subroutines(BuildCtx *ctx, int cmov)
+{
+  |.code_sub
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Call and return handling -------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// Reminder: A call gate may be called with func/args above L->maxstack,
+  |// i.e. occupying EXTRA_STACK slots. And vmeta_call may add one extra slot,
+  |// too. This means all call gates (L*, C and fast functions) must check
+  |// for stack overflow _before_ adding more slots!
+  |
+  |//-- Call gates ---------------------------------------------------------
+  |
+  |->gate_lf:				// Call gate for fixarg Lua functions.
+  |  // RA = new base, RB = LFUNC, RC = nargs+1, (BASE = old base), PC = return
+  |  // DISPATCH initialized
+  |  mov BASE, RA
+  |  mov PROTO:RB, LFUNC:RB->pt
+  |  mov [BASE-4], PC			// Store caller PC.
+  |  movzx RA, byte PROTO:RB->framesize
+  |  mov PC, PROTO:RB->bc
+  |  mov KBASE, PROTO:RB->k
+  |  mov L:RB, SAVE_L
+  |  lea RA, [BASE+RA*8]		// Top of frame.
+  |  lea RC, [BASE+NARGS:RC*8-4]	// Points to tag of 1st free slot.
+  |  cmp RA, L:RB->maxstack
+  |  ja ->gate_lf_growstack
+  |9:  // Entry point from vararg setup below.
+  |  mov RB, LJ_TNIL
+  |1:  // Clear free slots until top of frame.
+  |  mov [RC], RB
+  |  mov [RC+8], RB
+  |  add RC, 16
+  |  cmp RC, RA
+  |  jb <1
+#if LJ_HASJIT
+  |  // NYI: Disabled, until the tracer supports recursion/upcalls/leaves.
+  |  // hotcall RB
+#endif
+  |  ins_next
+  |
+  |->gate_lv:				// Call gate for vararg Lua functions.
+  |  // RA = new base, RB = LFUNC, RC = nargs+1, (BASE = old base), PC = return
+  |  // DISPATCH initialized
+  |  mov [RA-4], PC			// Store caller PC.
+  |  lea PC, [NARGS:RC*8+FRAME_VARG]
+  |  lea BASE, [RA+PC-FRAME_VARG]
+  |  mov [BASE-8], LFUNC:RB		// Store copy of LFUNC.
+  |  mov PROTO:RB, LFUNC:RB->pt
+  |  mov [BASE-4], PC			// Store delta + FRAME_VARG.
+  |  movzx PC, byte PROTO:RB->framesize
+  |  lea KBASE, [BASE+PC*8]
+  |  mov L:PC, SAVE_L
+  |  lea RC, [BASE+4]
+  |  cmp KBASE, L:PC->maxstack
+  |  ja ->gate_lv_growstack		// Need to grow stack.
+  |  movzx PC, byte PROTO:RB->numparams
+  |  test PC, PC
+  |  jz >2
+  |1:  // Copy fixarg slots up.
+  |  add RA, 8
+  |  cmp RA, BASE
+  |  jnb >2
+  |  mov KBASE, [RA-8]
+  |  mov [RC-4], KBASE
+  |  mov KBASE, [RA-4]
+  |  mov [RC], KBASE
+  |  add RC, 8
+  |  mov dword [RA-4], LJ_TNIL		// Clear old fixarg slot (help the GC).
+  |  sub PC, 1
+  |  jnz <1
+  |2:
+  |  movzx RA, byte PROTO:RB->framesize
+  |  mov PC, PROTO:RB->bc
+  |  mov KBASE, PROTO:RB->k
+  |  lea RA, [BASE+RA*8]
+  |  jmp <9
+  |
+  |->gate_c:				// Call gate for C functions.
+  |  // RA = new base, RB = CFUNC, RC = nargs+1, (BASE = old base), PC = return
+  |  mov [RA-4], PC
+  |  mov KBASE, CFUNC:RB->f
+  |  mov L:RB, SAVE_L
+  |  lea RC, [RA+NARGS:RC*8-8]
+  |  mov L:RB->base, RA
+  |  lea RA, [RC+8*LUA_MINSTACK]
+  |  mov ARG1, L:RB
+  |  mov L:RB->top, RC
+  |  cmp RA, L:RB->maxstack
+  |  ja ->gate_c_growstack		// Need to grow stack.
+  |  set_vmstate C
+  |  call KBASE				// (lua_State *L)
+  |  set_vmstate INTERP
+  |  // nresults returned in eax (RD).
+  |  mov BASE, L:RB->base
+  |  lea RA, [BASE+RD*8]
+  |  neg RA
+  |  add RA, L:RB->top			// RA = (L->top-(L->base+nresults))*8
+  |->vm_returnc:
+  |  add RD, 1				// RD = nresults+1
+  |  mov NRESULTS, RD
+  |  test PC, FRAME_TYPE
+  |  jz ->BC_RET_Z			// Handle regular return to Lua.
+  |
+  |//-- Return handling (non-inline) ---------------------------------------
+  |
+  |->vm_return:
+  |  // BASE = base, RA = resultofs, RD = nresults+1 (= NRESULTS), PC = return
+  |  test PC, FRAME_C
+  |  jz ->vm_returnp
+  |
+  |  // Return to C.
+  |  set_vmstate C
+  |  and PC, -8
+  |  sub PC, BASE
+  |  neg PC				// Previous base = BASE - delta.
+  |
+  |  sub RD, 1
+  |  jz >2
+  |1:
+  |  mov RB, [BASE+RA]			// Move results down.
+  |  mov [BASE-8], RB
+  |  mov RB, [BASE+RA+4]
+  |  mov [BASE-4], RB
+  |  add BASE, 8
+  |  sub RD, 1
+  |  jnz <1
+  |2:
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, PC
+  |3:
+  |  mov RD, NRESULTS
+  |  mov RA, INARG_NRES			// RA = wanted nresults+1
+  |4:
+  |  cmp RA, RD
+  |  jne >6				// More/less results wanted?
+  |5:
+  |  sub BASE, 8
+  |  mov L:RB->top, BASE
+  |
+  |->vm_leave_cp:
+  |  mov RA, SAVE_CFRAME		// Restore previous C frame.
+  |  mov L:RB->cframe, RA
+  |  xor eax, eax			// Ok return status for vm_pcall.
+  |
+  |->vm_leave_unw:
+  |  add esp, CFRAME_SPACE
+  |  restoreregs
+  |  ret
+  |
+  |6:
+  |  jb >7				// Less results wanted?
+  |  // More results wanted. Check stack size and fill up results with nil.
+  |  cmp BASE, L:RB->maxstack
+  |  ja >8
+  |  mov dword [BASE-4], LJ_TNIL
+  |  add BASE, 8
+  |  add RD, 1
+  |  jmp <4
+  |
+  |7:  // Less results wanted.
+  |  test RA, RA
+  |  jz <5				// But check for LUA_MULTRET+1.
+  |  sub RA, RD				// Negative result!
+  |  lea BASE, [BASE+RA*8]		// Correct top.
+  |  jmp <5
+  |
+  |8:  // Corner case: need to grow stack for filling up results.
+  |  // This can happen if:
+  |  // - A C function grows the stack (a lot).
+  |  // - The GC shrinks the stack in between.
+  |  // - A return back from a lua_call() with (high) nresults adjustment.
+  |  mov L:RB->top, BASE		// Save current top held in BASE (yes).
+  |  mov NRESULTS, RD			// Need to fill only remainder with nil.
+  |  mov ARG2, RA			// Grow by wanted nresults+1.
+  |  mov ARG1, L:RB
+  |  call extern lj_state_growstack	// (lua_State *L, int n)
+  |  mov BASE, L:RB->top		// Need the (realloced) L->top in BASE.
+  |  jmp <3
+  |
+  |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
+  |  // (void *cframe, int errcode)
+  |  mov ecx, [esp+4]
+  |  mov eax, [esp+8]			// Error return status for vm_pcall.
+  |  and ecx, CFRAME_RAWMASK
+  |  mov esp, ecx
+  |  mov L:RB, SAVE_L
+  |  mov GL:RB, L:RB->glref
+  |  mov dword GL:RB->vmstate, ~LJ_VMST_C
+  |  jmp ->vm_leave_unw
+  |
+  |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
+  |  mov ecx, [esp+4]
+  |  and ecx, CFRAME_RAWMASK
+  |  mov esp, ecx
+  |  mov L:RB, SAVE_L
+  |  mov RA, -8				// Results start at BASE+RA = BASE-8.
+  |  mov RD, 1+1			// Really 1+2 results, incr. later.
+  |  mov BASE, L:RB->base
+  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  add DISPATCH, GG_G2DISP
+  |  mov PC, [BASE-4]			// Fetch PC of previous frame.
+  |  mov dword [BASE-4], LJ_TFALSE	// Prepend false to error message.
+  |  set_vmstate INTERP
+  |  jmp ->vm_returnc			// Increments RD/NRESULTS and returns.
+  |
+  |->vm_returnp:
+  |  test PC, FRAME_P
+  |  jz ->cont_dispatch
+  |
+  |  // Return from pcall or xpcall fast func.
+  |  and PC, -8
+  |  sub BASE, PC			// Restore caller base.
+  |  lea RA, [RA+PC-8]			// Rebase RA and prepend one result.
+  |  mov PC, [BASE-4]			// Fetch PC of previous frame.
+  |  // Prepending may overwrite the pcall frame, so do it at the end.
+  |  mov dword [BASE+RA+4], LJ_TTRUE	// Prepend true to results.
+  |  jmp ->vm_returnc			// Increments RD/NRESULTS and returns.
+  |
+  |//-- Grow stack on-demand -----------------------------------------------
+  |
+  |->gate_c_growstack:			// Grow stack for C function.
+  |  mov ARG2, LUA_MINSTACK
+  |  jmp >1
+  |
+  |->gate_lv_growstack:			// Grow stack for vararg Lua function.
+  |  sub RC, 8
+  |  mov BASE, RA
+  |  mov RA, KBASE
+  |  mov PC, PROTO:RB->bc
+  |  mov L:RB, SAVE_L
+  |
+  |->gate_lf_growstack:			// Grow stack for fixarg Lua function.
+  |  // BASE = new base, RA = requested top, RC = top (offset +4 bytes)
+  |  // RB = L, PC = first PC of called function (or anything if C function)
+  |  sub RC, 4				// Adjust top.
+  |  sub RA, BASE
+  |  shr RA, 3				// n = pt->framesize - L->top
+  |  add PC, 4				// Must point after first instruction.
+  |  mov L:RB->base, BASE
+  |  mov L:RB->top, RC
+  |  mov SAVE_PC, PC
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |1:
+  |  // L:RB = L, L->base = new base, L->top = top
+  |  // SAVE_PC = initial PC+1 (undefined for C functions)
+  |  call extern lj_state_growstack	// (lua_State *L, int n)
+  |  mov RA, L:RB->base
+  |  mov RC, L:RB->top
+  |  mov LFUNC:RB, [RA-8]
+  |  mov PC, [RA-4]
+  |  sub RC, RA
+  |  shr RC, 3
+  |  add NARGS:RC, 1
+  |  // RA = new base, RB = LFUNC, RC = nargs+1, (BASE = invalid), PC restored.
+  |  jmp aword LFUNC:RB->gate		// Just retry call.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Entry points into the assembler VM ---------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->vm_resume:				// Setup C frame and resume thread.
+  |  // (lua_State *L, StkId base, int nres1 = 0, ptrdiff_t ef = 0)
+  |  saveregs
+  |  mov PC, FRAME_C
+  |  sub esp, CFRAME_SPACE
+  |  xor RD, RD
+  |  mov L:RB, SAVE_L
+  |  lea KBASE, [esp+CFRAME_RESUME]
+  |  mov RA, INARG_BASE
+  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  add DISPATCH, GG_G2DISP
+  |  mov L:RB->cframe, KBASE
+  |  mov SAVE_CFRAME, RD		// Caveat: overlaps INARG_BASE!
+  |  mov SAVE_PC, RD			// Any value outside of bytecode is ok.
+  |  cmp byte L:RB->status, RDL
+  |  je >3				// Initial resume (like a call).
+  |
+  |  // Resume after yield (like a return).
+  |  set_vmstate INTERP
+  |  mov byte L:RB->status, RDL
+  |  mov BASE, L:RB->base
+  |  mov RD, L:RB->top
+  |  sub RD, RA
+  |  shr RD, 3
+  |  add RD, 1				// RD = nresults+1
+  |  sub RA, BASE			// RA = resultofs
+  |  mov PC, [BASE-4]
+  |  mov NRESULTS, RD
+  |  test PC, FRAME_TYPE
+  |  jz ->BC_RET_Z
+  |  jmp ->vm_return
+  |
+  |->vm_pcall:				// Setup protected C frame and enter VM.
+  |  // (lua_State *L, StkId base, int nres1, ptrdiff_t ef)
+  |  saveregs
+  |  mov PC, FRAME_CP
+  |  jmp >1
+  |
+  |->vm_call:				// Setup C frame and enter VM.
+  |  // (lua_State *L, StkId base, int nres1)
+  |  saveregs
+  |  mov PC, FRAME_C
+  |
+  |1:  // Entry point for vm_pcall above (PC = ftype).
+  |  sub esp, CFRAME_SPACE
+  |  mov L:RB, SAVE_L
+  |  mov RA, INARG_BASE
+  |
+  |2:  // Entry point for vm_cpcall below (RA = base, RB = L, PC = ftype).
+  |  mov KBASE, L:RB->cframe		// Add our C frame to cframe chain.
+  |  mov SAVE_CFRAME, KBASE		// Caveat: overlaps INARG_BASE!
+  |  mov SAVE_PC, esp			// Any value outside of bytecode is ok.
+  |  mov L:RB->cframe, esp
+  |
+  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  add DISPATCH, GG_G2DISP
+  |
+  |3:  // Entry point for vm_resume above (RA = base, RB = L, PC = ftype).
+  |  set_vmstate INTERP
+  |  mov BASE, L:RB->base		// BASE = old base (used in vmeta_call).
+  |  add PC, RA
+  |  sub PC, BASE			// PC = frame delta + frame type
+  |
+  |  mov RC, L:RB->top
+  |  sub RC, RA
+  |  shr NARGS:RC, 3
+  |  add NARGS:RC, 1			// RC = nargs+1
+  |
+  |  mov LFUNC:RB, [RA-8]
+  |  cmp dword [RA-4], LJ_TFUNC
+  |  jne ->vmeta_call			// Ensure KBASE defined and != BASE.
+  |  jmp aword LFUNC:RB->gate
+  |  // RA = new base, RB = LFUNC/CFUNC, RC = nargs+1.
+  |
+  |->vm_cpcall:				// Setup protected C frame, call C.
+  |  // (lua_State *L, lua_CPFunction cp, lua_CFunction func, void *ud)
+  |  saveregs
+  |  sub esp, CFRAME_SPACE
+  |
+  |  mov L:RB, SAVE_L
+  |  mov RC, INARG_CP_UD
+  |  mov RA, INARG_CP_FUNC
+  |  mov BASE, INARG_CP_CALL
+  |  mov SAVE_PC, esp			// Any value outside of bytecode is ok.
+  |
+  |  // Caveat: INARG_P_* and INARG_CP_* overlap!
+  |  mov KBASE, L:RB->stack		// Compute -savestack(L, L->top).
+  |  sub KBASE, L:RB->top
+  |  mov INARG_P_ERRF, 0		// No error function.
+  |  mov INARG_NRES, KBASE		// Neg. delta means cframe w/o frame.
+  |  // Handler may change cframe_nres(L->cframe) or cframe_errfunc(L->cframe).
+  |
+  |  mov ARG3, RC
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |
+  |  mov KBASE, L:RB->cframe		// Add our C frame to cframe chain.
+  |  mov SAVE_CFRAME, KBASE		// Caveat: overlaps INARG_CP_CALL!
+  |  mov L:RB->cframe, esp
+  |
+  |  call BASE			// (lua_State *L, lua_CFunction func, void *ud)
+  |  // StkId (new base) or NULL returned in eax (RC).
+  |  test RC, RC
+  |  jz ->vm_leave_cp			// No base? Just remove C frame.
+  |  mov RA, RC
+  |  mov PC, FRAME_CP
+  |  jmp <2				// Else continue with the call.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Metamethod handling ------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |//-- Continuation dispatch ----------------------------------------------
+  |
+  |->cont_dispatch:
+  |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in NRESULTS)
+  |  add RA, BASE
+  |  and PC, -8
+  |  mov RB, BASE
+  |  sub BASE, PC			// Restore caller BASE.
+  |  mov dword [RA+RD*8-4], LJ_TNIL	// Ensure one valid arg.
+  |  mov RC, RA				// ... in [RC]
+  |  mov PC, [RB-12]			// Restore PC from [cont|PC].
+  |  mov LFUNC:KBASE, [BASE-8]
+  |  mov PROTO:KBASE, LFUNC:KBASE->pt
+  |  mov KBASE, PROTO:KBASE->k
+  |  // BASE = base, RC = result, RB = meta base
+  |  jmp dword [RB-16]			// Jump to continuation.
+  |
+  |->cont_cat:				// BASE = base, RC = result, RB = mbase
+  |  movzx RA, PC_RB
+  |  sub RB, 16
+  |  lea RA, [BASE+RA*8]
+  |  sub RA, RB
+  |  je ->cont_ra
+  |  neg RA
+  |  shr RA, 3
+  |  mov ARG3, RA
+  |  mov RA, [RC+4]
+  |  mov RC, [RC]
+  |  mov [RB+4], RA
+  |  mov [RB], RC
+  |  mov ARG2, RB
+  |  jmp ->BC_CAT_Z
+  |
+  |//-- Table indexing metamethods -----------------------------------------
+  |
+  |->vmeta_tgets:
+  |  mov ARG5, RC			// RC = GCstr *
+  |  mov ARG6, LJ_TSTR
+  |  lea RC, ARG5			// Store temp. TValue in ARG5/ARG6.
+  |  cmp PC_OP, BC_GGET
+  |  jne >1
+  |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
+  |  mov [RA], TAB:RB			// RB = GCtab *
+  |  mov dword [RA+4], LJ_TTAB
+  |  mov RB, RA
+  |  jmp >2
+  |
+  |->vmeta_tgetb:
+  |  movzx RC, PC_RC			// Ugly, cannot fild from a byte.
+  |  mov ARG4, RC
+  |  fild ARG4
+  |  fstp FPARG5
+  |  lea RC, ARG5			// Store temp. TValue in ARG5/ARG6.
+  |  jmp >1
+  |
+  |->vmeta_tgetv:
+  |  movzx RC, PC_RC			// Reload TValue *k from RC.
+  |  lea RC, [BASE+RC*8]
+  |1:
+  |  movzx RB, PC_RB			// Reload TValue *t from RB.
+  |  lea RB, [BASE+RB*8]
+  |2:
+  |  mov ARG2, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG3, RC
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_tget		// (lua_State *L, TValue *o, TValue *k)
+  |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base
+  |  test RC, RC
+  |  jz >3
+  |->cont_ra:				// BASE = base, RC = result
+  |  movzx RA, PC_RA
+  |  mov RB, [RC+4]
+  |  mov RC, [RC]
+  |  mov [BASE+RA*8+4], RB
+  |  mov [BASE+RA*8], RC
+  |  ins_next
+  |
+  |3:  // Call __index metamethod.
+  |  // BASE = base, L->top = new base, stack = cont/func/t/k
+  |  mov RA, L:RB->top
+  |  mov [RA-12], PC			// [cont|PC]
+  |  lea PC, [RA+FRAME_CONT]
+  |  sub PC, BASE
+  |  mov LFUNC:RB, [RA-8]		// Guaranteed to be a function here.
+  |  mov NARGS:RC, 3			// 2+1 args for func(t, k).
+  |  jmp aword LFUNC:RB->gate
+  |
+  |//-----------------------------------------------------------------------
+  |
+  |->vmeta_tsets:
+  |  mov ARG5, RC			// RC = GCstr *
+  |  mov ARG6, LJ_TSTR
+  |  lea RC, ARG5			// Store temp. TValue in ARG5/ARG6.
+  |  cmp PC_OP, BC_GSET
+  |  jne >1
+  |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
+  |  mov [RA], TAB:RB			// RB = GCtab *
+  |  mov dword [RA+4], LJ_TTAB
+  |  mov RB, RA
+  |  jmp >2
+  |
+  |->vmeta_tsetb:
+  |  movzx RC, PC_RC			// Ugly, cannot fild from a byte.
+  |  mov ARG4, RC
+  |  fild ARG4
+  |  fstp FPARG5
+  |  lea RC, ARG5			// Store temp. TValue in ARG5/ARG6.
+  |  jmp >1
+  |
+  |->vmeta_tsetv:
+  |  movzx RC, PC_RC			// Reload TValue *k from RC.
+  |  lea RC, [BASE+RC*8]
+  |1:
+  |  movzx RB, PC_RB			// Reload TValue *t from RB.
+  |  lea RB, [BASE+RB*8]
+  |2:
+  |  mov ARG2, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG3, RC
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_tset		// (lua_State *L, TValue *o, TValue *k)
+  |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base
+  |  test RC, RC
+  |  jz >3
+  |  // NOBARRIER: lj_meta_tset ensures the table is not black.
+  |  movzx RA, PC_RA
+  |  mov RB, [BASE+RA*8+4]
+  |  mov RA, [BASE+RA*8]
+  |  mov [RC+4], RB
+  |  mov [RC], RA
+  |->cont_nop:				// BASE = base, (RC = result)
+  |  ins_next
+  |
+  |3:  // Call __newindex metamethod.
+  |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
+  |  mov RA, L:RB->top
+  |  mov [RA-12], PC			// [cont|PC]
+  |  movzx RC, PC_RA
+  |  mov RB, [BASE+RC*8+4]		// Copy value to third argument.
+  |  mov RC, [BASE+RC*8]
+  |  mov [RA+20], RB
+  |  mov [RA+16], RC
+  |  lea PC, [RA+FRAME_CONT]
+  |  sub PC, BASE
+  |  mov LFUNC:RB, [RA-8]		// Guaranteed to be a function here.
+  |  mov NARGS:RC, 4			// 3+1 args for func(t, k, v).
+  |  jmp aword LFUNC:RB->gate
+  |
+  |//-- Comparison metamethods ---------------------------------------------
+  |
+  |->vmeta_comp:
+  |  movzx RB, PC_OP
+  |  lea RD, [BASE+RD*8]
+  |  lea RA, [BASE+RA*8]
+  |  mov ARG4, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG3, RD
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_comp	// (lua_State *L, TValue *o1, *o2, int op)
+  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
+  |3:
+  |  mov BASE, L:RB->base
+  |  cmp RC, 1
+  |  ja ->vmeta_binop
+  |4:
+  |  lea PC, [PC+4]
+  |  jb >6
+  |5:
+  |  movzx RD, PC_RD
+  |  branchPC RD
+  |6:
+  |  ins_next
+  |
+  |->cont_condt:			// BASE = base, RC = result
+  |  add PC, 4
+  |  cmp dword [RC+4], LJ_TISTRUECOND	// Branch if result is true.
+  |  jb <5
+  |  jmp <6
+  |
+  |->cont_condf:			// BASE = base, RC = result
+  |  cmp dword [RC+4], LJ_TISTRUECOND	// Branch if result is false.
+  |  jmp <4
+  |
+  |->vmeta_equal:
+  |  mov ARG4, RB
+  |  mov L:RB, SAVE_L
+  |  sub PC, 4
+  |  mov ARG3, RD
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_equal	// (lua_State *L, GCobj *o1, *o2, int ne)
+  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
+  |  jmp <3
+  |
+  |//-- Arithmetic metamethods ---------------------------------------------
+  |
+  |->vmeta_arith_vn:
+  |  lea RC, [KBASE+RC*8]
+  |  jmp >1
+  |
+  |->vmeta_arith_nv:
+  |  lea RC, [KBASE+RC*8]
+  |  lea RB, [BASE+RB*8]
+  |  xchg RB, RC
+  |  jmp >2
+  |
+  |->vmeta_unm:
+  |  lea RC, [BASE+RD*8]
+  |  mov RB, RC
+  |  jmp >2
+  |
+  |->vmeta_arith_vv:
+  |  lea RC, [BASE+RC*8]
+  |1:
+  |  lea RB, [BASE+RB*8]
+  |2:
+  |  lea RA, [BASE+RA*8]
+  |  mov ARG3, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG4, RC
+  |  movzx RC, PC_OP
+  |  mov ARG2, RA
+  |  mov ARG5, RC
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_arith	// (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
+  |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base
+  |  test RC, RC
+  |  jz ->cont_nop
+  |
+  |  // Call metamethod for binary op.
+  |->vmeta_binop:
+  |  // BASE = base, RC = new base, stack = cont/func/o1/o2
+  |  mov RA, RC
+  |  sub RC, BASE
+  |  mov [RA-12], PC			// [cont|PC]
+  |  lea PC, [RC+FRAME_CONT]
+  |  mov LFUNC:RB, [RA-8]
+  |  mov NARGS:RC, 3			// 2+1 args for func(o1, o2).
+  |  cmp dword [RA-4], LJ_TFUNC
+  |  jne ->vmeta_call
+  |  jmp aword LFUNC:RB->gate
+  |
+  |->vmeta_len:
+  |  lea RD, [BASE+RD*8]
+  |  mov L:RB, SAVE_L
+  |  mov ARG2, RD
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_len		// (lua_State *L, TValue *o)
+  |  // TValue * (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base
+  |  jmp ->vmeta_binop			// Binop call for compatibility.
+  |
+  |//-- Call metamethod ----------------------------------------------------
+  |
+  |->vmeta_call:			// Resolve and call __call metamethod.
+  |  // RA = new base, RC = nargs+1, BASE = old base, PC = return
+  |  mov ARG4, RA			// Save RA, RC for us.
+  |  mov ARG5, NARGS:RC
+  |  sub RA, 8
+  |  lea RC, [RA+NARGS:RC*8]
+  |  mov L:RB, SAVE_L
+  |  mov ARG2, RA
+  |  mov ARG3, RC
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE		// This is the callers base!
+  |  call extern lj_meta_call	// (lua_State *L, TValue *func, TValue *top)
+  |  mov BASE, L:RB->base
+  |  mov RA, ARG4
+  |  mov NARGS:RC, ARG5
+  |  mov LFUNC:RB, [RA-8]
+  |  add NARGS:RC, 1
+  |  // This is fragile. L->base must not move, KBASE must always be defined.
+  |  cmp KBASE, BASE			// Continue with CALLT if flag set.
+  |  je ->BC_CALLT_Z
+  |  jmp aword LFUNC:RB->gate		// Otherwise call resolved metamethod.
+  |
+  |//-- Argument coercion for 'for' statement ------------------------------
+  |
+  |->vmeta_for:
+  |  mov L:RB, SAVE_L
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_meta_for	// (lua_State *L, StkId base)
+  |  mov BASE, L:RB->base
+  |  mov RC, [PC-4]
+  |  movzx RA, RCH
+  |  movzx OP, RCL
+  |  shr RC, 16
+  |  jmp aword [DISPATCH+OP*4+GG_DISP_STATIC*4]	// Retry FORI or JFORI.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Fast functions -----------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |.macro .ffunc, name
+  |->ff_ .. name:
+  |.endmacro
+  |
+  |.macro .ffunc_1, name
+  |->ff_ .. name:
+  |  cmp NARGS:RC, 1+1;  jb ->fff_fallback
+  |.endmacro
+  |
+  |.macro .ffunc_2, name
+  |->ff_ .. name:
+  |  cmp NARGS:RC, 2+1;  jb ->fff_fallback
+  |.endmacro
+  |
+  |.macro .ffunc_n, name
+  |  .ffunc_1 name
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA]
+  |.endmacro
+  |
+  |.macro .ffunc_n, name, op
+  |  .ffunc_1 name
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  op
+  |  fld qword [RA]
+  |.endmacro
+  |
+  |.macro .ffunc_nn, name
+  |  .ffunc_2 name
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  cmp dword [RA+12], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA]
+  |  fld qword [RA+8]
+  |.endmacro
+  |
+  |.macro .ffunc_nnr, name
+  |  .ffunc_2 name
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  cmp dword [RA+12], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA+8]
+  |  fld qword [RA]
+  |.endmacro
+  |
+  |// Inlined GC threshold check. Caveat: uses label 1.
+  |.macro ffgccheck
+  |  mov RB, [DISPATCH+DISPATCH_GL(gc.total)]
+  |  cmp RB, [DISPATCH+DISPATCH_GL(gc.threshold)]
+  |  jb >1
+  |  call ->fff_gcstep
+  |1:
+  |.endmacro
+  |
+  |//-- Base library: checks -----------------------------------------------
+  |
+  |.ffunc_1 assert
+  |  mov RB, [RA+4]
+  |  cmp RB, LJ_TISTRUECOND;  jae ->fff_fallback
+  |  mov NRESULTS, RD
+  |  mov [RA-4], RB
+  |  mov RB, [RA]
+  |  mov [RA-8], RB
+  |  sub RD, 2
+  |  jz >2
+  |  mov ARG1, RA
+  |1:
+  |  add RA, 8
+  |  mov RB, [RA+4]
+  |  mov [RA-4], RB
+  |  mov RB, [RA]
+  |  mov [RA-8], RB
+  |  sub RD, 1
+  |  jnz <1
+  |  mov RA, ARG1
+  |2:
+  |  mov RD, NRESULTS
+  |  jmp ->fff_res_
+  |
+  |.ffunc_1 type
+  |  mov RB, [RA+4]
+  |  mov RC, ~LJ_TNUMX
+  |  not RB
+  |  cmp RC, RB
+  ||if (cmov) {
+  |  cmova RC, RB
+  ||} else {
+  |  jbe >1; mov RC, RB; 1:
+  ||}
+  |  mov CFUNC:RB, [RA-8]
+  |  mov STR:RC, [CFUNC:RB+RC*8+((char *)(&((GCfuncC *)0)->upvalue))]
+  |  mov dword [RA-4], LJ_TSTR
+  |  mov [RA-8], STR:RC
+  |  jmp ->fff_res1
+  |
+  |//-- Base library: getters and setters ---------------------------------
+  |
+  |.ffunc_1 getmetatable
+  |  mov RB, [RA+4]
+  |  cmp RB, LJ_TTAB;  jne >6
+  |1:  // Field metatable must be at same offset for GCtab and GCudata!
+  |  mov TAB:RB, [RA]
+  |  mov TAB:RB, TAB:RB->metatable
+  |2:
+  |  test TAB:RB, TAB:RB
+  |  mov dword [RA-4], LJ_TNIL
+  |  jz ->fff_res1
+  |  mov CFUNC:RC, [RA-8]
+  |  mov STR:RC, [DISPATCH+DISPATCH_GL(mmname)+4*MM_metatable]
+  |  mov dword [RA-4], LJ_TTAB		// Store metatable as default result.
+  |  mov [RA-8], TAB:RB
+  |  mov ARG1, RA			// Save result pointer.
+  |  mov RA, TAB:RB->hmask
+  |  and RA, STR:RC->hash
+  |  imul RA, #NODE
+  |  add NODE:RA, TAB:RB->node
+  |3:  // Rearranged logic, because we expect _not_ to find the key.
+  |  cmp dword NODE:RA->key.it, LJ_TSTR
+  |  jne >4
+  |  cmp dword NODE:RA->key.gcr, STR:RC
+  |  je >5
+  |4:
+  |  mov NODE:RA, NODE:RA->next
+  |  test NODE:RA, NODE:RA
+  |  jnz <3
+  |  jmp ->fff_res1			// Not found, keep default result.
+  |5:
+  |  mov RB, [RA+4]
+  |  cmp RB, LJ_TNIL;  je ->fff_res1	// Dito for nil value.
+  |  mov RC, [RA]
+  |  mov RA, ARG1			// Restore result pointer.
+  |  mov [RA-4], RB			// Return value of mt.__metatable.
+  |  mov [RA-8], RC
+  |  jmp ->fff_res1
+  |
+  |6:
+  |  cmp RB, LJ_TUDATA;  je <1
+  |  cmp RB, LJ_TISNUM;  ja >7
+  |  mov RB, LJ_TNUMX
+  |7:
+  |  not RB
+  |  mov TAB:RB, [DISPATCH+RB*4+DISPATCH_GL(basemt)]
+  |  jmp <2
+  |
+  |.ffunc_2 setmetatable
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  // Fast path: no mt for table yet and not clearing the mt.
+  |  mov TAB:RB, [RA]
+  |  cmp dword TAB:RB->metatable, 0;  jne ->fff_fallback
+  |  cmp dword [RA+12], LJ_TTAB;  jne ->fff_fallback
+  |  mov TAB:RC, [RA+8]
+  |  mov TAB:RB->metatable, TAB:RC
+  |  mov dword [RA-4], LJ_TTAB		// Return original table.
+  |  mov [RA-8], TAB:RB
+  |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+  |  jz >1
+  |  // Possible write barrier. Table is black, but skip iswhite(mt) check.
+  |  barrierback TAB:RB, RC
+  |1:
+  |  jmp ->fff_res1
+  |
+  |.ffunc_2 rawget
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  mov TAB:RC, [RA]
+  |  mov L:RB, SAVE_L
+  |  mov ARG2, TAB:RC
+  |  mov ARG1, L:RB
+  |  mov RB, RA
+  |  mov ARG4, BASE			// Save BASE and RA.
+  |  add RA, 8
+  |  mov ARG3, RA
+  |  call extern lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
+  |  // cTValue * returned in eax (RC).
+  |  mov RA, RB
+  |  mov BASE, ARG4
+  |  mov RB, [RC]			// Copy table slot.
+  |  mov RC, [RC+4]
+  |  mov [RA-8], RB
+  |  mov [RA-4], RC
+  |  jmp ->fff_res1
+  |
+  |//-- Base library: conversions ------------------------------------------
+  |
+  |.ffunc tonumber
+  |  // Only handles the number case inline (without a base argument).
+  |  cmp NARGS:RC, 1+1;  jne ->fff_fallback	// Exactly one argument.
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA]
+  |  jmp ->fff_resn
+  |
+  |.ffunc_1 tostring
+  |  // Only handles the string or number case inline.
+  |  cmp dword [RA+4], LJ_TSTR;  jne >3
+  |  // A __tostring method in the string base metatable is ignored.
+  |  mov STR:RC, [RA]
+  |2:
+  |  mov dword [RA-4], LJ_TSTR
+  |  mov [RA-8], STR:RC
+  |  jmp ->fff_res1
+  |3:  // Handle numbers inline, unless a number base metatable is present.
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  cmp dword [DISPATCH+DISPATCH_GL(basemt)+4*(~LJ_TNUMX)], 0
+  |  jne ->fff_fallback
+  |  ffgccheck				// Caveat: uses label 1.
+  |  mov L:RB, SAVE_L
+  |  mov ARG1, L:RB
+  |  mov ARG2, RA
+  |  mov L:RB->base, RA			// Add frame since C call can throw.
+  |  mov [RA-4], PC
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |  mov ARG3, BASE			// Save BASE.
+  |  call extern lj_str_fromnum		// (lua_State *L, lua_Number *np)
+  |  // GCstr returned in eax (RC).
+  |  mov RA, L:RB->base
+  |  mov BASE, ARG3
+  |  jmp <2
+  |
+  |//-- Base library: iterators -------------------------------------------
+  |
+  |.ffunc_1 next
+  |  je >2				// Missing 2nd arg?
+  |1:
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  mov TAB:RB, [RA]
+  |  mov ARG2, TAB:RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, RA			// Add frame since C call can throw.
+  |  mov [RA-4], PC
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |  mov ARG4, BASE			// Save BASE.
+  |  add RA, 8
+  |  mov ARG3, RA
+  |  call extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
+  |  // Flag returned in eax (RC).
+  |  mov RA, L:RB->base
+  |  mov BASE, ARG4
+  |  test RC, RC;  jz >3		// End of traversal?
+  |  mov RB, [RA+8]			// Copy key and value to results.
+  |  mov RC, [RA+12]
+  |  mov [RA-8], RB
+  |  mov [RA-4], RC
+  |  mov RB, [RA+16]
+  |  mov RC, [RA+20]
+  |  mov [RA], RB
+  |  mov [RA+4], RC
+  |->fff_res2:
+  |  mov RD, 1+2
+  |  jmp ->fff_res
+  |2:  // Set missing 2nd arg to nil.
+  |  mov dword [RA+12], LJ_TNIL
+  |  jmp <1
+  |3:  // End of traversal: return nil.
+  |  mov dword [RA-4], LJ_TNIL
+  |  jmp ->fff_res1
+  |
+  |.ffunc_1 pairs
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  mov CFUNC:RC, CFUNC:RB->upvalue[0]
+  |  mov dword [RA-4], LJ_TFUNC
+  |  mov [RA-8], CFUNC:RC
+  |  mov dword [RA+12], LJ_TNIL
+  |  mov RD, 1+3
+  |  jmp ->fff_res
+  |
+  |.ffunc_1 ipairs_aux
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  cmp dword [RA+12], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA+8]
+  |  fld1
+  |  faddp st1
+  |  fist ARG2
+  |  fstp qword [RA-8]
+  |  mov TAB:RB, [RA]
+  |  mov RC, ARG2
+  |  cmp RC, TAB:RB->asize;  jae >2	// Not in array part?
+  |  shl RC, 3
+  |  add RC, TAB:RB->array
+  |1:
+  |  cmp dword [RC+4], LJ_TNIL;  je ->fff_res0
+  |  mov RB, [RC]			// Copy array slot.
+  |  mov RC, [RC+4]
+  |  mov [RA], RB
+  |  mov [RA+4], RC
+  |  jmp ->fff_res2
+  |2:  // Check for empty hash part first. Otherwise call C function.
+  |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
+  |  mov ARG1, TAB:RB
+  |  mov ARG3, BASE			// Save BASE and RA.
+  |  mov RB, RA
+  |  call extern lj_tab_getinth		// (GCtab *t, int32_t key)
+  |  // cTValue * or NULL returned in eax (RC).
+  |  mov RA, RB
+  |  mov BASE, ARG3
+  |  test RC, RC
+  |  jnz <1
+  |->fff_res0:
+  |  mov RD, 1+0
+  |  jmp ->fff_res
+  |
+  |.ffunc_1 ipairs
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  mov CFUNC:RC, CFUNC:RB->upvalue[0]
+  |  mov dword [RA-4], LJ_TFUNC
+  |  mov [RA-8], CFUNC:RC
+  |  fldz
+  |  fstp qword [RA+8]
+  |  mov RD, 1+3
+  |  jmp ->fff_res
+  |
+  |//-- Base library: catch errors ----------------------------------------
+  |
+  |.ffunc_1 pcall
+  |  mov [RA-4], PC
+  |  mov PC, 8+FRAME_PCALL
+  |  mov BASE, RA
+  |  add RA, 8
+  |  sub NARGS:RC, 1
+  |  mov LFUNC:RB, [RA-8]
+  |1:
+  |  test byte [DISPATCH+DISPATCH_GL(hookmask)], HOOK_ACTIVE
+  |  jnz >3				// Hook active before pcall?
+  |2:
+  |  cmp dword [RA-4], LJ_TFUNC
+  |  jne ->vmeta_call			// Ensure KBASE defined and != BASE.
+  |  jmp aword LFUNC:RB->gate
+  |3:
+  |  add PC, 1				// Use FRAME_PCALLH if hook was active.
+  |  jmp <2
+  |
+  |.ffunc_2 xpcall
+  |  cmp dword [RA+12], LJ_TFUNC;  jne ->fff_fallback
+  |  mov [RA-4], PC
+  |  mov RB, [RA+4]			// Swap function and traceback.
+  |  mov [RA+12], RB
+  |  mov dword [RA+4], LJ_TFUNC
+  |  mov LFUNC:RB, [RA]
+  |  mov PC, [RA+8]
+  |  mov [RA+8], LFUNC:RB
+  |  mov [RA], PC
+  |  mov PC, 2*8+FRAME_PCALL
+  |  mov BASE, RA
+  |  add RA, 2*8
+  |  sub NARGS:RC, 2
+  |  jmp <1
+  |
+  |//-- Coroutine library --------------------------------------------------
+  |
+  |.macro coroutine_resume_wrap, resume
+  |9:  // Need to restore PC for fallback handler.
+  |  mov PC, SAVE_PC
+  |  jmp ->fff_fallback
+  |
+  |.if resume
+  |.ffunc_1 coroutine_resume
+  |  mov L:RB, [RA]
+  |.else
+  |.ffunc coroutine_wrap_aux
+  |  mov L:RB, CFUNC:RB->upvalue[0].gcr
+  |.endif
+  |  mov [RA-4], PC
+  |  mov SAVE_PC, PC
+  |  mov ARG1, L:RB
+  |.if resume
+  |  cmp dword [RA+4], LJ_TTHREAD;  jne <9
+  |.endif
+  |  cmp aword L:RB->cframe, 0; jne <9
+  |  cmp byte L:RB->status, LUA_YIELD;  ja <9
+  |  mov PC, L:RB->top
+  |  mov ARG2, PC
+  |  je >1				// Status != LUA_YIELD (i.e. 0)?
+  |  cmp PC, L:RB->base; je <9		// Check for presence of initial func.
+  |1:
+  |.if resume
+  |  lea PC, [PC+NARGS:RC*8-16]		// Check stack space (-1-thread).
+  |.else
+  |  lea PC, [PC+NARGS:RC*8-8]		// Check stack space (-1).
+  |.endif
+  |  cmp PC, L:RB->maxstack; ja <9
+  |  mov L:RB->top, PC
+  |
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, RA
+  |.if resume
+  |  add RA, 8				// Keep resumed thread in stack for GC.
+  |.endif
+  |  mov L:RB->top, RA
+  |  mov RB, ARG2
+  |.if resume
+  |  lea RA, [RA+NARGS:RC*8-24]		// RA = end of source for stack move.
+  |.else
+  |  lea RA, [RA+NARGS:RC*8-16]		// RA = end of source for stack move.
+  |.endif
+  |  sub RA, PC				// Relative to PC.
+  |
+  |  cmp PC, RB
+  |  je >3
+  |2:  // Move args to coroutine.
+  |  mov RC, [PC+RA+4]
+  |  mov [PC-4], RC
+  |  mov RC, [PC+RA]
+  |  mov [PC-8], RC
+  |  sub PC, 8
+  |  cmp PC, RB
+  |  jne <2
+  |3:
+  |  xor RA, RA
+  |  mov ARG4, RA
+  |  mov ARG3, RA
+  |  call ->vm_resume			// (lua_State *L, StkId base, 0, 0)
+  |  set_vmstate INTERP
+  |
+  |  mov L:RB, SAVE_L
+  |  mov L:PC, ARG1			// The callee doesn't modify SAVE_L.
+  |  mov BASE, L:RB->base
+  |  cmp eax, LUA_YIELD
+  |  ja >8
+  |4:
+  |  mov RA, L:PC->base
+  |  mov KBASE, L:PC->top
+  |  mov L:PC->top, RA			// Clear coroutine stack.
+  |  mov PC, KBASE
+  |  sub PC, RA
+  |  je >6				// No results?
+  |  lea RD, [BASE+PC]
+  |  shr PC, 3
+  |  cmp RD, L:RB->maxstack
+  |  ja >9				// Need to grow stack?
+  |
+  |  mov RB, BASE
+  |  sub RB, RA
+  |5:  // Move results from coroutine.
+  |  mov RD, [RA]
+  |  mov [RA+RB], RD
+  |  mov RD, [RA+4]
+  |  mov [RA+RB+4], RD
+  |  add RA, 8
+  |  cmp RA, KBASE
+  |  jne <5
+  |6:
+  |.if resume
+  |  lea RD, [PC+2]			// nresults+1 = 1 + true + results.
+  |  mov dword [BASE-4], LJ_TTRUE	// Prepend true to results.
+  |.else
+  |  lea RD, [PC+1]			// nresults+1 = 1 + results.
+  |.endif
+  |7:
+  |  mov PC, SAVE_PC
+  |  mov NRESULTS, RD
+  |.if resume
+  |  mov RA, -8
+  |.else
+  |  xor RA, RA
+  |.endif
+  |  test PC, FRAME_TYPE
+  |  jz ->BC_RET_Z
+  |  jmp ->vm_return
+  |
+  |8:  // Coroutine returned with error (at co->top-1).
+  |.if resume
+  |  mov dword [BASE-4], LJ_TFALSE	// Prepend false to results.
+  |  mov RA, L:PC->top
+  |  sub RA, 8
+  |  mov L:PC->top, RA			// Clear error from coroutine stack.
+  |  mov RD, [RA]			// Copy error message.
+  |  mov [BASE], RD
+  |  mov RD, [RA+4]
+  |  mov [BASE+4], RD
+  |  mov RD, 1+2			// nresults+1 = 1 + false + error.
+  |  jmp <7
+  |.else
+  |  mov ARG2, L:PC
+  |  mov ARG1, L:RB
+  |  call extern lj_ffh_coroutine_wrap_err  // (lua_State *L, lua_State *co)
+  |  // Error function does not return.
+  |.endif
+  |
+  |9:  // Handle stack expansion on return from yield.
+  |  mov L:RA, ARG1			// The callee doesn't modify SAVE_L.
+  |  mov L:RA->top, KBASE		// Undo coroutine stack clearing.
+  |  mov ARG2, PC
+  |  mov ARG1, L:RB
+  |  call extern lj_state_growstack	// (lua_State *L, int n)
+  |  mov BASE, L:RB->base
+  |  jmp <4				// Retry the stack move.
+  |.endmacro
+  |
+  |  coroutine_resume_wrap 1		// coroutine.resume
+  |  coroutine_resume_wrap 0		// coroutine.wrap
+  |
+  |.ffunc coroutine_yield
+  |  mov L:RB, SAVE_L
+  |  mov [RA-4], PC
+  |  test aword L:RB->cframe, CFRAME_CANYIELD
+  |  jz ->fff_fallback
+  |  mov L:RB->base, RA
+  |  lea RC, [RA+NARGS:RC*8-8]
+  |  mov L:RB->top, RC
+  |  xor eax, eax
+  |  mov aword L:RB->cframe, eax
+  |  mov al, LUA_YIELD
+  |  mov byte L:RB->status, al
+  |  jmp ->vm_leave_unw
+  |
+  |//-- Math library -------------------------------------------------------
+  |
+  |.ffunc_n math_abs
+  |  fabs
+  |  // fallthrough
+  |->fff_resn:
+  |  fstp qword [RA-8]
+  |->fff_res1:
+  |  mov RD, 1+1
+  |->fff_res:
+  |  mov NRESULTS, RD
+  |->fff_res_:
+  |  test PC, FRAME_TYPE
+  |  jnz >7
+  |5:
+  |  cmp PC_RB, RDL			// More results expected?
+  |  ja >6
+  |  // BASE and KBASE are assumed to be set for the calling frame.
+  |  ins_next
+  |
+  |6:  // Fill up results with nil.
+  |  mov dword [RA+RD*8-12], LJ_TNIL
+  |  add RD, 1
+  |  jmp <5
+  |
+  |7:  // Non-standard return case.
+  |  mov BASE, RA
+  |  mov RA, -8				// Results start at BASE+RA = BASE-8.
+  |  jmp ->vm_return
+  |
+  |.ffunc_n math_floor;	call ->vm_floor;	jmp ->fff_resn
+  |.ffunc_n math_ceil;	call ->vm_ceil;		jmp ->fff_resn
+  |
+  |.ffunc_n math_sqrt;	fsqrt;			jmp ->fff_resn
+  |
+  |.ffunc_n math_log, fldln2;	fyl2x;		jmp ->fff_resn
+  |.ffunc_n math_log10, fldlg2;	fyl2x;		jmp ->fff_resn
+  |.ffunc_n math_exp;	call ->vm_exp;		jmp ->fff_resn
+  |
+  |.ffunc_n math_sin;	fsin;			jmp ->fff_resn
+  |.ffunc_n math_cos;	fcos;			jmp ->fff_resn
+  |.ffunc_n math_tan;	fptan; fpop;		jmp ->fff_resn
+  |
+  |.ffunc_n math_asin
+  |  fdup; fmul st0; fld1; fsubrp st1; fsqrt; fpatan
+  |  jmp ->fff_resn
+  |.ffunc_n math_acos
+  |  fdup; fmul st0; fld1; fsubrp st1; fsqrt; fxch; fpatan
+  |  jmp ->fff_resn
+  |.ffunc_n math_atan;	fld1; fpatan;		jmp ->fff_resn
+  |
+  |.macro math_extern, func
+  |.ffunc_n math_ .. func
+  |  mov ARG5, RA
+  |  fstp FPARG1
+  |  mov RB, BASE
+  |  call extern func
+  |  mov RA, ARG5
+  |  mov BASE, RB
+  |  jmp ->fff_resn
+  |.endmacro
+  |
+  |  math_extern sinh
+  |  math_extern cosh
+  |  math_extern tanh
+  |
+  |->ff_math_deg:
+  |.ffunc_n math_rad;	fmul qword CFUNC:RB->upvalue[0];	jmp ->fff_resn
+  |
+  |.ffunc_nn math_atan2;	fpatan;		jmp ->fff_resn
+  |.ffunc_nnr math_ldexp;	fscale; fpop1;	jmp ->fff_resn
+  |
+  |.ffunc_1 math_frexp
+  |  mov RB, [RA+4]
+  |  cmp RB, LJ_TISNUM;  ja ->fff_fallback
+  |  mov RC, [RA]
+  |  mov [RA-4], RB; mov [RA-8], RC
+  |  shl RB, 1; cmp RB, 0xffe00000; jae >3
+  |  or RC, RB; jz >3
+  |  mov RC, 1022
+  |  cmp RB, 0x00200000; jb >4
+  |1:
+  |  shr RB, 21; sub RB, RC		// Extract and unbias exponent.
+  |  mov ARG1, RB; fild ARG1
+  |  mov RB, [RA-4]
+  |  and RB, 0x800fffff			// Mask off exponent.
+  |  or RB, 0x3fe00000			// Put mantissa in range [0.5,1) or 0.
+  |  mov [RA-4], RB
+  |2:
+  |  fstp qword [RA]
+  |  mov RD, 1+2
+  |  jmp ->fff_res
+  |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
+  |  fldz; jmp <2
+  |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
+  |  fld qword [RA]
+  |  mov ARG1, 0x5a800000; fmul ARG1	// x = x*2^54
+  |  fstp qword [RA-8]
+  |  mov RB, [RA-4]; mov RC, 1076; shl RB, 1; jmp <1
+  |
+  |.ffunc_n math_modf
+  |  mov RB, [RA+4]
+  |  shl RB, 1; cmp RB, 0xffe00000; je >4	// +-Inf?
+  |  fdup
+  |  call ->vm_trunc
+  |  fsub st1, st0
+  |1:
+  |  fstp qword [RA-8]; fstp qword [RA]
+  |  mov RC, [RA-4]; mov RB, [RA+4]
+  |  xor RC, RB; js >3				// Need to adjust sign?
+  |2:
+  |  mov RD, 1+2
+  |  jmp ->fff_res
+  |3:
+  |  xor RB, 0x80000000; mov [RA+4], RB; jmp <2	// Flip sign of fraction.
+  |4:
+  |  fldz; fxch; jmp <1				// Return +-Inf and +-0.
+  |
+  |.ffunc_nnr math_fmod
+  |1: ; fprem; fnstsw ax; sahf; jp <1
+  |  fpop1
+  |  jmp ->fff_resn
+  |
+  |.ffunc_nn math_pow;		call ->vm_pow;	jmp ->fff_resn
+  |
+  |.macro math_minmax, name, cmovop, nocmovop
+  |.ffunc_n name
+  |  mov RB, 2
+  |1:
+  |  cmp RB, RD;  jae ->fff_resn
+  |  cmp dword [RA+RB*8-4], LJ_TISNUM;  ja >5
+  |  fld qword [RA+RB*8-8]
+  ||if (cmov) {
+  |  fucomi st1; cmovop st1; fpop1
+  ||} else {
+  |  push eax
+  |  fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop
+  |  pop eax
+  ||}
+  |  add RB, 1
+  |  jmp <1
+  |.endmacro
+  |
+  |  math_minmax math_min, fcmovnbe, jz
+  |  math_minmax math_max, fcmovbe, jnz
+  |5:
+  |  fpop; jmp ->fff_fallback
+  |
+  |//-- String library -----------------------------------------------------
+  |
+  |.ffunc_1 string_len
+  |  cmp dword [RA+4], LJ_TSTR;  jne ->fff_fallback
+  |  mov STR:RB, [RA]
+  |  fild dword STR:RB->len
+  |  jmp ->fff_resn
+  |
+  |.ffunc string_byte			// Only handle the 1-arg case here.
+  |  cmp NARGS:RC, 1+1;  jne ->fff_fallback
+  |  cmp dword [RA+4], LJ_TSTR;  jne ->fff_fallback
+  |  mov STR:RB, [RA]
+  |  cmp dword STR:RB->len, 1
+  |  jb ->fff_res0			// Return no results for empty string.
+  |  movzx RB, byte STR:RB[1]
+  |  mov ARG1, RB
+  |  fild ARG1
+  |  jmp ->fff_resn
+  |
+  |.ffunc string_char			// Only handle the 1-arg case here.
+  |  ffgccheck
+  |  cmp NARGS:RC, 1+1;  jne ->fff_fallback	// *Exactly* 1 arg.
+  |  cmp dword [RA+4], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA]
+  |  fistp ARG4
+  |  cmp ARG4, 255;  ja ->fff_fallback
+  |  lea RC, ARG4			// Little-endian.
+  |  mov ARG5, RA			// Save RA.
+  |  mov ARG3, 1
+  |  mov ARG2, RC
+  |->fff_newstr:
+  |  mov L:RB, SAVE_L
+  |  mov ARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_str_new		// (lua_State *L, char *str, size_t l)
+  |  // GCstr * returned in eax (RC).
+  |  mov RA, ARG5
+  |  mov BASE, L:RB->base
+  |  mov dword [RA-4], LJ_TSTR
+  |  mov [RA-8], STR:RC
+  |  jmp ->fff_res1
+  |
+  |.ffunc string_sub
+  |  ffgccheck
+  |  mov ARG5, RA			// Save RA.
+  |  mov ARG4, -1
+  |  cmp NARGS:RC, 1+2;  jb ->fff_fallback
+  |  jna >1
+  |  cmp dword [RA+20], LJ_TISNUM;  ja ->fff_fallback
+  |  fld qword [RA+16]
+  |  fistp ARG4
+  |1:
+  |  cmp dword [RA+4], LJ_TSTR;  jne ->fff_fallback
+  |  cmp dword [RA+12], LJ_TISNUM;  ja ->fff_fallback
+  |  mov STR:RB, [RA]
+  |  mov ARG2, STR:RB
+  |  mov RB, STR:RB->len
+  |  fld qword [RA+8]
+  |  fistp ARG3
+  |  mov RC, ARG4
+  |  cmp RB, RC				// len < end? (unsigned compare)
+  |  jb >5
+  |2:
+  |  mov RA, ARG3
+  |  test RA, RA			// start <= 0?
+  |  jle >7
+  |3:
+  |  mov STR:RB, ARG2
+  |  sub RC, RA				// start > end?
+  |  jl ->fff_emptystr
+  |  lea RB, [STR:RB+RA+#STR-1]
+  |  add RC, 1
+  |4:
+  |  mov ARG2, RB
+  |  mov ARG3, RC
+  |  jmp ->fff_newstr
+  |
+  |5:  // Negative end or overflow.
+  |  jl >6
+  |  lea RC, [RC+RB+1]			// end = end+(len+1)
+  |  jmp <2
+  |6:  // Overflow.
+  |  mov RC, RB				// end = len
+  |  jmp <2
+  |
+  |7:  // Negative start or underflow.
+  |  je >8
+  |  add RA, RB				// start = start+(len+1)
+  |  add RA, 1
+  |  jg <3				// start > 0?
+  |8:  // Underflow.
+  |  mov RA, 1				// start = 1
+  |  jmp <3
+  |
+  |->fff_emptystr:  // Range underflow.
+  |  xor RC, RC				// Zero length. Any ptr in RB is ok.
+  |  jmp <4
+  |
+  |.ffunc_2 string_rep			// Only handle the 1-char case inline.
+  |  ffgccheck
+  |  mov ARG5, RA			// Save RA.
+  |  cmp dword [RA+4], LJ_TSTR;  jne ->fff_fallback
+  |  cmp dword [RA+12], LJ_TISNUM;  ja ->fff_fallback
+  |  mov STR:RB, [RA]
+  |  fld qword [RA+8]
+  |  fistp ARG4
+  |  mov RC, ARG4
+  |  test RC, RC
+  |  jle ->fff_emptystr			// Count <= 0? (or non-int)
+  |  cmp dword STR:RB->len, 1
+  |  jb ->fff_emptystr			// Zero length string?
+  |  jne ->fff_fallback_2		// Fallback for > 1-char strings.
+  |  cmp [DISPATCH+DISPATCH_GL(tmpbuf.sz)], RC;  jb ->fff_fallback_2
+  |  movzx RA, byte STR:RB[1]
+  |  mov RB, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
+  |  mov ARG3, RC
+  |  mov ARG2, RB
+  |1:  // Fill buffer with char. Yes, this is suboptimal code (do you care?).
+  |  mov [RB], RAL
+  |  add RB, 1
+  |  sub RC, 1
+  |  jnz <1
+  |  jmp ->fff_newstr
+  |
+  |.ffunc_1 string_reverse
+  |  ffgccheck
+  |  mov ARG5, RA			// Save RA.
+  |  cmp dword [RA+4], LJ_TSTR;  jne ->fff_fallback
+  |  mov STR:RB, [RA]
+  |  mov RC, STR:RB->len
+  |  test RC, RC
+  |  jz ->fff_emptystr			// Zero length string?
+  |  cmp [DISPATCH+DISPATCH_GL(tmpbuf.sz)], RC;  jb ->fff_fallback_1
+  |  add RB, #STR
+  |  mov ARG4, PC			// Need another temp register.
+  |  mov ARG3, RC
+  |  mov PC, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
+  |  mov ARG2, PC
+  |1:
+  |  movzx RA, byte [RB]
+  |  add RB, 1
+  |  sub RC, 1
+  |  mov [PC+RC], RAL
+  |  jnz <1
+  |  mov PC, ARG4
+  |  jmp ->fff_newstr
+  |
+  |.macro ffstring_case, name, lo, hi
+  |  .ffunc_1 name
+  |  ffgccheck
+  |  mov ARG5, RA			// Save RA.
+  |  cmp dword [RA+4], LJ_TSTR;  jne ->fff_fallback
+  |  mov STR:RB, [RA]
+  |  mov RC, STR:RB->len
+  |  cmp [DISPATCH+DISPATCH_GL(tmpbuf.sz)], RC;  jb ->fff_fallback_1
+  |  add RB, #STR
+  |  mov ARG4, PC			// Need another temp register.
+  |  mov ARG3, RC
+  |  mov PC, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
+  |  mov ARG2, PC
+  |  jmp >3
+  |1:  // ASCII case conversion. Yes, this is suboptimal code (do you care?).
+  |  movzx RA, byte [RB+RC]
+  |  cmp RA, lo
+  |  jb >2
+  |  cmp RA, hi
+  |  ja >2
+  |  xor RA, 0x20
+  |2:
+  |  mov [PC+RC], RAL
+  |3:
+  |  sub RC, 1
+  |  jns <1
+  |  mov PC, ARG4
+  |  jmp ->fff_newstr
+  |.endmacro
+  |
+  |ffstring_case string_lower, 0x41, 0x5a
+  |ffstring_case string_upper, 0x61, 0x7a
+  |
+  |//-- Table library ------------------------------------------------------
+  |
+  |.ffunc_1 table_getn
+  |  cmp dword [RA+4], LJ_TTAB;  jne ->fff_fallback
+  |  mov TAB:RB, [RA]
+  |  mov ARG1, TAB:RB
+  |  mov RB, RA				// Save RA and BASE.
+  |  mov ARG2, BASE
+  |  call extern lj_tab_len		// (GCtab *t)
+  |  // Length of table returned in eax (RC).
+  |  mov ARG1, RC
+  |  mov RA, RB				// Restore RA and BASE.
+  |  mov BASE, ARG2
+  |  fild ARG1
+  |  jmp ->fff_resn
+  |
+  |//-- Bit library --------------------------------------------------------
+  |
+  |.define TOBIT_BIAS, 0x59c00000	// 2^52 + 2^51 (float, not double!).
+  |
+  |.ffunc_n bit_tobit
+  |  mov ARG5, TOBIT_BIAS
+  |  fadd ARG5
+  |  fstp FPARG1			// 64 bit FP store.
+  |  fild ARG1				// 32 bit integer load (s2lfwd ok).
+  |  jmp ->fff_resn
+  |
+  |.macro .ffunc_bit, name
+  |  .ffunc_n name
+  |  mov ARG5, TOBIT_BIAS
+  |  fadd ARG5
+  |  fstp FPARG1
+  |  mov RB, ARG1
+  |.endmacro
+  |
+  |.macro .ffunc_bit_op, name, ins
+  |  .ffunc_bit name
+  |  mov NRESULTS, NARGS:RC		// Save for fallback.
+  |  lea RC, [RA+NARGS:RC*8-16]
+  |1:
+  |  cmp RC, RA
+  |  jbe ->fff_resbit
+  |  cmp dword [RC+4], LJ_TISNUM;  ja ->fff_fallback_bit_op
+  |  fld qword [RC]
+  |  fadd ARG5
+  |  fstp FPARG1
+  |  ins RB, ARG1
+  |  sub RC, 8
+  |  jmp <1
+  |.endmacro
+  |
+  |.ffunc_bit_op bit_band, and
+  |.ffunc_bit_op bit_bor, or
+  |.ffunc_bit_op bit_bxor, xor
+  |
+  |.ffunc_bit bit_bswap
+  |  bswap RB
+  |  jmp ->fff_resbit
+  |
+  |.ffunc_bit bit_bnot
+  |  not RB
+  |->fff_resbit:
+  |  mov ARG1, RB
+  |  fild ARG1
+  |  jmp ->fff_resn
+  |
+  |->fff_fallback_bit_op:
+  |  mov NARGS:RC, NRESULTS		// Restore for fallback
+  |  jmp ->fff_fallback
+  |
+  |.macro .ffunc_bit_sh, name, ins
+  |  .ffunc_nn name
+  |  mov ARG5, TOBIT_BIAS
+  |  fadd ARG5
+  |  fstp FPARG3
+  |  fadd ARG5
+  |  fstp FPARG1
+  |  mov RC, RA				// Assumes RA is ecx.
+  |  mov RA, ARG3
+  |  mov RB, ARG1
+  |  ins RB, cl
+  |  mov RA, RC
+  |  jmp ->fff_resbit
+  |.endmacro
+  |
+  |.ffunc_bit_sh bit_lshift, shl
+  |.ffunc_bit_sh bit_rshift, shr
+  |.ffunc_bit_sh bit_arshift, sar
+  |.ffunc_bit_sh bit_rol, rol
+  |.ffunc_bit_sh bit_ror, ror
+  |
+  |//-----------------------------------------------------------------------
+  |
+  |->fff_fallback_2:
+  |  mov NARGS:RC, 1+2			// Other args are ignored, anyway.
+  |  jmp ->fff_fallback
+  |->fff_fallback_1:
+  |  mov NARGS:RC, 1+1			// Other args are ignored, anyway.
+  |->fff_fallback:			// Call fast function fallback handler.
+  |  // RA = new base, RC = nargs+1
+  |  mov L:RB, SAVE_L
+  |  sub BASE, RA
+  |  mov [RA-4], PC
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |  mov ARG3, BASE			// Save old BASE (relative).
+  |  mov L:RB->base, RA
+  |  lea RC, [RA+NARGS:RC*8-8]
+  |  mov ARG1, L:RB
+  |  lea BASE, [RC+8*LUA_MINSTACK]	// Ensure enough space for handler.
+  |  mov L:RB->top, RC
+  |  mov CFUNC:RA, [RA-8]
+  |  cmp BASE, L:RB->maxstack
+  |  ja >5				// Need to grow stack.
+  |  call aword CFUNC:RA->f		// (lua_State *L)
+  |  // Either throws an error or recovers and returns 0 or NRESULTS (+1).
+  |  test RC, RC;  jnz >3
+  |1:  // Returned 0: retry fast path.
+  |  mov RA, L:RB->base
+  |  mov RC, L:RB->top
+  |  sub RC, RA
+  |  shr RC, 3
+  |  add NARGS:RC, 1
+  |  mov LFUNC:RB, [RA-8]
+  |  mov BASE, ARG3			// Restore old BASE.
+  |  add BASE, RA
+  |  cmp [RA-4], PC;  jne >2		// Callable modified by handler?
+  |  jmp aword LFUNC:RB->gate		// Retry the call.
+  |
+  |2:  // Run modified callable.
+  |  cmp dword [RA-4], LJ_TFUNC
+  |  jne ->vmeta_call
+  |  jmp aword LFUNC:RB->gate		// Retry the call.
+  |
+  |3:  // Returned NRESULTS (already in RC/RD).
+  |  mov RA, L:RB->base
+  |  mov BASE, ARG3			// Restore old BASE.
+  |  add BASE, RA
+  |  jmp ->fff_res
+  |
+  |5:  // Grow stack for fallback handler.
+  |  mov ARG2, LUA_MINSTACK
+  |  call extern lj_state_growstack	// (lua_State *L, int n)
+  |  jmp <1				// Dumb retry (goes through ff first).
+  |
+  |->fff_gcstep:			// Call GC step function.
+  |  // RA = new base, RC = nargs+1
+  |  pop RB				// Must keep stack at same level.
+  |  mov ARG3, RB			// Save return address
+  |  mov L:RB, SAVE_L
+  |  sub BASE, RA
+  |  mov ARG2, BASE			// Save old BASE (relative).
+  |  mov [RA-4], PC
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |  mov L:RB->base, RA
+  |  lea RC, [RA+NARGS:RC*8-8]
+  |  mov ARG1, L:RB
+  |  mov L:RB->top, RC
+  |  call extern lj_gc_step		// (lua_State *L)
+  |  mov RA, L:RB->base
+  |  mov RC, L:RB->top
+  |  sub RC, RA
+  |  shr RC, 3
+  |  add NARGS:RC, 1
+  |  mov PC, [RA-4]
+  |  mov BASE, ARG2			// Restore old BASE.
+  |  add BASE, RA
+  |  mov RB, ARG3
+  |  push RB				// Restore return address.
+  |  mov LFUNC:RB, [RA-8]
+  |  ret
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Special dispatch targets -------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->vm_record:				// Dispatch target for recording phase.
+#if LJ_HASJIT
+  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
+  |  test RDL, HOOK_VMEVENT		// No recording while in vmevent.
+  |  jnz >5
+  |  // Decrement the hookcount for consistency, but always do the call.
+  |  test RDL, HOOK_ACTIVE
+  |  jnz >1
+  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
+  |  jz >1
+  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
+  |  jmp >1
+#endif
+  |
+  |->vm_hook:				// Dispatch target with enabled hooks.
+  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
+  |  test RDL, HOOK_ACTIVE		// Hook already active?
+  |  jnz >5
+  |
+  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
+  |  jz >5
+  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
+  |  jz >1
+  |  test RDL, LUA_MASKLINE
+  |  jz >5
+  |1:
+  |  mov L:RB, SAVE_L
+  |  mov RD, NRESULTS			// Dynamic top for *M instructions.
+  |  mov ARG3, RD
+  |  mov L:RB->base, BASE
+  |  mov ARG2, PC
+  |  mov ARG1, L:RB
+  |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
+  |  call extern lj_dispatch_ins  // (lua_State *L, BCIns *pc, int nres)
+  |4:
+  |  mov BASE, L:RB->base
+  |  movzx RA, PC_RA
+  |5:
+  |  movzx OP, PC_OP
+  |  movzx RD, PC_RD
+  |  jmp aword [DISPATCH+OP*4+GG_DISP_STATIC*4]	// Re-dispatch to static ins.
+  |
+  |->vm_hotloop:			// Hot loop counter underflow.
+#if LJ_HASJIT
+  |  mov L:RB, SAVE_L
+  |  lea RA, [DISPATCH+GG_DISP2J]
+  |  mov ARG2, PC
+  |  mov ARG1, RA
+  |  mov [DISPATCH+DISPATCH_J(L)], L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
+  |  jmp <4
+#endif
+  |
+  |->vm_hotcall:			// Hot call counter underflow.
+#if LJ_HASJIT
+  |  mov L:RB, SAVE_L
+  |  lea RA, [DISPATCH+GG_DISP2J]
+  |  mov ARG2, PC
+  |  mov ARG1, RA
+  |  mov [DISPATCH+DISPATCH_J(L)], L:RB
+  |  mov SAVE_PC, PC
+  |  mov L:RB->base, BASE
+  |  call extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
+  |  mov BASE, L:RB->base
+  |  // Dispatch the first instruction and optionally record it.
+  |  ins_next
+#endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Trace exit handler -------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// Called from an exit stub with the exit number on the stack.
+  |// The 16 bit exit number is stored with two (sign-extended) push imm8.
+  |->vm_exit_handler:
+#if LJ_HASJIT
+  |  push ebp; lea ebp, [esp+12]; push ebp
+  |  push ebx; push edx; push ecx; push eax
+  |  movzx RC, byte [ebp-4]		// Reconstruct exit number.
+  |  mov RCH, byte [ebp-8]
+  |  mov [ebp-4], edi; mov [ebp-8], esi
+  |  // Caveat: DISPATCH is ebx.
+  |  mov DISPATCH, [ebp]
+  |  mov RA, [DISPATCH+DISPATCH_GL(vmstate)]	// Get trace number.
+  |  set_vmstate EXIT
+  |  mov [DISPATCH+DISPATCH_J(exitno)], RC
+  |  mov [DISPATCH+DISPATCH_J(parent)], RA
+  |  sub esp, 8*8+16			// Room for SSE regs + args.
+  |
+  |  // Must not access SSE regs if SSE2 is not present.
+  |  test dword [DISPATCH+DISPATCH_J(flags)], JIT_F_SSE2
+  |  jz >1
+  |  movsd qword [ebp-40], xmm7; movsd qword [ebp-48], xmm6
+  |  movsd qword [ebp-56], xmm5; movsd qword [ebp-64], xmm4
+  |  movsd qword [ebp-72], xmm3; movsd qword [ebp-80], xmm2
+  |  movsd qword [ebp-88], xmm1; movsd qword [ebp-96], xmm0
+  |1:
+  |  // Caveat: RB is ebp.
+  |  mov L:RB, [DISPATCH+DISPATCH_GL(jit_L)]
+  |  mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
+  |  mov [DISPATCH+DISPATCH_J(L)], L:RB
+  |  lea RC, [esp+16]
+  |  mov L:RB->base, BASE
+  |  lea RA, [DISPATCH+GG_DISP2J]
+  |  mov ARG2, RC
+  |  mov ARG1, RA
+  |  call extern lj_trace_exit		// (jit_State *J, ExitState *ex)
+  |  // Interpreter C frame returned in eax.
+  |  mov esp, eax			// Reposition stack to C frame.
+  |  mov BASE, L:RB->base
+  |  mov PC, SAVE_PC
+  |  mov SAVE_L, L:RB			// Needed for on-trace resume/yield.
+#endif
+  |->vm_exit_interp:
+#if LJ_HASJIT
+  |  mov LFUNC:KBASE, [BASE-8]
+  |  mov PROTO:KBASE, LFUNC:KBASE->pt
+  |  mov KBASE, PROTO:KBASE->k
+  |  mov dword [DISPATCH+DISPATCH_GL(jit_L)], 0
+  |  set_vmstate INTERP
+  |  ins_next
+#endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Math helper functions ----------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// FP value rounding. Called by math.floor/math.ceil fast functions
+  |// and from JIT code. Arg/ret on x87 stack. No int/xmm registers modified.
+  |.macro vm_round, mode1, mode2
+  |  // Save the current x87 control word and eax, force the rounding-control
+  |  // bits (OR in mode1, AND with mode2), round st0 with frndint, then
+  |  // restore the original control word and eax.
+  |  fnstcw word [esp+4]		// Caveat: overwrites ARG1 and ARG2.
+  |  mov [esp+8], eax
+  |  mov ax, mode1
+  |  or ax, [esp+4]
+  |.if mode2 ~= 0xffff
+  |  and ax, mode2
+  |.endif
+  |  mov [esp+6], ax
+  |  fldcw word [esp+6]
+  |  frndint
+  |  fldcw word [esp+4]		// Restore original control word.
+  |  mov eax, [esp+8]			// Restore eax.
+  |  ret
+  |.endmacro
+  |
+  |->vm_floor:				// Round towards -Inf (x87 RC=01).
+  |  vm_round 0x0400, 0xf7ff
+  |
+  |->vm_ceil:				// Round towards +Inf (x87 RC=10).
+  |  vm_round 0x0800, 0xfbff
+  |
+  |->vm_trunc:				// Round towards zero (x87 RC=11).
+  |  vm_round 0x0c00, 0xffff
+  |
+  |// FP modulo x%y. Called by BC_MOD* and vm_arith.
+  |// Args/ret on x87 stack (y on top). No xmm registers modified.
+  |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
+  |// Computes the floored modulo x - y*floor(x/y) (Lua semantics).
+  |->vm_mod:
+  |  fld st1				// Stack (bottom..top): x y x.
+  |  fdiv st1				// x y x/y
+  |  fnstcw word [esp+4]
+  |  mov ax, 0x0400			// Select round-towards--Inf mode.
+  |  or ax, [esp+4]
+  |  and ax, 0xf7ff
+  |  mov [esp+6], ax
+  |  fldcw word [esp+6]
+  |  frndint				// x y floor(x/y)
+  |  fldcw word [esp+4]		// Restore original control word.
+  |  fmulp st1				// x y*floor(x/y)
+  |  fsubp st1				// x-y*floor(x/y)
+  |  ret
+  |
+  |// FP exponentiation e^x and 2^x. Called by math.exp fast function and
+  |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified.
+  |// Caveat: needs 3 slots on x87 stack!
+  |->vm_exp:
+  |  fldl2e; fmulp st1				// e^x ==> 2^(x*log2(e))
+  |->vm_exp2:
+  |  fst dword [esp+4]				// Caveat: overwrites ARG1.
+  |  cmp dword [esp+4], 0x7f800000; je >1	// Special case: e^+Inf = +Inf
+  |  cmp dword [esp+4], 0xff800000; je >2	// Special case: e^-Inf = 0
+  |->vm_exp2raw:  // Entry point for vm_pow. Without +-Inf check.
+  |  fdup; frndint; fsub st1, st0; fxch	// Split into frac/int part.
+  |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
+  |1:
+  |  ret					// +Inf case: return arg as-is.
+  |2:
+  |  fpop; fldz; ret				// -Inf case: return 0.
+  |
+  |// Generic power function x^y. Called by BC_POW, math.pow fast function
+  |// and vm_arith. Args/ret on x87 stack (y on top). No int/xmm regs modified.
+  |// Caveat: needs 3 slots on x87 stack!
+  |// Dispatches to vm_powi when y is an exact integer, else to the FP path.
+  |->vm_pow:
+  |  fist dword [esp+4]			// Store/reload int before comparison.
+  |  fild dword [esp+4]			// Integral exponent used in vm_powi.
+  ||if (cmov) {
+  |  fucomip st1
+  ||} else {
+  |  push eax; fucomp st1; fnstsw ax; sahf; pop eax
+  ||}
+  |  jnz >8				// Branch for FP exponents.
+  |  jp >9				// Branch for NaN exponent.
+  |  fpop				// Pop y and fallthrough to vm_powi.
+  |
+  |// FP/int power function x^i. Called from JIT code. Arg1/ret on x87 stack.
+  |// Arg2 (int) on C stack. No int/xmm regs modified.
+  |// Caveat: needs 2 slots on x87 stack!
+  |// Square-and-multiply: scan bits of i from the low end.
+  |->vm_powi:
+  |  push eax
+  |  mov eax, [esp+8]
+  |  cmp eax, 1; jle >6			// i<=1?
+  |  // Now 1 < (unsigned)i <= 0x80000000.
+  |1:  // Handle leading zeros.
+  |  test eax, 1; jnz >2
+  |  fmul st0				// Square x while low bit is clear.
+  |  shr eax, 1
+  |  jmp <1
+  |2:
+  |  shr eax, 1; jz >5
+  |  fdup				// Keep running product in st1.
+  |3:  // Handle trailing bits.
+  |  fmul st0				// Square the current power of x.
+  |  shr eax, 1; jz >4
+  |  jnc <3
+  |  fmul st1, st0			// Fold set bit into the product.
+  |  jmp <3
+  |4:
+  |  fmulp st1
+  |5:
+  |  pop eax
+  |  ret
+  |6:
+  |  je <5				// x^1 ==> x
+  |  jb >7
+  |  fld1; fdivrp st1			// Negative exponent: invert x first.
+  |  neg eax
+  |  cmp eax, 1; je <5			// x^-1 ==> 1/x
+  |  jmp <1				// x^-i ==> (1/x)^i
+  |7:
+  |  fpop; fld1				// x^0 ==> 1
+  |  pop eax
+  |  ret
+  |
+  |8:  // FP/FP power function x^y via 2^(y*log2(x)).
+  |  push eax
+  |  fst dword [esp+8]			// Save single-prec y for checks.
+  |  fxch
+  |  fst dword [esp+12]		// Save single-prec x for checks.
+  |  mov eax, [esp+8]; shl eax, 1
+  |  cmp eax, 0xff000000; je >2			// x^+-Inf?
+  |  mov eax, [esp+12]; shl eax, 1; je >4	// +-0^y?
+  |  cmp eax, 0xff000000; je >4			// +-Inf^y?
+  |  pop eax
+  |  fyl2x					// ==> y*log2(x)
+  |  jmp ->vm_exp2raw
+  |
+  |9:  // Handle x^NaN.
+  |  fld1
+  ||if (cmov) {
+  |  fucomip st2
+  ||} else {
+  |  push eax; fucomp st2; fnstsw ax; sahf; pop eax
+  ||}
+  |  je >1				// 1^NaN ==> 1
+  |  fxch				// x^NaN ==> NaN
+  |1:
+  |  fpop
+  |  ret
+  |
+  |2:  // Handle x^+-Inf.
+  |  fabs
+  |  fld1
+  ||if (cmov) {
+  |  fucomip st1
+  ||} else {
+  |  fucomp st1; fnstsw ax; sahf
+  ||}
+  |  je >3					// +-1^+-Inf ==> 1
+  |  fpop; fabs; fldz; mov eax, 0; setc al
+  |  ror eax, 1; xor eax, [esp+8]; jns >3	// |x|<>1, x^+-Inf ==> +Inf/0
+  |  fxch
+  |3:
+  |  fpop1; fabs; pop eax
+  |  ret
+  |
+  |4:  // Handle +-0^y or +-Inf^y.
+  |  cmp dword [esp+8], 0; jge <3		// y >= 0, x^y ==> |x|
+  |  fpop; fpop
+  |  test eax, eax; pop eax; jz >5		// y < 0, +-0^y ==> +Inf
+  |  fldz					// y < 0, +-Inf^y ==> 0
+  |  ret
+  |5:
+  |  mov dword [esp+8], 0x7f800000		// Return +Inf.
+  |  fld dword [esp+8]
+  |  ret
+  |
+  |// Callable from C: double lj_vm_foldfpm(double x, int fpm)
+  |// Computes fpm(x) for extended math functions. ORDER FPM.
+  |// Selector (from the branch chain below): 0=floor 1=ceil 2=trunc 3=sqrt
+  |// 4=exp 5=exp2 6=log 7=log2 8=log10 9=sin 10=cos 11=tan, else int3.
+  |->vm_foldfpm:
+  |  mov eax, [esp+12]			// Load fpm selector.
+  |  fld qword [esp+4]			// Load x.
+  |  cmp eax, 1; jb ->vm_floor; je ->vm_ceil
+  |  cmp eax, 3; jb ->vm_trunc; ja >1
+  |  fsqrt; ret
+  |1: ; cmp eax, 5; jb ->vm_exp; je ->vm_exp2
+  |  cmp eax, 7; je >1; ja >2
+  |  fldln2; fxch; fyl2x; ret			// log(x) = ln(2)*log2(x).
+  |1: ; fld1; fxch; fyl2x; ret			// log2(x).
+  |2: ; cmp eax, 9; je >1; ja >2
+  |  fldlg2; fxch; fyl2x; ret			// log10(x) = lg(2)*log2(x).
+  |1: ; fsin; ret
+  |2: ; cmp eax, 11; je >1; ja >9
+  |   fcos; ret
+  |1: ; fptan; fpop; ret
+  |9: ; int3					// Bad fpm.
+  |
+  |// Callable from C: double lj_vm_foldarith(double x, double y, int op)
+  |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -)
+  |// and basic math functions. ORDER ARITH
+  |// Selector (from the branch chain below): 0=add 1=sub 2=mul 3=div 4=mod
+  |// 5=pow 6=unm 7=abs 8=atan2 9=ldexp 10=min 11=max, else int3.
+  |->vm_foldarith:
+  |  mov eax, [esp+20]			// Load op selector.
+  |  fld qword [esp+4]			// Load x.
+  |  fld qword [esp+12]		// Load y (on top).
+  |  cmp eax, 1; je >1; ja >2
+  |  faddp st1; ret
+  |1: ; fsubp st1; ret
+  |2: ; cmp eax, 3; je >1; ja >2
+  |  fmulp st1; ret
+  |1: ; fdivp st1; ret
+  |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
+  |  cmp eax, 7; je >1; ja >2
+  |  fpop; fchs; ret				// Unary minus: -x (y ignored).
+  |1: ; fpop; fabs; ret			// abs(x) (y ignored).
+  |2: ; cmp eax, 9; je >1; ja >2
+  |  fpatan; ret				// atan2: atan(x/y).
+  |1: ; fxch; fscale; fpop1; ret		// ldexp: x*2^trunc(y).
+  |2: ; cmp eax, 11; je >1; ja >9
+  ||if (cmov) {
+  |  fucomi st1; fcmovnbe st1; fpop1; ret	// min (matches math_minmax).
+  |1: ; fucomi st1; fcmovbe st1; fpop1; ret	// max (matches math_minmax).
+  ||} else {
+  |  fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret
+  |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret
+  ||}
+  |9: ; int3					// Bad op.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Miscellaneous functions --------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
+  |// Probes CPUID support by toggling the EFLAGS ID bit. Returns 0 (eax)
+  |// when CPUID is unsupported; otherwise executes CPUID for function f,
+  |// stores eax/ebx/ecx/edx into res[0..3] and returns CPUID's eax.
+  |->vm_cpuid:
+  |  pushfd
+  |  pop edx
+  |  mov ecx, edx			// Keep original flags for comparison.
+  |  xor edx, 0x00200000		// Toggle ID bit in flags.
+  |  push edx
+  |  popfd
+  |  pushfd
+  |  pop edx
+  |  xor eax, eax			// Zero means no features supported.
+  |  cmp ecx, edx
+  |  jz >1				// No ID toggle means no CPUID support.
+  |  mov eax, [esp+4]			// Argument 1 is function number.
+  |  push edi				// Preserve callee-saved regs.
+  |  push ebx
+  |  cpuid
+  |  mov edi, [esp+16]			// Argument 2 is result area.
+  |  mov [edi], eax
+  |  mov [edi+4], ebx
+  |  mov [edi+8], ecx
+  |  mov [edi+12], edx
+  |  pop ebx
+  |  pop edi
+  |1:
+  |  ret
+  |
+  |//-----------------------------------------------------------------------
+}
+
+/* Generate the code for a single instruction. */
+static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
+{
+  int vk = 0;
+  |// Note: aligning all instructions does not pay off.
+  |=>defop:
+
+  switch (op) {
+
+  /* -- Comparison ops ---------------------------------------------------- */
+
+  /* Remember: all ops branch for a true comparison, fall through otherwise. */
+
+  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
+    |  // RA = src1, RD = src2, JMP with RD = target
+    |  ins_AD
+    |  checknum RA, ->vmeta_comp
+    |  checknum RD, ->vmeta_comp
+    |  fld qword [BASE+RA*8]		// Reverse order, i.e like cmp D, A.
+    |  fld qword [BASE+RD*8]
+    |  add PC, 4
+    |  fcomparepp			// eax (RD) modified!
+    |  // Unordered: all of ZF CF PF set, ordered: PF clear.
+    |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
+    switch (op) {
+    case BC_ISLT:
+      |  jbe >2
+      break;
+    case BC_ISGE:
+      |  ja >2
+      break;
+    case BC_ISLE:
+      |  jb >2
+      break;
+    case BC_ISGT:
+      |  jae >2
+      break;
+    default: break;  /* Shut up GCC. */
+    }
+    |1:
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |2:
+    |  ins_next
+    break;
+
+  case BC_ISEQV: case BC_ISNEV:
+    vk = op == BC_ISEQV;
+    |  ins_AD	// RA = src1, RD = src2, JMP with RD = target
+    |  mov RB, [BASE+RD*8+4]
+    |  add PC, 4
+    |  cmp RB, LJ_TISNUM; ja >5
+    |  checknum RA, >5
+    |  fld qword [BASE+RA*8]
+    |  fld qword [BASE+RD*8]
+    |  fcomparepp			// eax (RD) modified!
+  iseqne_fp:
+    if (vk) {
+      |  jp >2				// Unordered means not equal.
+      |  jne >2
+    } else {
+      |  jp >2				// Unordered means not equal.
+      |  je >1
+    }
+  iseqne_end:
+    if (vk) {
+      |1:				// EQ: Branch to the target.
+      |  movzx RD, PC_RD
+      |  branchPC RD
+      |2:				// NE: Fallthrough to next instruction.
+    } else {
+      |2:				// NE: Branch to the target.
+      |  movzx RD, PC_RD
+      |  branchPC RD
+      |1:				// EQ: Fallthrough to next instruction.
+    }
+    |  ins_next
+    |
+    if (op == BC_ISEQV || op == BC_ISNEV) {
+      |5:  // Either or both types are not numbers.
+      |  checktp RA, RB			// Compare types.
+      |  jne <2				// Not the same type?
+      |  cmp RB, LJ_TISPRI
+      |  jae <1				// Same type and primitive type?
+      |
+      |  // Same types and not a primitive type. Compare GCobj or pvalue.
+      |  mov RA, [BASE+RA*8]
+      |  mov RD, [BASE+RD*8]
+      |  cmp RA, RD
+      |  je <1				// Same GCobjs or pvalues?
+      |  cmp RB, LJ_TISTABUD
+      |  ja <2				// Different objects and not table/ud?
+      |
+      |  // Different tables or userdatas. Need to check __eq metamethod.
+      |  // Field metatable must be at same offset for GCtab and GCudata!
+      |  mov TAB:RB, TAB:RA->metatable
+      |  test TAB:RB, TAB:RB
+      |  jz <2				// No metatable?
+      |  test byte TAB:RB->nomm, 1<<MM_eq
+      |  jnz <2				// Or 'no __eq' flag set?
+      if (vk) {
+	|  xor RB, RB			// ne = 0
+      } else {
+	|  mov RB, 1			// ne = 1
+      }
+      |  jmp ->vmeta_equal		// Handle __eq metamethod.
+    }
+    break;
+  case BC_ISEQS: case BC_ISNES:
+    vk = op == BC_ISEQS;
+    |  ins_AND	// RA = src, RD = str const, JMP with RD = target
+    |  add PC, 4
+    |  checkstr RA, >2
+    |  mov RA, [BASE+RA*8]
+    |  cmp RA, [KBASE+RD*4]
+  iseqne_test:
+    if (vk) {
+      |  jne >2
+    } else {
+      |  je >1
+    }
+    goto iseqne_end;
+  case BC_ISEQN: case BC_ISNEN:
+    vk = op == BC_ISEQN;
+    |  ins_AD	// RA = src, RD = num const, JMP with RD = target
+    |  add PC, 4
+    |  checknum RA, >2
+    |  fld qword [BASE+RA*8]
+    |  fld qword [KBASE+RD*8]
+    |  fcomparepp			// eax (RD) modified!
+    goto iseqne_fp;
+  case BC_ISEQP: case BC_ISNEP:
+    vk = op == BC_ISEQP;
+    |  ins_AND	// RA = src, RD = primitive type (~), JMP with RD = target
+    |  add PC, 4
+    |  checktp RA, RD
+    goto iseqne_test;
+
+  /* -- Unary test and copy ops ------------------------------------------- */
+
+  case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
+    |  ins_AD	// RA = dst or unused, RD = src, JMP with RD = target
+    |  mov RB, [BASE+RD*8+4]
+    |  add PC, 4
+    |  cmp RB, LJ_TISTRUECOND
+    if (op == BC_IST || op == BC_ISTC) {
+      |  jae >1
+    } else {
+      |  jb >1
+    }
+    if (op == BC_ISTC || op == BC_ISFC) {
+      |  mov [BASE+RA*8+4], RB
+      |  mov RB, [BASE+RD*8]
+      |  mov [BASE+RA*8], RB
+    }
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |1:					// Fallthrough to the next instruction.
+    |  ins_next
+    break;
+
+  /* -- Unary ops --------------------------------------------------------- */
+
+  case BC_MOV:
+    |  ins_AD	// RA = dst, RD = src
+    |  mov RB, [BASE+RD*8+4]
+    |  mov RD, [BASE+RD*8]		// Overwrites RD.
+    |  mov [BASE+RA*8+4], RB
+    |  mov [BASE+RA*8], RD
+    |  ins_next_
+    break;
+  case BC_NOT:
+    |  ins_AD	// RA = dst, RD = src
+    |  xor RB, RB
+    |  checktp RD, LJ_TISTRUECOND
+    |  adc RB, LJ_TTRUE
+    |  mov [BASE+RA*8+4], RB
+    |  ins_next
+    break;
+  case BC_UNM:
+    |  ins_AD	// RA = dst, RD = src
+    |  checknum RD, ->vmeta_unm
+    |  fld qword [BASE+RD*8]
+    |  fchs
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    break;
+  case BC_LEN:
+    |  ins_AD	// RA = dst, RD = src
+    |  checkstr RD, >2
+    |  mov STR:RD, [BASE+RD*8]
+    |  fild dword STR:RD->len
+    |1:
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    |2:
+    |  checktab RD, ->vmeta_len
+    |  mov TAB:RD, [BASE+RD*8]
+    |  mov ARG1, TAB:RD
+    |  mov RB, BASE			// Save BASE.
+    |  call extern lj_tab_len		// (GCtab *t)
+    |  // Length of table returned in eax (RC).
+    |  mov ARG1, RC
+    |  mov BASE, RB			// Restore BASE.
+    |  fild ARG1
+    |  movzx RA, PC_RA
+    |  jmp <1
+    break;
+
+  /* -- Binary ops -------------------------------------------------------- */
+
+    |.macro ins_arithpre, ins
+    |  ins_ABC
+    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+    ||switch (vk) {
+    ||case 0:
+    |   checknum RB, ->vmeta_arith_vn
+    |   fld qword [BASE+RB*8]
+    |   ins qword [KBASE+RC*8]
+    ||  break;
+    ||case 1:
+    |   checknum RB, ->vmeta_arith_nv
+    |   fld qword [KBASE+RC*8]
+    |   ins qword [BASE+RB*8]
+    ||  break;
+    ||default:
+    |   checknum RB, ->vmeta_arith_vv
+    |   checknum RC, ->vmeta_arith_vv
+    |   fld qword [BASE+RB*8]
+    |   ins qword [BASE+RC*8]
+    ||  break;
+    ||}
+    |.endmacro
+    |
+    |.macro ins_arith, ins
+    |  ins_arithpre ins
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    |.endmacro
+
+    |  // RA = dst, RB = src1 or num const, RC = src2 or num const
+  case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
+    |  ins_arith fadd
+    break;
+  case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
+    |  ins_arith fsub
+    break;
+  case BC_MULVN: case BC_MULNV: case BC_MULVV:
+    |  ins_arith fmul
+    break;
+  case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
+    |  ins_arith fdiv
+    break;
+  case BC_MODVN:
+    |  ins_arithpre fld
+    |->BC_MODVN_Z:
+    |  call ->vm_mod
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    break;
+  case BC_MODNV: case BC_MODVV:
+    |  ins_arithpre fld
+    |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
+    break;
+  case BC_POW:
+    |  ins_arithpre fld
+    |  call ->vm_pow
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    break;
+
+  case BC_CAT:
+    |  ins_ABC	// RA = dst, RB = src_start, RC = src_end
+    |  lea RA, [BASE+RC*8]
+    |  sub RC, RB
+    |  mov ARG2, RA
+    |  mov ARG3, RC
+    |->BC_CAT_Z:
+    |  mov L:RB, SAVE_L
+    |  mov ARG1, L:RB
+    |  mov SAVE_PC, PC
+    |  mov L:RB->base, BASE
+    |  call extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
+    |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  test RC, RC
+    |  jnz ->vmeta_binop
+    |  movzx RB, PC_RB			// Copy result to Stk[RA] from Stk[RB].
+    |  movzx RA, PC_RA
+    |  mov RC, [BASE+RB*8+4]
+    |  mov RB, [BASE+RB*8]
+    |  mov [BASE+RA*8+4], RC
+    |  mov [BASE+RA*8], RB
+    |  ins_next
+    break;
+
+  /* -- Constant ops ------------------------------------------------------ */
+
+  case BC_KSTR:
+    |  ins_AND	// RA = dst, RD = str const (~)
+    |  mov RD, [KBASE+RD*4]
+    |  mov dword [BASE+RA*8+4], LJ_TSTR
+    |  mov [BASE+RA*8], RD
+    |  ins_next
+    break;
+  case BC_KSHORT:
+    |  ins_AD	// RA = dst, RD = signed int16 literal
+    |  fild PC_RD			// Refetch signed RD from instruction.
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    break;
+  case BC_KNUM:
+    |  ins_AD	// RA = dst, RD = num const
+    |  fld qword [KBASE+RD*8]
+    |  fstp qword [BASE+RA*8]
+    |  ins_next
+    break;
+  case BC_KPRI:
+    |  ins_AND	// RA = dst, RD = primitive type (~)
+    |  mov [BASE+RA*8+4], RD
+    |  ins_next
+    break;
+  case BC_KNIL:
+    |  ins_AD	// RA = dst_start, RD = dst_end
+    |  lea RA, [BASE+RA*8+12]
+    |  lea RD, [BASE+RD*8+4]
+    |  mov RB, LJ_TNIL
+    |  mov [RA-8], RB			// Sets minimum 2 slots.
+    |1:
+    |  mov [RA], RB
+    |  add RA, 8
+    |  cmp RA, RD
+    |  jbe <1
+    |  ins_next
+    break;
+
+  /* -- Upvalue and function ops ------------------------------------------ */
+
+  case BC_UGET:
+    |  ins_AD	// RA = dst, RD = upvalue #
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RD*4+offsetof(GCfuncL, uvptr)]
+    |  mov RB, UPVAL:RB->v
+    |  mov RD, [RB+4]
+    |  mov RB, [RB]
+    |  mov [BASE+RA*8+4], RD
+    |  mov [BASE+RA*8], RB
+    |  ins_next
+    break;
+  case BC_USETV:
+    |  ins_AD	// RA = upvalue #, RD = src
+    |  // Really ugly code due to the lack of a 4th free register.
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  test byte UPVAL:RB->marked, LJ_GC_BLACK		// isblack(uv)
+    |  jnz >4
+    |1:
+    |  mov RA, [BASE+RD*8]
+    |2:
+    |  mov RB, UPVAL:RB->v
+    |  mov RD, [BASE+RD*8+4]
+    |  mov [RB], RA
+    |  mov [RB+4], RD
+    |3:
+    |  ins_next
+    |
+    |4:  // Upvalue is black. Check if new value is collectable and white.
+    |  mov RA, [BASE+RD*8+4]
+    |  sub RA, LJ_TISGCV
+    |  cmp RA, LJ_TISNUM - LJ_TISGCV			// tvisgcv(v)
+    |  jbe <1
+    |  mov GCOBJ:RA, [BASE+RD*8]
+    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(v)
+    |  jz <2
+    |  // Crossed a write barrier. So move the barrier forward.
+    |  mov ARG2, UPVAL:RB
+    |  mov ARG3, GCOBJ:RA
+    |  mov RB, UPVAL:RB->v
+    |  mov RD, [BASE+RD*8+4]
+    |  mov [RB], GCOBJ:RA
+    |  mov [RB+4], RD
+    |->BC_USETV_Z:
+    |  mov L:RB, SAVE_L
+    |  lea GL:RA, [DISPATCH+GG_DISP2G]
+    |  mov L:RB->base, BASE
+    |  mov ARG1, GL:RA
+    |  call extern lj_gc_barrieruv  // (global_State *g, GCobj *o, GCobj *v)
+    |  mov BASE, L:RB->base
+    |  jmp <3
+    break;
+  case BC_USETS:
+    |  ins_AND	// RA = upvalue #, RD = str const (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov GCOBJ:RD, [KBASE+RD*4]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  mov RA, UPVAL:RB->v
+    |  mov dword [RA+4], LJ_TSTR
+    |  mov [RA], GCOBJ:RD
+    |  test byte UPVAL:RB->marked, LJ_GC_BLACK		// isblack(uv)
+    |  jnz >2
+    |1:
+    |  ins_next
+    |
+    |2:  // Upvalue is black. Check if string is white.
+    |  test byte GCOBJ:RD->gch.marked, LJ_GC_WHITES	// iswhite(str)
+    |  jz <1
+    |  // Crossed a write barrier. So move the barrier forward.
+    |  mov ARG3, GCOBJ:RD
+    |  mov ARG2, UPVAL:RB
+    |  jmp ->BC_USETV_Z
+    break;
+  case BC_USETN:
+    |  ins_AD	// RA = upvalue #, RD = num const
+    |  mov LFUNC:RB, [BASE-8]
+    |  fld qword [KBASE+RD*8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  mov RA, UPVAL:RB->v
+    |  fstp qword [RA]
+    |  ins_next
+    break;
+  case BC_USETP:
+    |  ins_AND	// RA = upvalue #, RD = primitive type (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  mov RA, UPVAL:RB->v
+    |  mov [RA+4], RD
+    |  ins_next
+    break;
+  case BC_UCLO:
+    |  ins_AD	// RA = level, RD = target
+    |  branchPC RD			// Do this first to free RD.
+    |  mov L:RB, SAVE_L
+    |  cmp dword L:RB->openupval, 0
+    |  je >1
+    |  lea RA, [BASE+RA*8]
+    |  mov ARG2, RA
+    |  mov ARG1, L:RB
+    |  mov L:RB->base, BASE
+    |  call extern lj_func_closeuv	// (lua_State *L, StkId level)
+    |  mov BASE, L:RB->base
+    |1:
+    |  ins_next
+    break;
+
+  case BC_FNEW:
+    |  ins_AND	// RA = dst, RD = proto const (~) (holding function prototype)
+    |  mov LFUNC:RA, [BASE-8]
+    |  mov PROTO:RD, [KBASE+RD*4]	// Fetch GCproto *.
+    |  mov L:RB, SAVE_L
+    |  mov ARG3, LFUNC:RA
+    |  mov ARG2, PROTO:RD
+    |  mov SAVE_PC, PC
+    |  mov ARG1, L:RB
+    |  mov L:RB->base, BASE
+    |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
+    |  call extern lj_func_newL_gc
+    |  // GCfuncL * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA
+    |  mov [BASE+RA*8], LFUNC:RC
+    |  mov dword [BASE+RA*8+4], LJ_TFUNC
+    |  ins_next
+    break;
+
+  /* -- Table ops --------------------------------------------------------- */
+
+  case BC_TNEW:
+    |  ins_AD	// RA = dst, RD = hbits|asize
+    |  mov RB, RD
+    |  and RD, 0x7ff
+    |  shr RB, 11
+    |  cmp RD, 0x7ff			// Turn 0x7ff into 0x801.
+    |  sete RAL
+    |  mov ARG3, RB
+    |  add RD, RA
+    |  mov L:RB, SAVE_L
+    |  add RD, RA
+    |  mov ARG2, RD
+    |  mov SAVE_PC, PC
+    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
+    |  mov ARG1, L:RB
+    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
+    |  mov L:RB->base, BASE
+    |  jae >2
+    |1:
+    |  call extern lj_tab_new  // (lua_State *L, int32_t asize, uint32_t hbits)
+    |  // Table * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA
+    |  mov [BASE+RA*8], TAB:RC
+    |  mov dword [BASE+RA*8+4], LJ_TTAB
+    |  ins_next
+    |2:
+    |  call extern lj_gc_step_fixtop	// (lua_State *L)
+    |  mov ARG1, L:RB			// Args owned by callee. Set it again.
+    |  jmp <1
+    break;
+  case BC_TDUP:
+    |  ins_AND	// RA = dst, RD = table const (~) (holding template table)
+    |  mov TAB:RD, [KBASE+RD*4]
+    |  mov L:RB, SAVE_L
+    |  mov ARG2, TAB:RD
+    |  mov ARG1, L:RB
+    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
+    |  mov SAVE_PC, PC
+    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
+    |  mov L:RB->base, BASE
+    |  jae >3
+    |2:
+    |  call extern lj_tab_dup		// (lua_State *L, Table *kt)
+    |  // Table * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA
+    |  mov [BASE+RA*8], TAB:RC
+    |  mov dword [BASE+RA*8+4], LJ_TTAB
+    |  ins_next
+    |3:
+    |  call extern lj_gc_step_fixtop	// (lua_State *L)
+    |  mov ARG1, L:RB			// Args owned by callee. Set it again.
+    |  jmp <2
+    break;
+
+  case BC_GGET:
+    |  ins_AND	// RA = dst, RD = str const (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov TAB:RB, LFUNC:RB->env
+    |  mov STR:RC, [KBASE+RD*4]
+    |  jmp ->BC_TGETS_Z
+    break;
+  case BC_GSET:
+    |  ins_AND	// RA = src, RD = str const (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov TAB:RB, LFUNC:RB->env
+    |  mov STR:RC, [KBASE+RD*4]
+    |  jmp ->BC_TSETS_Z
+    break;
+
+  case BC_TGETV:
+    |  ins_ABC	// RA = dst, RB = table, RC = key
+    |  checktab RB, ->vmeta_tgetv
+    |  mov TAB:RB, [BASE+RB*8]
+    |
+    |  // Integer key? Convert number to int and back and compare.
+    |  checknum RC, >5
+    |  fld qword [BASE+RC*8]
+    |  fist ARG1
+    |  fild ARG1
+    |  fcomparepp			// eax (RC) modified!
+    |  mov RC, ARG1
+    |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
+    |  cmp RC, TAB:RB->asize	// Takes care of unordered, too.
+    |  jae ->vmeta_tgetv		// Not in array part? Use fallback.
+    |  shl RC, 3
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
+    |  je >2
+    |1:
+    |  mov RB, [RC]			// Get array slot.
+    |  mov RC, [RC+4]
+    |  mov [BASE+RA*8], RB
+    |  mov [BASE+RA*8+4], RC
+    |  ins_next
+    |
+    |2:  // Check for __index if table value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <1
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_index
+    |  jz ->vmeta_tgetv			// 'no __index' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1
+    |
+    |5:  // String key?
+    |  checkstr RC, ->vmeta_tgetv
+    |  mov STR:RC, [BASE+RC*8]
+    |  jmp ->BC_TGETS_Z
+    break;
+  case BC_TGETS:
+    |  ins_ABC	// RA = dst, RB = table, RC = str const (~)
+    |  not RC
+    |  mov STR:RC, [KBASE+RC*4]
+    |  checktab RB, ->vmeta_tgets
+    |  mov TAB:RB, [BASE+RB*8]
+    |->BC_TGETS_Z:	// RB = GCtab *, RC = GCstr *, refetches PC_RA.
+    |  mov RA, TAB:RB->hmask
+    |  and RA, STR:RC->hash
+    |  imul RA, #NODE
+    |  add NODE:RA, TAB:RB->node
+    |1:
+    |  cmp dword NODE:RA->key.it, LJ_TSTR
+    |  jne >4
+    |  cmp dword NODE:RA->key.gcr, STR:RC
+    |  jne >4
+    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
+    |  cmp dword [RA+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
+    |  je >5				// Key found, but nil value?
+    |  movzx RC, PC_RA
+    |  mov RB, [RA]			// Get node value.
+    |  mov RA, [RA+4]
+    |  mov [BASE+RC*8], RB
+    |2:
+    |  mov [BASE+RC*8+4], RA
+    |  ins_next
+    |
+    |3:
+    |  movzx RC, PC_RA
+    |  mov RA, LJ_TNIL
+    |  jmp <2
+    |
+    |4:  // Follow hash chain.
+    |  mov NODE:RA, NODE:RA->next
+    |  test NODE:RA, NODE:RA
+    |  jnz <1
+    |  // End of hash chain: key not found, nil result.
+    |
+    |5:  // Check for __index if table value is nil.
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test TAB:RA, TAB:RA
+    |  jz <3				// No metatable: done.
+    |  test byte TAB:RA->nomm, 1<<MM_index
+    |  jnz <3				// 'no __index' flag set: done.
+    |  jmp ->vmeta_tgets		// Caveat: preserve STR:RC.
+    break;
+  case BC_TGETB:
+    |  ins_ABC	// RA = dst, RB = table, RC = byte literal
+    |  checktab RB, ->vmeta_tgetb
+    |  mov TAB:RB, [BASE+RB*8]
+    |  cmp RC, TAB:RB->asize
+    |  jae ->vmeta_tgetb
+    |  shl RC, 3
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
+    |  je >2
+    |1:
+    |  mov RB, [RC]			// Get array slot.
+    |  mov RC, [RC+4]
+    |  mov [BASE+RA*8], RB
+    |  mov [BASE+RA*8+4], RC
+    |  ins_next
+    |
+    |2:  // Check for __index if table value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <1
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_index
+    |  jz ->vmeta_tgetb			// 'no __index' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1
+    break;
+
+  case BC_TSETV:
+    |  ins_ABC	// RA = src, RB = table, RC = key
+    |  checktab RB, ->vmeta_tsetv
+    |  mov TAB:RB, [BASE+RB*8]
+    |
+    |  // Integer key? Convert number to int and back and compare.
+    |  checknum RC, >5
+    |  fld qword [BASE+RC*8]
+    |  fist ARG1
+    |  fild ARG1
+    |  fcomparepp			// eax (RC) modified!
+    |  mov RC, ARG1
+    |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
+    |  cmp RC, TAB:RB->asize		// Takes care of unordered, too.
+    |  jae ->vmeta_tsetv
+    |  shl RC, 3
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL
+    |  je >3				// Previous value is nil?
+    |1:
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:
+    |  mov RB, [BASE+RA*8+4]		// Set array slot.
+    |  mov RA, [BASE+RA*8]
+    |  mov [RC+4], RB
+    |  mov [RC], RA
+    |  ins_next
+    |
+    |3:  // Check for __newindex if previous value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <1
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsetv			// 'no __newindex' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1
+    |
+    |5:  // String key?
+    |  checkstr RC, ->vmeta_tsetv
+    |  mov STR:RC, [BASE+RC*8]
+    |  jmp ->BC_TSETS_Z
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RA
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <2
+    break;
+  case BC_TSETS:
+    |  ins_ABC	// RA = src, RB = table, RC = str const (~)
+    |  not RC
+    |  mov STR:RC, [KBASE+RC*4]
+    |  checktab RB, ->vmeta_tsets
+    |  mov TAB:RB, [BASE+RB*8]
+    |->BC_TSETS_Z:	// RB = GCtab *, RC = GCstr *, refetches PC_RA.
+    |  mov RA, TAB:RB->hmask
+    |  and RA, STR:RC->hash
+    |  imul RA, #NODE
+    |  mov byte TAB:RB->nomm, 0		// Clear metamethod cache.
+    |  add NODE:RA, TAB:RB->node
+    |1:
+    |  cmp dword NODE:RA->key.it, LJ_TSTR
+    |  jne >5
+    |  cmp dword NODE:RA->key.gcr, STR:RC
+    |  jne >5
+    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
+    |  cmp dword [RA+4], LJ_TNIL
+    |  je >4				// Previous value is nil?
+    |2:
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |3:
+    |  movzx RC, PC_RA
+    |  mov RB, [BASE+RC*8+4]		// Set node value.
+    |  mov RC, [BASE+RC*8]
+    |  mov [RA+4], RB
+    |  mov [RA], RC
+    |  ins_next
+    |
+    |4:  // Check for __newindex if previous value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <2
+    |  mov ARG1, RA			// Save RA.
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
+    |  mov RA, ARG1			// Restore RA.
+    |  jmp <2
+    |
+    |5:  // Follow hash chain.
+    |  mov NODE:RA, NODE:RA->next
+    |  test NODE:RA, NODE:RA
+    |  jnz <1
+    |  // End of hash chain: key not found, add a new one.
+    |
+    |  // But check for __newindex first.
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test TAB:RA, TAB:RA
+    |  jz >6				// No metatable: continue.
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
+    |6:
+    |  mov ARG5, STR:RC
+    |  mov ARG6, LJ_TSTR
+    |  lea RC, ARG5			// Store temp. TValue in ARG5/ARG6.
+    |  mov ARG4, TAB:RB			// Save TAB:RB for us.
+    |  mov ARG2, TAB:RB
+    |  mov L:RB, SAVE_L
+    |  mov ARG3, RC
+    |  mov ARG1, L:RB
+    |  mov SAVE_PC, PC
+    |  mov L:RB->base, BASE
+    |  call extern lj_tab_newkey	// (lua_State *L, GCtab *t, TValue *k)
+    |  // Handles write barrier for the new key. TValue * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  mov TAB:RB, ARG4			// Need TAB:RB for barrier.
+    |  mov RA, eax
+    |  jmp <2				// Must check write barrier for value.
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RC		// Destroys STR:RC.
+    |  jmp <3
+    break;
+  case BC_TSETB:
+    |  ins_ABC	// RA = src, RB = table, RC = byte literal
+    |  checktab RB, ->vmeta_tsetb
+    |  mov TAB:RB, [BASE+RB*8]
+    |  cmp RC, TAB:RB->asize
+    |  jae ->vmeta_tsetb
+    |  shl RC, 3
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL
+    |  je >3				// Previous value is nil?
+    |1:
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:
+    |  mov RB, [BASE+RA*8+4]		// Set array slot.
+    |  mov RA, [BASE+RA*8]
+    |  mov [RC+4], RB
+    |  mov [RC], RA
+    |  ins_next
+    |
+    |3:  // Check for __newindex if previous value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <1
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsetb			// 'no __newindex' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RA
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <2
+    break;
+
+  case BC_TSETM:
+    |  ins_AD	// RA = base (table at base-1), RD = num const (start index)
+    |  mov ARG5, KBASE			// Need one more free register.
+    |  fld qword [KBASE+RD*8]
+    |  fistp ARG4			// Const is guaranteed to be an int.
+    |1:
+    |  lea RA, [BASE+RA*8]
+    |  mov TAB:RB, [RA-8]		// Guaranteed to be a table.
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:
+    |  mov RD, NRESULTS
+    |  mov KBASE, ARG4
+    |  sub RD, 1
+    |  jz >4				// Nothing to copy?
+    |  add RD, KBASE			// Compute needed size.
+    |  cmp RD, TAB:RB->asize
+    |  jae >5				// Does not fit into array part?
+    |  sub RD, KBASE
+    |  shl KBASE, 3
+    |  add KBASE, TAB:RB->array
+    |3:  // Copy result slots to table.
+    |  mov RB, [RA]
+    |  mov [KBASE], RB
+    |  mov RB, [RA+4]
+    |  add RA, 8
+    |  mov [KBASE+4], RB
+    |  add KBASE, 8
+    |  sub RD, 1
+    |  jnz <3
+    |4:
+    |  mov KBASE, ARG5
+    |  ins_next
+    |
+    |5:  // Need to resize array part.
+    |  mov ARG2, TAB:RB
+    |  mov L:RB, SAVE_L
+    |  mov ARG3, RD
+    |  mov ARG1, L:RB
+    |  mov SAVE_PC, PC
+    |  mov L:RB->base, BASE
+    |  call extern lj_tab_reasize	// (lua_State *L, GCtab *t, int nasize)
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1				// Retry.
+    |
+    |7:  // Possible table write barrier for any value. Skip valiswhite check.
+    |  barrierback TAB:RB, RD
+    |  jmp <2
+    break;
+
+  /* -- Calls and vararg handling ----------------------------------------- */
+
+  case BC_CALL: case BC_CALLM:
+    |  ins_A_C	// RA = base, (RB = nresults+1,) RC = nargs+1 | extra_nargs
+    if (op == BC_CALLM) {
+      |  add NARGS:RC, NRESULTS
+    }
+    |  lea RA, [BASE+RA*8+8]
+    |  mov LFUNC:RB, [RA-8]
+    |  cmp dword [RA-4], LJ_TFUNC
+    |  jne ->vmeta_call
+    |  jmp aword LFUNC:RB->gate
+    break;
+
+  case BC_CALLMT:
+    |  ins_AD	// RA = base, RD = extra_nargs
+    |  add NARGS:RD, NRESULTS
+    |  // Fall through. Assumes BC_CALLMT follows and ins_AD is a no-op.
+    break;
+  case BC_CALLT:
+    |  ins_AD	// RA = base, RD = nargs+1
+    |  lea RA, [BASE+RA*8+8]
+    |  mov KBASE, BASE			// Use KBASE for move + vmeta_call hint.
+    |  mov LFUNC:RB, [RA-8]
+    |  cmp dword [RA-4], LJ_TFUNC
+    |  jne ->vmeta_call
+    |->BC_CALLT_Z:
+    |  mov PC, [BASE-4]
+    |  test PC, FRAME_TYPE
+    |  jnz >7
+    |1:
+    |  mov [BASE-8], LFUNC:RB		// Copy function down, reloaded below.
+    |  mov NRESULTS, NARGS:RD
+    |  sub NARGS:RD, 1
+    |  jz >3
+    |2:
+    |  mov RB, [RA]			// Move args down.
+    |  mov [KBASE], RB
+    |  mov RB, [RA+4]
+    |  mov [KBASE+4], RB
+    |  add KBASE, 8
+    |  add RA, 8
+    |  sub NARGS:RD, 1
+    |  jnz <2
+    |
+    |  mov LFUNC:RB, [BASE-8]
+    |3:
+    |  mov RA, BASE			// BASE is ignored, except when ...
+    |  cmp byte LFUNC:RB->ffid, 1	// (> FF_C) Calling a fast function?
+    |  ja >5
+    |4:
+    |  mov NARGS:RD, NRESULTS
+    |  jmp aword LFUNC:RB->gate
+    |
+    |5:  // Tailcall to a fast function.
+    |  test PC, FRAME_TYPE		// Lua frame below?
+    |  jnz <4
+    |  movzx RD, PC_RA			// Need to prepare BASE/KBASE.
+    |  not RD
+    |  lea BASE, [BASE+RD*8]
+    |  mov LFUNC:KBASE, [BASE-8]
+    |  mov PROTO:KBASE, LFUNC:KBASE->pt
+    |  mov KBASE, PROTO:KBASE->k
+    |  jmp <4
+    |
+    |7:  // Tailcall from a vararg function.
+    |  jnp <1				// Vararg frame below?
+    |  and PC, -8
+    |  sub BASE, PC			// Need to relocate BASE/KBASE down.
+    |  mov KBASE, BASE
+    |  mov PC, [BASE-4]
+    |  jmp <1
+    break;
+
+  case BC_ITERC:
+    |  ins_A	// RA = base, (RB = nresults+1,) RC = nargs+1 (2+1)
+    |  lea RA, [BASE+RA*8+8]		// fb = base+1
+    |  mov RB, [RA-24]			// Copy state. fb[0] = fb[-3].
+    |  mov RC, [RA-20]
+    |  mov [RA], RB
+    |  mov [RA+4], RC
+    |  mov RB, [RA-16]			// Copy control var. fb[1] = fb[-2].
+    |  mov RC, [RA-12]
+    |  mov [RA+8], RB
+    |  mov [RA+12], RC
+    |  mov LFUNC:RB, [RA-32]		// Copy callable. fb[-1] = fb[-4]
+    |  mov RC, [RA-28]
+    |  mov [RA-8], LFUNC:RB
+    |  mov [RA-4], RC
+    |  cmp RC, LJ_TFUNC			// Handle like a regular 2-arg call.
+    |  mov NARGS:RC, 3
+    |  jne ->vmeta_call
+    |  jmp aword LFUNC:RB->gate
+    break;
+
+  case BC_VARG:
+    |  ins_AB_	// RA = base, RB = nresults+1, (RC = 1)
+    |  mov LFUNC:RC, [BASE-8]
+    |  lea RA, [BASE+RA*8]
+    |  mov PROTO:RC, LFUNC:RC->pt
+    |  movzx RC, byte PROTO:RC->numparams
+    |  mov ARG3, KBASE			// Need one more free register.
+    |  lea KBASE, [BASE+RC*8+(8+FRAME_VARG)]
+    |  sub KBASE, [BASE-4]
+    |  // Note: KBASE may now be even _above_ BASE if nargs was < numparams.
+    |  test RB, RB
+    |  jz >5				// Copy all varargs?
+    |  lea RB, [RA+RB*8-8]
+    |  cmp KBASE, BASE			// No vararg slots?
+    |  jnb >2
+    |1:  // Copy vararg slots to destination slots.
+    |  mov RC, [KBASE-8]
+    |  mov [RA], RC
+    |  mov RC, [KBASE-4]
+    |  add KBASE, 8
+    |  mov [RA+4], RC
+    |  add RA, 8
+    |  cmp RA, RB			// All destination slots filled?
+    |  jnb >3
+    |  cmp KBASE, BASE			// No more vararg slots?
+    |  jb <1
+    |2:  // Fill up remainder with nil.
+    |  mov dword [RA+4], LJ_TNIL
+    |  add RA, 8
+    |  cmp RA, RB
+    |  jb <2
+    |3:
+    |  mov KBASE, ARG3
+    |  ins_next
+    |
+    |5:  // Copy all varargs.
+    |  mov NRESULTS, 1			// NRESULTS = 0+1
+    |  mov RC, BASE
+    |  sub RC, KBASE
+    |  jbe <3				// No vararg slots?
+    |  mov RB, RC
+    |  shr RB, 3
+    |  mov ARG2, RB			// Store this for stack growth below.
+    |  add RB, 1
+    |  mov NRESULTS, RB			// NRESULTS = #varargs+1
+    |  mov L:RB, SAVE_L
+    |  add RC, RA
+    |  cmp RC, L:RB->maxstack
+    |  ja >7				// Need to grow stack?
+    |6:  // Copy all vararg slots.
+    |  mov RC, [KBASE-8]
+    |  mov [RA], RC
+    |  mov RC, [KBASE-4]
+    |  add KBASE, 8
+    |  mov [RA+4], RC
+    |  add RA, 8
+    |  cmp KBASE, BASE			// No more vararg slots?
+    |  jb <6
+    |  jmp <3
+    |
+    |7:  // Grow stack for varargs.
+    |  mov L:RB->base, BASE
+    |  mov L:RB->top, RA
+    |  mov SAVE_PC, PC
+    |  sub KBASE, BASE			// Need delta, because BASE may change.
+    |  mov ARG1, L:RB
+    |  call extern lj_state_growstack	// (lua_State *L, int n)
+    |  mov BASE, L:RB->base
+    |  mov RA, L:RB->top
+    |  add KBASE, BASE
+    |  jmp <6
+    break;
+
+  /* -- Returns ----------------------------------------------------------- */
+
+  case BC_RETM:
+    |  ins_AD	// RA = results, RD = extra_nresults
+    |  add RD, NRESULTS			// NRESULTS >=1, so RD >=1.
+    |  // Fall through. Assumes BC_RET follows and ins_AD is a no-op.
+    break;
+
+  case BC_RET: case BC_RET0: case BC_RET1:
+    |  ins_AD	// RA = results, RD = nresults+1
+    if (op != BC_RET0) {
+      |  shl RA, 3
+    }
+    |1:
+    |  mov PC, [BASE-4]
+    |  mov NRESULTS, RD			// Save nresults+1.
+    |  test PC, FRAME_TYPE		// Check frame type marker.
+    |  jnz >7				// Not returning to a fixarg Lua func?
+    switch (op) {
+    case BC_RET:
+      |->BC_RET_Z:
+      |  mov KBASE, BASE		// Use KBASE for result move.
+      |  sub RD, 1
+      |  jz >3
+      |2:
+      |  mov RB, [KBASE+RA]		// Move results down.
+      |  mov [KBASE-8], RB
+      |  mov RB, [KBASE+RA+4]
+      |  mov [KBASE-4], RB
+      |  add KBASE, 8
+      |  sub RD, 1
+      |  jnz <2
+      |3:
+      |  mov RD, NRESULTS		// Note: NRESULTS may be >255.
+      |  movzx RB, PC_RB		// So cannot compare with RDL!
+      |5:
+      |  cmp RB, RD			// More results expected?
+      |  ja >6
+      break;
+    case BC_RET1:
+      |  mov RB, [BASE+RA+4]
+      |  mov [BASE-4], RB
+      |  mov RB, [BASE+RA]
+      |  mov [BASE-8], RB
+      /* fallthrough */
+    case BC_RET0:
+      |5:
+      |  cmp PC_RB, RDL			// More results expected?
+      |  ja >6
+    default:
+      break;
+    }
+    |  movzx RA, PC_RA
+    |  not RA				// Note: ~RA = -(RA+1)
+    |  lea BASE, [BASE+RA*8]		// base = base - (RA+1)*8
+    |  mov LFUNC:KBASE, [BASE-8]
+    |  mov PROTO:KBASE, LFUNC:KBASE->pt
+    |  mov KBASE, PROTO:KBASE->k
+    |  ins_next
+    |
+    |6:  // Fill up results with nil.
+    if (op == BC_RET) {
+      |  mov dword [KBASE-4], LJ_TNIL	// Note: relies on shifted base.
+      |  add KBASE, 8
+    } else {
+      |  mov dword [BASE+RD*8-12], LJ_TNIL
+    }
+    |  add RD, 1
+    |  jmp <5
+    |
+    |7:  // Non-standard return case.
+    |  jnp ->vm_return
+    |  // Return from vararg function: relocate BASE down and RA up.
+    |  and PC, -8
+    |  sub BASE, PC
+    if (op != BC_RET0) {
+      |  add RA, PC
+    }
+    |  jmp <1
+    break;
+
+  /* -- Loops and branches ------------------------------------------------ */
+
+  |.define FOR_IDX,  qword [RA];    .define FOR_TIDX,  dword [RA+4]
+  |.define FOR_STOP, qword [RA+8];  .define FOR_TSTOP, dword [RA+12]
+  |.define FOR_STEP, qword [RA+16]; .define FOR_TSTEP, dword [RA+20]
+  |.define FOR_EXT,  qword [RA+24]; .define FOR_TEXT,  dword [RA+28]
+
+  case BC_FORL:
+#if LJ_HASJIT
+    |  hotloop RB
+#endif
+    | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
+    break;
+
+  case BC_JFORI:
+  case BC_JFORL:
+#if !LJ_HASJIT
+    break;
+#endif
+  case BC_FORI:
+  case BC_IFORL:
+    vk = (op == BC_IFORL || op == BC_JFORL);
+    |  ins_AJ	// RA = base, RD = target (after end of loop or start of loop)
+    |  lea RA, [BASE+RA*8]
+    if (!vk) {
+      |  cmp FOR_TIDX, LJ_TISNUM; ja ->vmeta_for	// Type checks
+      |  cmp FOR_TSTOP, LJ_TISNUM; ja ->vmeta_for
+    }
+    |  mov RB, FOR_TSTEP		// Load type/hiword of for step.
+    if (!vk) {
+      |  cmp RB, LJ_TISNUM; ja ->vmeta_for
+    }
+    |  fld FOR_STOP
+    |  fld FOR_IDX
+    if (vk) {
+      |  fadd FOR_STEP			// nidx = idx + step
+      |  fst FOR_IDX
+    }
+    |  fst FOR_EXT
+    |  test RB, RB			// Swap lim/(n)idx if step non-negative.
+    |  js >1
+    |  fxch
+    |1:
+    |  fcomparepp			// eax (RD) modified if !cmov.
+    if (!cmov) {
+      |  movzx RD, PC_RD		// Need to reload RD.
+    }
+    if (op == BC_FORI) {
+      |  jnb >2
+      |  branchPC RD
+    } else if (op == BC_JFORI) {
+      |  branchPC RD
+      |  movzx RD, PC_RD
+      |  jnb =>BC_JLOOP
+    } else if (op == BC_IFORL) {
+      |  jb >2
+      |  branchPC RD
+    } else {
+      |  jnb =>BC_JLOOP
+    }
+    |2:
+    |  ins_next
+    break;
+
+  case BC_ITERL:
+#if LJ_HASJIT
+    |  hotloop RB
+#endif
+    | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
+    break;
+
+  case BC_JITERL:
+#if !LJ_HASJIT
+    break;
+#endif
+  case BC_IITERL:
+    |  ins_AJ	// RA = base, RD = target
+    |  lea RA, [BASE+RA*8]
+    |  mov RB, [RA+4]
+    |  cmp RB, LJ_TNIL; je >1		// Stop if iterator returned nil.
+    if (op == BC_JITERL) {
+      |  mov [RA-4], RB
+      |  mov RB, [RA]
+      |  mov [RA-8], RB
+      |  jmp =>BC_JLOOP
+    } else {
+      |  branchPC RD			// Otherwise save control var + branch.
+      |  mov RD, [RA]
+      |  mov [RA-4], RB
+      |  mov [RA-8], RD
+    }
+    |1:
+    |  ins_next
+    break;
+
+  case BC_LOOP:
+    |  ins_A	// RA = base, RD = target (loop extent)
+    |  // Note: RA/RD is only used by trace recorder to determine scope/extent
+    |  // This opcode does NOT jump, it's only purpose is to detect a hot loop.
+#if LJ_HASJIT
+    |  hotloop RB
+#endif
+    | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.
+    break;
+
+  case BC_ILOOP:
+    |  ins_A	// RA = base, RD = target (loop extent)
+    |  ins_next
+    break;
+
+  case BC_JLOOP:
+#if LJ_HASJIT
+    |  ins_AD	// RA = base (ignored), RD = traceno
+    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+    |  mov TRACE:RD, [RA+RD*4]
+    |  mov RD, TRACE:RD->mcode
+    |  mov L:RB, SAVE_L
+    |  mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
+    |  mov [DISPATCH+DISPATCH_GL(jit_L)], L:RB
+    |  jmp RD
+#endif
+    break;
+
+  case BC_JMP:
+    |  ins_AJ	// RA = unused, RD = target
+    |  branchPC RD
+    |  ins_next
+    break;
+
+  /* ---------------------------------------------------------------------- */
+
+  default:
+    fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
+    exit(2);
+    break;
+  }
+}
+
+/* Generate the interpreter backend: all VM subroutines plus one machine-code
+** fragment per bytecode opcode. Returns the number of bytecode definitions
+** (BC__MAX), which the caller uses to size the opcode dispatch table.
+*/
+static int build_backend(BuildCtx *ctx)
+{
+  int op;
+  /* cmov != 0 selects code paths using CMOV/FCOMI(P); defining
+  ** LUAJIT_CPU_NOCMOV forces the fallback paths for pre-P6 CPUs
+  ** (see the "modified if !cmov" notes in build_ins above).
+  */
+  int cmov = 1;
+#ifdef LUAJIT_CPU_NOCMOV
+  cmov = 0;
+#endif
+
+  /* Reserve one DynASM pc label per bytecode op (=>BC_* targets). */
+  dasm_growpc(Dst, BC__MAX);
+
+  /* Emit shared VM subroutines first, then the per-opcode handlers. */
+  build_subroutines(ctx, cmov);
+
+  |.code_op
+  for (op = 0; op < BC__MAX; op++)
+    build_ins(ctx, (BCOp)op, op, cmov);
+
+  return BC__MAX;
+}
+
+/* Emit pseudo frame-info for all assembler functions. */
+static void emit_asm_debug(BuildCtx *ctx)
+{
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+    fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
+    fprintf(ctx->fp,
+	".Lframe0:\n"
+	"\t.long .LECIE0-.LSCIE0\n"
+	".LSCIE0:\n"
+	"\t.long 0xffffffff\n"
+	"\t.byte 0x1\n"
+	"\t.string \"\"\n"
+	"\t.uleb128 0x1\n"
+	"\t.sleb128 -4\n"
+	"\t.byte 0x8\n"
+	"\t.byte 0xc\n\t.uleb128 0x4\n\t.uleb128 0x4\n"
+	"\t.byte 0x88\n\t.uleb128 0x1\n"
+	"\t.align 4\n"
+	".LECIE0:\n\n");
+    fprintf(ctx->fp,
+	".LSFDE0:\n"
+	"\t.long .LEFDE0-.LASFDE0\n"
+	".LASFDE0:\n"
+	"\t.long .Lframe0\n"
+	"\t.long .Lbegin\n"
+	"\t.long %d\n"
+	"\t.byte 0xe\n\t.uleb128 0x30\n"	/* def_cfa_offset */
+	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
+	"\t.byte 0x87\n\t.uleb128 0x3\n"	/* offset edi */
+	"\t.byte 0x86\n\t.uleb128 0x4\n"	/* offset esi */
+	"\t.byte 0x83\n\t.uleb128 0x5\n"	/* offset ebx */
+	"\t.align 4\n"
+	".LEFDE0:\n\n", (int)ctx->codesz);
+    break;
+  default:  /* Difficult for other modes. */
+    break;
+  }
+}
+

+ 159 - 0
src/lauxlib.h

@@ -0,0 +1,159 @@
+/*
+** $Id: lauxlib.h,v 1.88.1.1 2007/12/27 13:02:25 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lauxlib_h
+#define lauxlib_h
+
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include "lua.h"
+
+
+#define luaL_getn(L,i)          ((int)lua_objlen(L, i))
+#define luaL_setn(L,i,j)        ((void)0)  /* no op! */
+
+/* extra error code for `luaL_load' */
+#define LUA_ERRFILE     (LUA_ERRERR+1)
+
+typedef struct luaL_Reg {
+  const char *name;
+  lua_CFunction func;
+} luaL_Reg;
+
+LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname,
+                                const luaL_Reg *l, int nup);
+LUALIB_API void (luaL_register) (lua_State *L, const char *libname,
+                                const luaL_Reg *l);
+LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e);
+LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e);
+LUALIB_API int (luaL_typerror) (lua_State *L, int narg, const char *tname);
+LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg);
+LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg,
+                                                          size_t *l);
+LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg,
+                                          const char *def, size_t *l);
+LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg);
+LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def);
+
+LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg);
+LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg,
+                                          lua_Integer def);
+
+LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg);
+LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t);
+LUALIB_API void (luaL_checkany) (lua_State *L, int narg);
+
+LUALIB_API int   (luaL_newmetatable) (lua_State *L, const char *tname);
+LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname);
+
+LUALIB_API void (luaL_where) (lua_State *L, int lvl);
+LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...);
+
+LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def,
+                                   const char *const lst[]);
+
+LUALIB_API int (luaL_ref) (lua_State *L, int t);
+LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref);
+
+LUALIB_API int (luaL_loadfile) (lua_State *L, const char *filename);
+LUALIB_API int (luaL_loadbuffer) (lua_State *L, const char *buff, size_t sz,
+                                  const char *name);
+LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s);
+
+LUALIB_API lua_State *(luaL_newstate) (void);
+
+
+LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p,
+                                                  const char *r);
+
+LUALIB_API const char *(luaL_findtable) (lua_State *L, int idx,
+                                         const char *fname, int szhint);
+
+
+
+
+/*
+** ===============================================================
+** some useful macros
+** ===============================================================
+*/
+
+#define luaL_argcheck(L, cond,numarg,extramsg)	\
+		((void)((cond) || luaL_argerror(L, (numarg), (extramsg))))
+#define luaL_checkstring(L,n)	(luaL_checklstring(L, (n), NULL))
+#define luaL_optstring(L,n,d)	(luaL_optlstring(L, (n), (d), NULL))
+#define luaL_checkint(L,n)	((int)luaL_checkinteger(L, (n)))
+#define luaL_optint(L,n,d)	((int)luaL_optinteger(L, (n), (d)))
+#define luaL_checklong(L,n)	((long)luaL_checkinteger(L, (n)))
+#define luaL_optlong(L,n,d)	((long)luaL_optinteger(L, (n), (d)))
+
+#define luaL_typename(L,i)	lua_typename(L, lua_type(L,(i)))
+
+#define luaL_dofile(L, fn) \
+	(luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0))
+
+#define luaL_dostring(L, s) \
+	(luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0))
+
+#define luaL_getmetatable(L,n)	(lua_getfield(L, LUA_REGISTRYINDEX, (n)))
+
+#define luaL_opt(L,f,n,d)	(lua_isnoneornil(L,(n)) ? (d) : f(L,(n)))
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+
+
+/* String buffer: accumulates up to LUAL_BUFFERSIZE chars locally and
+** spills partial results as strings onto the Lua stack when full.
+*/
+typedef struct luaL_Buffer {
+  char *p;			/* current position in buffer */
+  int lvl;  /* number of strings in the stack (level) */
+  lua_State *L;
+  char buffer[LUAL_BUFFERSIZE];
+} luaL_Buffer;
+
+/* Append one char; flushes the buffer via luaL_prepbuffer when full. */
+#define luaL_addchar(B,c) \
+  ((void)((B)->p < ((B)->buffer+LUAL_BUFFERSIZE) || luaL_prepbuffer(B)), \
+   (*(B)->p++ = (char)(c)))
+
+/* compatibility only */
+#define luaL_putchar(B,c)	luaL_addchar(B,c)
+
+/* Advance the buffer position by 'n' chars written in place. */
+#define luaL_addsize(B,n)	((B)->p += (n))
+
+LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B);
+LUALIB_API char *(luaL_prepbuffer) (luaL_Buffer *B);
+LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l);
+LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s);
+LUALIB_API void (luaL_addvalue) (luaL_Buffer *B);
+LUALIB_API void (luaL_pushresult) (luaL_Buffer *B);
+
+
+/* }====================================================== */
+
+
+/* compatibility with ref system */
+
+/* pre-defined references */
+#define LUA_NOREF       (-2)
+#define LUA_REFNIL      (-1)
+
+#define lua_ref(L,lock) ((lock) ? luaL_ref(L, LUA_REGISTRYINDEX) : \
+      (lua_pushstring(L, "unlocked references are obsolete"), lua_error(L), 0))
+
+#define lua_unref(L,ref)        luaL_unref(L, LUA_REGISTRYINDEX, (ref))
+
+#define lua_getref(L,ref)       lua_rawgeti(L, LUA_REGISTRYINDEX, (ref))
+
+
+#define luaL_reg	luaL_Reg
+
+#endif

+ 438 - 0
src/lib_aux.c

@@ -0,0 +1,438 @@
+/*
+** Auxiliary library for the Lua/C API.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major parts taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define lib_aux_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_lib.h"
+
+/* convert a stack index to positive */
+#define abs_index(L, i) \
+  ((i) > 0 || (i) <= LUA_REGISTRYINDEX ? (i) : lua_gettop(L) + (i) + 1)
+
+/* -- Type checks --------------------------------------------------------- */
+
+/* Grow the stack by at least 'size' slots or raise a stack overflow
+** error mentioning 'msg'.
+*/
+LUALIB_API void luaL_checkstack(lua_State *L, int size, const char *msg)
+{
+  if (!lua_checkstack(L, size))
+    lj_err_callerv(L, LJ_ERR_STKOVM, msg);
+}
+
+/* Raise a type error unless argument 'narg' has type tag 'tt'. */
+LUALIB_API void luaL_checktype(lua_State *L, int narg, int tt)
+{
+  if (lua_type(L, narg) != tt)
+    lj_err_argt(L, narg, tt);
+}
+
+/* Raise an error if argument 'narg' is absent. */
+LUALIB_API void luaL_checkany(lua_State *L, int narg)
+{
+  lj_lib_checkany(L, narg);
+}
+
+/* Check for a string argument and optionally return its length. */
+LUALIB_API const char *luaL_checklstring(lua_State *L, int narg, size_t *len)
+{
+  GCstr *s = lj_lib_checkstr(L, narg);
+  if (len != NULL) *len = s->len;
+  return strdata(s);
+}
+
+/* Like luaL_checklstring, but return 'def' if the argument is nil/absent.
+** In that case *len is set to strlen(def), or 0 for a NULL default.
+*/
+LUALIB_API const char *luaL_optlstring(lua_State *L, int narg,
+				       const char *def, size_t *len)
+{
+  GCstr *s = lj_lib_optstr(L, narg);
+  if (s) {
+    if (len != NULL) *len = s->len;
+    return strdata(s);
+  }
+  if (len != NULL) *len = def ? strlen(def) : 0;
+  return def;
+}
+
+/* Check for a number argument. */
+LUALIB_API lua_Number luaL_checknumber(lua_State *L, int narg)
+{
+  return lj_lib_checknum(L, narg);
+}
+
+/* Check for an optional number argument; 'def' is used when absent
+** (first branch of lj_lib_opt = present, second = nil/none).
+*/
+LUALIB_API lua_Number luaL_optnumber(lua_State *L, int narg, lua_Number def)
+{
+  lj_lib_opt(L, narg,
+    return lj_lib_checknum(L, narg);
+  ,
+    return def;
+  )
+}
+
+/* Check for an integer argument.  On 64 bit targets lua_Integer is wider
+** than int32_t, so convert from the checked number instead.
+*/
+LUALIB_API lua_Integer luaL_checkinteger(lua_State *L, int narg)
+{
+#if LJ_64
+  return (lua_Integer)lj_lib_checknum(L, narg);
+#else
+  return lj_lib_checkint(L, narg);
+#endif
+}
+
+/* Check for an optional integer argument; 'def' is used when absent. */
+LUALIB_API lua_Integer luaL_optinteger(lua_State *L, int narg, lua_Integer def)
+{
+#if LJ_64
+  lj_lib_opt(L, narg,
+    return (lua_Integer)lj_lib_checknum(L, narg);
+  ,
+    return def;
+  )
+#else
+  return lj_lib_optint(L, narg, def);
+#endif
+}
+
+/* Find a string argument (or 'def') in the option list 'lst' and return
+** its index.  Raises an argument error for unknown options.
+*/
+LUALIB_API int luaL_checkoption(lua_State *L, int narg, const char *def,
+				const char *const lst[])
+{
+  GCstr *s = lj_lib_optstr(L, narg);
+  const char *opt = s ? strdata(s) : def;
+  uint32_t i;
+  if (!opt) lj_err_argt(L, narg, LUA_TSTRING);
+  for (i = 0; lst[i]; i++)
+    if (strcmp(lst[i], opt) == 0)
+      return (int)i;
+  lj_err_argv(L, narg, LJ_ERR_INVOPTM, opt);
+  /* lj_err_argv throws; control does not return here. */
+}
+
+/* -- Module registration ------------------------------------------------- */
+
+/* Walk (and create, if needed) the table at the dotted path 'fname',
+** starting from the table at stack index 'idx'.  On success the result
+** table is pushed and NULL is returned; otherwise the offending path
+** component is returned and nothing is left on the stack.
+*/
+LUALIB_API const char *luaL_findtable(lua_State *L, int idx,
+				      const char *fname, int szhint)
+{
+  const char *e;
+  lua_pushvalue(L, idx);
+  do {
+    e = strchr(fname, '.');
+    if (e == NULL) e = fname + strlen(fname);
+    lua_pushlstring(L, fname, (size_t)(e - fname));
+    lua_rawget(L, -2);
+    if (lua_isnil(L, -1)) {  /* no such field? */
+      lua_pop(L, 1);  /* remove this nil */
+      lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */
+      lua_pushlstring(L, fname, (size_t)(e - fname));
+      lua_pushvalue(L, -2);
+      lua_settable(L, -4);  /* set new table into field */
+    } else if (!lua_istable(L, -1)) {  /* field has a non-table value? */
+      lua_pop(L, 2);  /* remove table and value */
+      return fname;  /* return problematic part of the name */
+    }
+    lua_remove(L, -2);  /* remove previous table */
+    fname = e + 1;
+  } while (*e == '.');
+  return NULL;
+}
+
+/* Count the entries of a NULL-terminated function registration list. */
+static int libsize(const luaL_Reg *l)
+{
+  int size = 0;
+  for (; l->name; l++) size++;
+  return size;
+}
+
+/* Register the functions in 'l', each closed over the 'nup' upvalues on
+** top of the stack.  If 'libname' is given, the library table is looked
+** up in _LOADED (or created as a global) and left on the stack.
+*/
+LUALIB_API void luaL_openlib(lua_State *L, const char *libname,
+			     const luaL_Reg *l, int nup)
+{
+  if (libname) {
+    int size = libsize(l);
+    /* check whether lib already exists */
+    luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 16);
+    lua_getfield(L, -1, libname);  /* get _LOADED[libname] */
+    if (!lua_istable(L, -1)) {  /* not found? */
+      lua_pop(L, 1);  /* remove previous result */
+      /* try global variable (and create one if it does not exist) */
+      if (luaL_findtable(L, LUA_GLOBALSINDEX, libname, size) != NULL)
+	lj_err_callerv(L, LJ_ERR_BADMODN, libname);
+      lua_pushvalue(L, -1);
+      lua_setfield(L, -3, libname);  /* _LOADED[libname] = new table */
+    }
+    lua_remove(L, -2);  /* remove _LOADED table */
+    lua_insert(L, -(nup+1));  /* move library table to below upvalues */
+  }
+  for (; l->name; l++) {
+    int i;
+    for (i = 0; i < nup; i++)  /* copy upvalues to the top */
+      lua_pushvalue(L, -nup);
+    lua_pushcclosure(L, l->func, nup);
+    lua_setfield(L, -(nup+2), l->name);
+  }
+  lua_pop(L, nup);  /* remove upvalues */
+}
+
+/* luaL_openlib without upvalues. */
+LUALIB_API void luaL_register(lua_State *L, const char *libname,
+			      const luaL_Reg *l)
+{
+  luaL_openlib(L, libname, l, 0);
+}
+
+/* Replace every occurrence of the plain substring 'p' in 's' with 'r'.
+** The result is pushed on the stack and also returned as a C string.
+*/
+LUALIB_API const char *luaL_gsub(lua_State *L, const char *s,
+				 const char *p, const char *r)
+{
+  const char *wild;
+  size_t l = strlen(p);
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  while ((wild = strstr(s, p)) != NULL) {
+    luaL_addlstring(&b, s, (size_t)(wild - s));  /* push prefix */
+    luaL_addstring(&b, r);  /* push replacement in place of pattern */
+    s = wild + l;  /* continue after `p' */
+  }
+  luaL_addstring(&b, s);  /* push last suffix */
+  luaL_pushresult(&b);
+  return lua_tostring(L, -1);
+}
+
+/* -- Buffer handling ----------------------------------------------------- */
+
+/* Number of chars currently accumulated in the local buffer. */
+#define bufflen(B)	((size_t)((B)->p - (B)->buffer))
+/* Remaining free space in the local buffer. */
+#define bufffree(B)	((size_t)(LUAL_BUFFERSIZE - bufflen(B)))
+
+/* Flush the local buffer as a string onto the Lua stack.
+** Returns 0 if the buffer was empty (nothing pushed), 1 otherwise.
+*/
+static int emptybuffer(luaL_Buffer *B)
+{
+  size_t l = bufflen(B);
+  if (l == 0)
+    return 0;  /* put nothing on stack */
+  lua_pushlstring(B->L, B->buffer, l);
+  B->p = B->buffer;
+  B->lvl++;
+  return 1;
+}
+
+/* Concatenate pushed fragments to keep stack usage bounded.  Keeps
+** folding levels while the stack level is high (>= LUA_MINSTACK/2) or
+** the combined top string is longer than the next string below it.
+*/
+static void adjuststack(luaL_Buffer *B)
+{
+  if (B->lvl > 1) {
+    lua_State *L = B->L;
+    int toget = 1;  /* number of levels to concat */
+    size_t toplen = lua_strlen(L, -1);
+    do {
+      size_t l = lua_strlen(L, -(toget+1));
+      if (!(B->lvl - toget + 1 >= LUA_MINSTACK/2 || toplen > l))
+	break;
+      toplen += l;
+      toget++;
+    } while (toget < B->lvl);
+    lua_concat(L, toget);
+    B->lvl = B->lvl - toget + 1;
+  }
+}
+
+/* Flush the buffer and return a pointer to LUAL_BUFFERSIZE free chars. */
+LUALIB_API char *luaL_prepbuffer(luaL_Buffer *B)
+{
+  if (emptybuffer(B))
+    adjuststack(B);
+  return B->buffer;
+}
+
+/* Append 'l' chars from 's' to the buffer. */
+LUALIB_API void luaL_addlstring(luaL_Buffer *B, const char *s, size_t l)
+{
+  while (l--)
+    luaL_addchar(B, *s++);
+}
+
+/* Append the zero-terminated string 's' to the buffer. */
+LUALIB_API void luaL_addstring(luaL_Buffer *B, const char *s)
+{
+  luaL_addlstring(B, s, strlen(s));
+}
+
+/* Finish the buffer: leave the result as a single string on the stack. */
+LUALIB_API void luaL_pushresult(luaL_Buffer *B)
+{
+  emptybuffer(B);
+  lua_concat(B->L, B->lvl);
+  B->lvl = 1;
+}
+
+/* Append the value on top of the stack (converted to a string) and pop it. */
+LUALIB_API void luaL_addvalue(luaL_Buffer *B)
+{
+  lua_State *L = B->L;
+  size_t vl;
+  const char *s = lua_tolstring(L, -1, &vl);
+  if (vl <= bufffree(B)) {  /* fit into buffer? */
+    memcpy(B->p, s, vl);  /* put it there */
+    B->p += vl;
+    lua_pop(L, 1);  /* remove from stack */
+  } else {
+    if (emptybuffer(B))
+      lua_insert(L, -2);  /* put buffer before new value */
+    B->lvl++;  /* add new value into B stack */
+    adjuststack(B);
+  }
+}
+
+/* Initialize a buffer for use with state 'L'. */
+LUALIB_API void luaL_buffinit(lua_State *L, luaL_Buffer *B)
+{
+  B->L = L;
+  B->p = B->buffer;
+  B->lvl = 0;
+}
+
+/* -- Reference management ------------------------------------------------ */
+
+#define FREELIST_REF	0
+
+/* Create a reference in table 't' to the value on top of the stack and
+** pop it.  Slot FREELIST_REF (0) of 't' holds the free-list head.
+*/
+LUALIB_API int luaL_ref(lua_State *L, int t)
+{
+  int ref;
+  t = abs_index(L, t);
+  if (lua_isnil(L, -1)) {
+    lua_pop(L, 1);  /* remove from stack */
+    return LUA_REFNIL;  /* `nil' has a unique fixed reference */
+  }
+  lua_rawgeti(L, t, FREELIST_REF);  /* get first free element */
+  ref = (int)lua_tointeger(L, -1);  /* ref = t[FREELIST_REF] */
+  lua_pop(L, 1);  /* remove it from stack */
+  if (ref != 0) {  /* any free element? */
+    lua_rawgeti(L, t, ref);  /* remove it from list */
+    lua_rawseti(L, t, FREELIST_REF);  /* (t[FREELIST_REF] = t[ref]) */
+  } else {  /* no free elements */
+    ref = (int)lua_objlen(L, t);
+    ref++;  /* create new reference */
+  }
+  lua_rawseti(L, t, ref);
+  return ref;
+}
+
+/* Release reference 'ref' in table 't' and recycle it via the free list.
+** LUA_REFNIL and LUA_NOREF are negative and thus ignored.
+*/
+LUALIB_API void luaL_unref(lua_State *L, int t, int ref)
+{
+  if (ref >= 0) {
+    t = abs_index(L, t);
+    lua_rawgeti(L, t, FREELIST_REF);
+    lua_rawseti(L, t, ref);  /* t[ref] = t[FREELIST_REF] */
+    lua_pushinteger(L, ref);
+    lua_rawseti(L, t, FREELIST_REF);  /* t[FREELIST_REF] = ref */
+  }
+}
+
+/* -- Load Lua code ------------------------------------------------------- */
+
+/* State for the file-based lua_Reader. */
+typedef struct FileReaderCtx {
+  FILE *fp;
+  char buf[LUAL_BUFFERSIZE];
+} FileReaderCtx;
+
+/* lua_Reader callback: feed the chunk from a file, one block at a time. */
+static const char *reader_file(lua_State *L, void *ud, size_t *size)
+{
+  FileReaderCtx *ctx = (FileReaderCtx *)ud;
+  UNUSED(L);
+  if (feof(ctx->fp)) return NULL;
+  *size = fread(ctx->buf, 1, sizeof(ctx->buf), ctx->fp);
+  return *size > 0 ? ctx->buf : NULL;
+}
+
+/* Load a Lua chunk from 'filename', or from stdin for a NULL filename.
+** On success the compiled chunk is left on the stack; on failure an
+** error message is left instead and an error code is returned.
+*/
+LUALIB_API int luaL_loadfile(lua_State *L, const char *filename)
+{
+  FileReaderCtx ctx;
+  int status;
+  const char *chunkname;
+  if (filename) {
+    /* NOTE(review): opened in text mode ("r"); loading precompiled
+    ** (binary) chunks on platforms with CRLF translation may be
+    ** affected -- TODO confirm intended behavior.
+    */
+    ctx.fp = fopen(filename, "r");
+    if (ctx.fp == NULL) {
+      lua_pushfstring(L, "cannot open %s: %s", filename, strerror(errno));
+      return LUA_ERRFILE;
+    }
+    chunkname = lua_pushfstring(L, "@%s", filename);
+  } else {
+    ctx.fp = stdin;
+    chunkname = "=stdin";
+  }
+  status = lua_load(L, reader_file, &ctx, chunkname);
+  if (ferror(ctx.fp)) {
+    /* Drop lua_load's result (and the chunkname, if pushed) before
+    ** replacing them with the read-error message.
+    */
+    L->top -= filename ? 2 : 1;
+    lua_pushfstring(L, "cannot read %s: %s", chunkname+1, strerror(errno));
+    if (filename)
+      fclose(ctx.fp);
+    return LUA_ERRFILE;
+  }
+  if (filename) {
+    /* Remove the chunkname below the loaded chunk. */
+    L->top--;
+    copyTV(L, L->top-1, L->top);
+    fclose(ctx.fp);
+  }
+  return status;
+}
+
+/* State for the string-based lua_Reader. */
+typedef struct StringReaderCtx {
+  const char *str;
+  size_t size;
+} StringReaderCtx;
+
+/* lua_Reader callback: feed the whole string in a single block. */
+static const char *reader_string(lua_State *L, void *ud, size_t *size)
+{
+  StringReaderCtx *ctx = (StringReaderCtx *)ud;
+  UNUSED(L);
+  if (ctx->size == 0) return NULL;
+  *size = ctx->size;
+  ctx->size = 0;
+  return ctx->str;
+}
+
+/* Load a Lua chunk from the buffer 'buf' with chunk name 'name'. */
+LUALIB_API int luaL_loadbuffer(lua_State *L, const char *buf, size_t size,
+			       const char *name)
+{
+  StringReaderCtx ctx;
+  ctx.str = buf;
+  ctx.size = size;
+  return lua_load(L, reader_string, &ctx, name);
+}
+
+/* Load a Lua chunk from the string 's', using 's' itself as chunk name. */
+LUALIB_API int luaL_loadstring(lua_State *L, const char *s)
+{
+  return luaL_loadbuffer(L, s, strlen(s), s);
+}
+
+/* -- Default allocator and panic function -------------------------------- */
+
+#ifdef LUAJIT_USE_SYSMALLOC
+
+/* lua_Alloc function based on the system allocator (optional build).
+** nsize == 0 frees; otherwise realloc (realloc(NULL,n) acts as malloc).
+*/
+static void *mem_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
+{
+  (void)ud;
+  (void)osize;
+  if (nsize == 0) {
+    free(ptr);
+    return NULL;
+  } else {
+    return realloc(ptr, nsize);
+  }
+}
+
+#define mem_create()	NULL
+
+#else
+
+/* Default build: use LuaJIT's bundled allocator. */
+#include "lj_alloc.h"
+
+#define mem_alloc	lj_alloc_f
+#define mem_create	lj_alloc_create
+
+#endif
+
+/* Default panic handler: report unprotected errors on stderr. */
+static int panic(lua_State *L)
+{
+  fprintf(stderr, "PANIC: unprotected error in call to Lua API (%s)\n",
+	  lua_tostring(L, -1));
+  return 0;
+}
+
+/* Create a new Lua state with the default allocator and panic handler.
+** Returns NULL if state creation fails.
+*/
+LUALIB_API lua_State *luaL_newstate(void)
+{
+  lua_State *L = lua_newstate(mem_alloc, mem_create());
+  if (L) G(L)->panic = panic;
+  return L;
+}
+

+ 560 - 0
src/lib_base.c

@@ -0,0 +1,560 @@
+/*
+** Base and coroutine library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#include <stdio.h>
+
+#define lib_base_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_meta.h"
+#include "lj_state.h"
+#include "lj_ff.h"
+#include "lj_ctype.h"
+#include "lj_lib.h"
+
+/* -- Base library: checks ------------------------------------------------ */
+
+#define LJLIB_MODULE_base
+
+/* C fallback for assert(v [,msg]): only the failure path reaches this
+** handler, which raises the assertion error (with optional message).
+*/
+LJLIB_ASM(assert)		LJLIB_REC(.)
+{
+  GCstr *s;
+  lj_lib_checkany(L, 1);
+  s = lj_lib_optstr(L, 2);
+  if (s)
+    lj_err_callermsg(L, strdata(s));
+  else
+    lj_err_caller(L, LJ_ERR_ASSERT);
+  return FFH_UNREACHABLE;
+}
+
+/* Type name strings pushed in internal tag order for type(). */
+/* ORDER LJ_T */
+LJLIB_PUSH("nil")
+LJLIB_PUSH("boolean")
+LJLIB_PUSH(top-1)  /* boolean */
+LJLIB_PUSH("userdata")
+LJLIB_PUSH("string")
+LJLIB_PUSH("upval")
+LJLIB_PUSH("thread")
+LJLIB_PUSH("proto")
+LJLIB_PUSH("function")
+LJLIB_PUSH("deadkey")
+LJLIB_PUSH("table")
+LJLIB_PUSH(top-8)  /* userdata */
+LJLIB_PUSH("number")
+LJLIB_ASM_(type)		LJLIB_REC(.)
+/* Recycle the lj_lib_checkany(L, 1) from assert. */
+
+/* -- Base library: getters and setters ----------------------------------- */
+
+LJLIB_ASM_(getmetatable)	LJLIB_REC(.)
+/* Recycle the lj_lib_checkany(L, 1) from assert. */
+
+/* setmetatable(t, mt|nil): refuses if the current metatable is protected
+** via a __metatable field.
+*/
+LJLIB_ASM(setmetatable)		LJLIB_REC(.)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  GCtab *mt = lj_lib_checktabornil(L, 2);
+  if (!tvisnil(lj_meta_lookup(L, L->base, MM_metatable)))
+    lj_err_caller(L, LJ_ERR_PROTMT);
+  setgcref(t->metatable, obj2gco(mt));
+  if (mt) { lj_gc_objbarriert(L, t, mt); }  /* GC write barrier. */
+  settabV(L, L->base-1, t);  /* Return the table itself. */
+  return FFH_RES(1);
+}
+
+/* getfenv([f|level]): return the environment of a function or of the
+** function at the given stack level (default level 1).
+*/
+LJLIB_CF(getfenv)
+{
+  GCfunc *fn;
+  cTValue *o = L->base;
+  if (!(o < L->top && tvisfunc(o))) {
+    int level = lj_lib_optint(L, 1, 1);
+    o = lj_err_getframe(L, level, &level);
+    if (o == NULL)
+      lj_err_arg(L, 1, LJ_ERR_INVLVL);
+  }
+  fn = &gcval(o)->fn;
+  settabV(L, L->top++, isluafunc(fn) ? tabref(fn->l.env) : tabref(L->env));
+  return 1;
+}
+
+/* setfenv(f|level, t): set the environment table of a Lua function or
+** of the function at the given level; level 0 sets the thread env.
+*/
+LJLIB_CF(setfenv)
+{
+  GCfunc *fn;
+  GCtab *t = lj_lib_checktab(L, 2);
+  cTValue *o = L->base;
+  if (!(o < L->top && tvisfunc(o))) {
+    int level = lj_lib_checkint(L, 1);
+    if (level == 0) {
+      /* NOBARRIER: A thread (i.e. L) is never black. */
+      setgcref(L->env, obj2gco(t));
+      return 0;
+    }
+    o = lj_err_getframe(L, level, &level);
+    if (o == NULL)
+      lj_err_arg(L, 1, LJ_ERR_INVLVL);
+  }
+  fn = &gcval(o)->fn;
+  if (!isluafunc(fn))
+    lj_err_caller(L, LJ_ERR_SETFENV);  /* C functions have no settable env. */
+  setgcref(fn->l.env, obj2gco(t));
+  lj_gc_objbarrier(L, obj2gco(fn), t);  /* GC write barrier. */
+  setfuncV(L, L->top++, fn);
+  return 1;
+}
+
+/* rawget(t, k): argument checks only; actual lookup happens elsewhere. */
+LJLIB_ASM(rawget)		LJLIB_REC(.)
+{
+  lj_lib_checktab(L, 1);
+  lj_lib_checkany(L, 2);
+  return FFH_UNREACHABLE;
+}
+
+/* rawset(t, k, v): set without invoking metamethods; returns t. */
+LJLIB_CF(rawset)		LJLIB_REC(.)
+{
+  lj_lib_checktab(L, 1);
+  lj_lib_checkany(L, 2);
+  /* Truncate the stack right after the checked 3rd argument slot. */
+  L->top = 1+lj_lib_checkany(L, 3);
+  lua_rawset(L, 1);
+  return 1;
+}
+
+/* rawequal(a, b): primitive equality without metamethods. */
+LJLIB_CF(rawequal)		LJLIB_REC(.)
+{
+  cTValue *o1 = lj_lib_checkany(L, 1);
+  cTValue *o2 = lj_lib_checkany(L, 2);
+  setboolV(L->top-1, lj_obj_equal(o1, o2));
+  return 1;
+}
+
+/* unpack(t [,i [,e]]): return t[i] .. t[e] (defaults i=1, e=#t). */
+LJLIB_CF(unpack)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  int32_t n, i = lj_lib_optint(L, 2, 1);
+  int32_t e = (L->base+3-1 < L->top && !tvisnil(L->base+3-1)) ?
+	      lj_lib_checkint(L, 3) : (int32_t)lj_tab_len(t);
+  if (i > e) return 0;
+  n = e - i + 1;
+  if (n <= 0 || !lua_checkstack(L, n))  /* n <= 0 catches int overflow. */
+    lj_err_caller(L, LJ_ERR_UNPACK);
+  do {
+    cTValue *tv = lj_tab_getint(t, i);
+    if (tv) {
+      copyTV(L, L->top++, tv);
+    } else {
+      setnilV(L->top++);  /* Missing array slots become nil. */
+    }
+  } while (i++ < e);
+  return n;
+}
+
+/* select('#', ...) returns the argument count; select(i, ...) returns
+** all arguments after the i-th (negative i counts from the end).
+*/
+LJLIB_CF(select)
+{
+  int32_t n = (int32_t)(L->top - L->base);
+  if (n >= 1 && tvisstr(L->base) && *strVdata(L->base) == '#') {
+    setintV(L->top-1, n-1);
+    return 1;
+  } else {
+    int32_t i = lj_lib_checkint(L, 1);
+    if (i < 0) i = n + i; else if (i > n) i = n;
+    if (i < 1)
+      lj_err_arg(L, 1, LJ_ERR_IDXRNG);
+    return n - i;
+  }
+}
+
+/* -- Base library: conversions ------------------------------------------- */
+
+/* tonumber(v [,base]): base 10 converts numbers and numeric strings;
+** other bases (2..36) parse the string with strtoul, allowing trailing
+** whitespace.  Returns nil on conversion failure.
+*/
+LJLIB_ASM(tonumber)		LJLIB_REC(.)
+{
+  int32_t base = lj_lib_optint(L, 2, 10);
+  if (base == 10) {
+    TValue *o = lj_lib_checkany(L, 1);
+    if (tvisnum(o) || (tvisstr(o) && lj_str_numconv(strVdata(o), o))) {
+      setnumV(L->base-1, numV(o));
+      return FFH_RES(1);
+    }
+  } else {
+    const char *p = strdata(lj_lib_checkstr(L, 1));
+    char *ep;
+    unsigned long ul;
+    if (base < 2 || base > 36)
+      lj_err_arg(L, 2, LJ_ERR_BASERNG);
+    ul = strtoul(p, &ep, base);
+    if (p != ep) {  /* At least one digit was consumed? */
+      while (lj_ctype_isspace((unsigned char)(*ep))) ep++;
+      if (*ep == '\0') {  /* Only trailing whitespace allowed. */
+	setnumV(L->base-1, cast_num(ul));
+	return FFH_RES(1);
+      }
+    }
+  }
+  setnilV(L->base-1);
+  return FFH_RES(1);
+}
+
+/* tostring(v): honor a __tostring metamethod, otherwise format the
+** value according to its type.
+*/
+LJLIB_ASM(tostring)		LJLIB_REC(.)
+{
+  TValue *o = lj_lib_checkany(L, 1);
+  cTValue *mo;
+  L->top = o+1;  /* Only keep one argument. */
+  if (!tvisnil(mo = lj_meta_lookup(L, o, MM_tostring))) {
+    copyTV(L, L->base-1, mo);  /* Replace callable. */
+    return FFH_RETRY;  /* Retry as a call of the metamethod. */
+  } else {
+    GCstr *s;
+    if (tvisnum(o)) {
+      s = lj_str_fromnum(L, &o->n);
+    } else if (tvisnil(o)) {
+      s = lj_str_newlit(L, "nil");
+    } else if (tvisfalse(o)) {
+      s = lj_str_newlit(L, "false");
+    } else if (tvistrue(o)) {
+      s = lj_str_newlit(L, "true");
+    } else {
+      if (tvisfunc(o) && isffunc(funcV(o)))
+	lua_pushfstring(L, "function: fast#%d", funcV(o)->c.ffid);
+      else
+	lua_pushfstring(L, "%s: %p", typename(o), lua_topointer(L, 1));
+      /* Note: lua_pushfstring calls the GC which may invalidate o. */
+      s = strV(L->top-1);
+    }
+    setstrV(L, L->base-1, s);
+    return FFH_RES(1);
+  }
+}
+
+/* -- Base library: iterators --------------------------------------------- */
+
+/* next(t [,k]): C part performs the argument checks only. */
+LJLIB_ASM(next)
+{
+  lj_lib_checktab(L, 1);
+  lj_lib_checknum(L, 2);  /* For ipairs_aux. */
+  return FFH_UNREACHABLE;
+}
+
+/* pairs(t): closes over the previously defined iterator (lastcl). */
+LJLIB_PUSH(lastcl)
+LJLIB_ASM_(pairs)
+
+/* Internal iterator function returned by ipairs (not registered). */
+LJLIB_NOREGUV LJLIB_ASM_(ipairs_aux)	LJLIB_REC(.)
+
+/* ipairs(t): closes over ipairs_aux (lastcl). */
+LJLIB_PUSH(lastcl)
+LJLIB_ASM_(ipairs)		LJLIB_REC(.)
+
+/* -- Base library: throw and catch errors -------------------------------- */
+
+/* error(msg [,level]): prepend position info to string messages when
+** level > 0, then raise the error.
+*/
+LJLIB_CF(error)
+{
+  int32_t level = lj_lib_optint(L, 2, 1);
+  lua_settop(L, 1);
+  if (lua_isstring(L, 1) && level > 0) {
+    luaL_where(L, level);
+    lua_pushvalue(L, 1);
+    lua_concat(L, 2);  /* "file:line: msg" */
+  }
+  return lua_error(L);
+}
+
+/* pcall/xpcall: C part performs the argument checks only. */
+LJLIB_ASM(pcall)		LJLIB_REC(.)
+{
+  lj_lib_checkany(L, 1);
+  lj_lib_checkfunc(L, 2);  /* For xpcall only. */
+  return FFH_UNREACHABLE;
+}
+LJLIB_ASM_(xpcall)		LJLIB_REC(.)
+
+/* -- Base library: load Lua code ----------------------------------------- */
+
+/* Common return helper for the load functions: on success return the
+** chunk; on failure return nil plus the error message.
+*/
+static int load_aux(lua_State *L, int status)
+{
+  if (status == 0)
+    return 1;
+  copyTV(L, L->top, L->top-1);
+  setnilV(L->top-1);
+  L->top++;
+  return 2;  /* nil, errmsg */
+}
+
+/* loadstring(s [,chunkname]): compile a string as a chunk. */
+LJLIB_CF(loadstring)
+{
+  GCstr *s = lj_lib_checkstr(L, 1);
+  GCstr *name = lj_lib_optstr(L, 2);
+  return load_aux(L,
+	   luaL_loadbuffer(L, strdata(s), s->len, strdata(name ? name : s)));
+}
+
+/* loadfile([filename]): compile a file (or stdin) as a chunk. */
+LJLIB_CF(loadfile)
+{
+  GCstr *fname = lj_lib_optstr(L, 1);
+  return load_aux(L, luaL_loadfile(L, fname ? strdata(fname) : NULL));
+}
+
+/* lua_Reader callback for load(): repeatedly call the user-supplied
+** function and feed its string results to the parser.
+*/
+static const char *reader_func(lua_State *L, void *ud, size_t *size)
+{
+  UNUSED(ud);
+  luaL_checkstack(L, 2, "too many nested functions");
+  copyTV(L, L->top++, L->base);
+  lua_call(L, 0, 1);  /* Call user-supplied function. */
+  L->top--;
+  if (tvisnil(L->top)) {
+    *size = 0;
+    return NULL;  /* End of chunk. */
+  } else if (tvisstr(L->top) || tvisnum(L->top)) {
+    copyTV(L, L->base+2, L->top);  /* Anchor string in reserved stack slot. */
+    return lua_tolstring(L, 3, size);
+  } else {
+    lj_err_caller(L, LJ_ERR_RDRSTR);  /* Reader must return a string. */
+    return NULL;
+  }
+}
+
+/* load(func [,chunkname]): compile a chunk read piecewise from 'func'. */
+LJLIB_CF(load)
+{
+  GCstr *name = lj_lib_optstr(L, 2);
+  lj_lib_checkfunc(L, 1);
+  lua_settop(L, 3);  /* Reserve a slot for the string from the reader. */
+  return load_aux(L,
+	   lua_load(L, reader_func, NULL, name ? strdata(name) : "=(load)"));
+}
+
+/* dofile([filename]): load and run a file, propagating errors. */
+LJLIB_CF(dofile)
+{
+  GCstr *fname = lj_lib_optstr(L, 1);
+  setnilV(L->top);
+  L->top = L->base+1;
+  if (luaL_loadfile(L, fname ? strdata(fname) : NULL) != 0)
+    lua_error(L);
+  lua_call(L, 0, LUA_MULTRET);
+  return (L->top - L->base) - 1;  /* All results of the chunk. */
+}
+
+/* -- Base library: GC control -------------------------------------------- */
+
+/* gcinfo(): total allocated memory in KBytes (integer). */
+LJLIB_CF(gcinfo)
+{
+  setintV(L->top++, (G(L)->gc.total >> 10));
+  return 1;
+}
+
+/* collectgarbage([opt [,data]]): dispatch on the option name.  The
+** option string below is length-prefixed ("\4stop" = "stop" etc.) and
+** must match the LUA_GC* constant order; "\1\377" is an unused filler.
+*/
+LJLIB_CF(collectgarbage)
+{
+  int opt = lj_lib_checkopt(L, 1, LUA_GCCOLLECT,  /* ORDER LUA_GC* */
+    "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul");
+  int32_t data = lj_lib_optint(L, 2, 0);
+  if (opt == LUA_GCCOUNT) {
+    setnumV(L->top-1, cast_num((int32_t)G(L)->gc.total)/1024.0);  /* KBytes. */
+  } else {
+    int res = lua_gc(L, opt, data);
+    if (opt == LUA_GCSTEP)
+      setboolV(L->top-1, res);  /* step returns a completion flag. */
+    else
+      setintV(L->top-1, res);
+  }
+  return 1;
+}
+
+/* -- Base library: miscellaneous functions ------------------------------- */
+
+/* newproxy([arg]): create a zero-size userdata, optionally with a fresh
+** or inherited metatable.  Valid proxy metatables are remembered in the
+** weak table held in upvalue 1.
+*/
+LJLIB_PUSH(top-2)  /* Upvalue holds weak table. */
+LJLIB_CF(newproxy)
+{
+  lua_settop(L, 1);
+  lua_newuserdata(L, 0);
+  if (lua_toboolean(L, 1) == 0) {  /* newproxy(): without metatable. */
+    return 1;
+  } else if (lua_isboolean(L, 1)) {  /* newproxy(true): with metatable. */
+    lua_newtable(L);
+    lua_pushvalue(L, -1);
+    lua_pushboolean(L, 1);
+    lua_rawset(L, lua_upvalueindex(1));  /* Remember mt in weak table. */
+  } else {  /* newproxy(proxy): inherit metatable. */
+    int validproxy = 0;
+    if (lua_getmetatable(L, 1)) {
+      lua_rawget(L, lua_upvalueindex(1));
+      validproxy = lua_toboolean(L, -1);
+      lua_pop(L, 1);
+    }
+    if (!validproxy)
+      lj_err_arg(L, 1, LJ_ERR_NOPROXY);
+    lua_getmetatable(L, 1);
+  }
+  lua_setmetatable(L, 2);
+  return 1;
+}
+
+/* print(...): write tostring() of each argument to stdout, separated by
+** tabs and terminated by a newline.  If the global "tostring" is still
+** the built-in fast function, strings and numbers are formatted inline
+** without a call.
+*/
+LJLIB_PUSH("tostring")
+LJLIB_CF(print)
+{
+  ptrdiff_t i, nargs = L->top - L->base;
+  cTValue *tv = lj_tab_getstr(tabref(L->env), strV(lj_lib_upvalue(L, 1)));
+  int shortcut = (tv && tvisfunc(tv) && funcV(tv)->c.ffid == FF_tostring);
+  copyTV(L, L->top++, tv ? tv : niltv(L));
+  for (i = 0; i < nargs; i++) {
+    const char *str;
+    size_t size;
+    cTValue *o = &L->base[i];
+    if (shortcut && tvisstr(o)) {
+      str = strVdata(o);
+      size = strV(o)->len;
+    } else if (shortcut && tvisnum(o)) {
+      char buf[LUAI_MAXNUMBER2STR];
+      lua_Number n = numV(o);
+      size = (size_t)lua_number2str(buf, n);
+      str = buf;
+    } else {
+      /* Slow path: call tostring(o) and use its string result. */
+      copyTV(L, L->top+1, o);
+      copyTV(L, L->top, L->top-1);
+      L->top += 2;
+      lua_call(L, 1, 1);
+      str = lua_tolstring(L, -1, &size);
+      if (!str)
+	lj_err_caller(L, LJ_ERR_PRTOSTR);
+      L->top--;
+    }
+    if (i)
+      putchar('\t');
+    fwrite(str, 1, size, stdout);
+  }
+  putchar('\n');
+  return 0;
+}
+
+LJLIB_PUSH(top-3)
+LJLIB_SET(_VERSION)
+
+#include "lj_libdef.h"
+
+/* -- Coroutine library --------------------------------------------------- */
+
+#define LJLIB_MODULE_coroutine
+
+/* coroutine.status(co): "running", "suspended", "normal" or "dead",
+** derived from the coroutine's status field and stack state.
+*/
+LJLIB_CF(coroutine_status)
+{
+  const char *s;
+  lua_State *co;
+  if (!(L->top > L->base && tvisthread(L->base)))
+    lj_err_arg(L, 1, LJ_ERR_NOCORO);
+  co = threadV(L->base);
+  if (co == L) s = "running";
+  else if (co->status == LUA_YIELD) s = "suspended";
+  else if (co->status != 0) s = "dead";  /* Terminated with an error. */
+  else if (co->base > co->stack+1) s = "normal";  /* Has an active frame. */
+  else if (co->top == co->base) s = "dead";  /* Finished normally. */
+  else s = "suspended";  /* Not yet started. */
+  lua_pushstring(L, s);
+  return 1;
+}
+
+/* coroutine.running(): the running coroutine, or nil for the main thread. */
+LJLIB_CF(coroutine_running)
+{
+  if (lua_pushthread(L))
+    setnilV(L->top++);
+  return 1;
+}
+
+/* coroutine.create(f): new coroutine with Lua function 'f' as body. */
+LJLIB_CF(coroutine_create)
+{
+  lua_State *L1 = lua_newthread(L);
+  if (!(L->top > L->base && tvisfunc(L->base) && isluafunc(funcV(L->base))))
+    lj_err_arg(L, 1, LJ_ERR_NOLFUNC);  /* C functions are not allowed. */
+  setfuncV(L, L1->top++, funcV(L->base));
+  return 1;
+}
+
+/* coroutine.yield(...): the C path is only reached when yielding is
+** not possible here; raise the corresponding error.
+*/
+LJLIB_ASM(coroutine_yield)
+{
+  lj_err_caller(L, LJ_ERR_CYIELD);
+  return FFH_UNREACHABLE;
+}
+
+/* Common resume precondition check.  Returns FFH_RETRY if 'co' is
+** resumable (after growing its stack for the passed arguments).  For a
+** dead or already-running coroutine: raise (wrap) or return false plus
+** an error message (resume).
+*/
+static int ffh_resume(lua_State *L, lua_State *co, int wrap)
+{
+  if (co->cframe != NULL || co->status > LUA_YIELD ||
+      (co->status == 0 && co->top == co->base)) {
+    ErrMsg em = co->cframe ? LJ_ERR_CORUN : LJ_ERR_CODEAD;
+    if (wrap) lj_err_caller(L, em);
+    setboolV(L->base-1, 0);
+    setstrV(L, L->base, lj_err_str(L, em));
+    return FFH_RES(2);
+  }
+  lj_state_growstack(co, (MSize)(L->top - L->base - 1));
+  return FFH_RETRY;
+}
+
+/* coroutine.resume(co, ...). */
+LJLIB_ASM(coroutine_resume)
+{
+  if (!(L->top > L->base && tvisthread(L->base)))
+    lj_err_arg(L, 1, LJ_ERR_NOCORO);
+  return ffh_resume(L, threadV(L->base), 0);
+}
+
+/* Hidden resume function used by coroutine.wrap closures; the wrapped
+** coroutine is stored in upvalue 1.
+*/
+LJLIB_NOREG LJLIB_ASM(coroutine_wrap_aux)
+{
+  return ffh_resume(L, threadV(lj_lib_upvalue(L, 1)), 1);
+}
+
+/* Inline declarations. */
+LJ_ASMF void lj_ff_coroutine_wrap_aux(void);
+LJ_FUNCA_NORET void lj_ffh_coroutine_wrap_err(lua_State *L, lua_State *co);
+
+/* Error handler, called from assembler VM. */
+void lj_ffh_coroutine_wrap_err(lua_State *L, lua_State *co)
+{
+  /* Move the error object from the coroutine to the caller, then rethrow. */
+  co->top--; copyTV(L, L->top, co->top); L->top++;
+  if (tvisstr(L->top-1))
+    lj_err_callermsg(L, strVdata(L->top-1));
+  else
+    lj_err_run(L);
+}
+
+/* coroutine.wrap(f): create a coroutine and return a closure that
+** resumes it, rethrowing any error in the caller.
+*/
+LJLIB_CF(coroutine_wrap)
+{
+  GCfunc *fn;
+  lj_cf_coroutine_create(L);
+  lua_pushcclosure(L, lj_ffh_coroutine_wrap_aux, 1);
+  fn = funcV(L->top-1);
+  fn->c.gate = lj_ff_coroutine_wrap_aux;  /* Use the fast assembler gate. */
+  fn->c.ffid = FF_coroutine_wrap_aux;
+  return 1;
+}
+
+#include "lj_libdef.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* Push the weak table used by newproxy() to validate proxy metatables.
+** The table is its own metatable with __mode = "kv" (fully weak).
+*/
+static void newproxy_weaktable(lua_State *L)
+{
+  /* NOBARRIER: The table is new (marked white). */
+  GCtab *t = lj_tab_new(L, 0, 1);
+  settabV(L, L->top++, t);
+  setgcref(t->metatable, obj2gco(t));
+  setstrV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "__mode")),
+	    lj_str_newlit(L, "kv"));
+  t->nomm = cast_byte(~(1u<<MM_mode));  /* Only __mode is present. */
+}
+
+/* Open the base library (registered into _G) and the coroutine library. */
+LUALIB_API int luaopen_base(lua_State *L)
+{
+  /* NOBARRIER: Table and value are the same. */
+  GCtab *env = tabref(L->env);
+  settabV(L, lj_tab_setstr(L, env, lj_str_newlit(L, "_G")), env);
+  lua_pushliteral(L, LUA_VERSION);  /* top-3. */
+  newproxy_weaktable(L);  /* top-2. */
+  LJ_LIB_REG_(L, "_G", base);
+  LJ_LIB_REG(L, coroutine);
+  return 2;
+}
+

+ 74 - 0
src/lib_bit.c

@@ -0,0 +1,74 @@
+/*
+** Bit manipulation library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lib_bit_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_bit
+
+/* Fast-function stubs for the bit.* operations. The C bodies only perform
+** the argument checks; after FFH_RETRY the assembler VM re-executes the
+** fast path. LJLIB_ASM_ entries share the preceding stub's check code. */
+LJLIB_ASM(bit_tobit)		LJLIB_REC(bit_unary IR_TOBIT)
+{
+  lj_lib_checknum(L, 1);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(bit_bnot)		LJLIB_REC(bit_unary IR_BNOT)
+LJLIB_ASM_(bit_bswap)		LJLIB_REC(bit_unary IR_BSWAP)
+
+LJLIB_ASM(bit_lshift)		LJLIB_REC(bit_shift IR_BSHL)
+{
+  lj_lib_checknum(L, 1);
+  lj_lib_checknum(L, 2);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(bit_rshift)		LJLIB_REC(bit_shift IR_BSHR)
+LJLIB_ASM_(bit_arshift)		LJLIB_REC(bit_shift IR_BSAR)
+LJLIB_ASM_(bit_rol)		LJLIB_REC(bit_shift IR_BROL)
+LJLIB_ASM_(bit_ror)		LJLIB_REC(bit_shift IR_BROR)
+
+/* N-ary ops: check every argument (at least one). */
+LJLIB_ASM(bit_band)		LJLIB_REC(bit_nary IR_BAND)
+{
+  int i = 0;
+  do { lj_lib_checknum(L, ++i); } while (L->base+i < L->top);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(bit_bor)		LJLIB_REC(bit_nary IR_BOR)
+LJLIB_ASM_(bit_bxor)		LJLIB_REC(bit_nary IR_BXOR)
+
+/* ------------------------------------------------------------------------ */
+
+/* bit.tohex(b [,n]): hex string of the low |n| nibbles of b (default 8).
+** Negative n selects uppercase digits; |n| is clamped to 8, so buf[8] is
+** always sufficient. */
+LJLIB_CF(bit_tohex)
+{
+  uint32_t b = (uint32_t)lj_num2bit(lj_lib_checknum(L, 1));
+  int32_t i, n = L->base+1 >= L->top ? 8 : lj_num2bit(lj_lib_checknum(L, 2));
+  const char *hexdigits = "0123456789abcdef";
+  char buf[8];
+  if (n < 0) { n = -n; hexdigits = "0123456789ABCDEF"; }
+  if (n > 8) n = 8;
+  for (i = n; --i >= 0; ) { buf[i] = hexdigits[b & 15]; b >>= 4; }  /* LSB last. */
+  lua_pushlstring(L, buf, (size_t)n);
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* Register the bit library. */
+LUALIB_API int luaopen_bit(lua_State *L)
+{
+  LJ_LIB_REG(L, bit);
+  return 1;
+}
+

+ 366 - 0
src/lib_debug.c

@@ -0,0 +1,366 @@
+/*
+** Debug library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lib_debug_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_debug
+
+/* debug.getregistry(): push the registry table. */
+LJLIB_CF(debug_getregistry)
+{
+  copyTV(L, L->top++, registry(L));
+  return 1;
+}
+
+/* debug.getmetatable(o): metatable or nil (never errors on missing mt). */
+LJLIB_CF(debug_getmetatable)
+{
+  lj_lib_checkany(L, 1);
+  if (!lua_getmetatable(L, 1)) {
+    setnilV(L->top-1);  /* No metatable: return nil. */
+  }
+  return 1;
+}
+
+/* debug.setmetatable(o, mt|nil): unconditionally set, returns true. */
+LJLIB_CF(debug_setmetatable)
+{
+  lj_lib_checktabornil(L, 2);
+  L->top = L->base+2;  /* Drop extra args so arg 2 is on top. */
+  lua_setmetatable(L, 1);
+  setboolV(L->top-1, 1);
+  return 1;
+}
+
+/* debug.getfenv(o): environment table of any object. */
+LJLIB_CF(debug_getfenv)
+{
+  lj_lib_checkany(L, 1);
+  lua_getfenv(L, 1);
+  return 1;
+}
+
+/* debug.setfenv(o, t): set environment; errors if o cannot take one. */
+LJLIB_CF(debug_setfenv)
+{
+  lj_lib_checktab(L, 2);
+  L->top = L->base+2;
+  if (!lua_setfenv(L, 1))
+    lj_err_caller(L, LJ_ERR_SETFENV);
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Set a string field in the table at stack index -2. */
+static void settabss(lua_State *L, const char *i, const char *v)
+{
+  lua_pushstring(L, v);
+  lua_setfield(L, -2, i);
+}
+
+/* Set an integer field in the table at stack index -2. */
+static void settabsi(lua_State *L, const char *i, int v)
+{
+  lua_pushinteger(L, v);
+  lua_setfield(L, -2, i);
+}
+
+/* Optional leading thread argument: returns the target coroutine and sets
+** *arg to the offset of the remaining arguments (1 if a thread was given). */
+static lua_State *getthread(lua_State *L, int *arg)
+{
+  if (L->base < L->top && tvisthread(L->base)) {
+    *arg = 1;
+    return threadV(L->base);
+  } else {
+    *arg = 0;
+    return L;
+  }
+}
+
+/* Move the value produced on L1's stack (or below the result table on L's
+** own stack) into field fname of the result table. */
+static void treatstackoption(lua_State *L, lua_State *L1, const char *fname)
+{
+  if (L == L1) {
+    lua_pushvalue(L, -2);
+    lua_remove(L, -3);
+  }
+  else
+    lua_xmove(L1, L, 1);  /* Value lives on the other coroutine's stack. */
+  lua_setfield(L, -2, fname);
+}
+
+/* debug.getinfo([thread,] f|level [,what]): build an info table from
+** lua_getinfo(). Accepts either a stack level (number) or a function. */
+LJLIB_CF(debug_getinfo)
+{
+  lua_Debug ar;
+  int arg;
+  lua_State *L1 = getthread(L, &arg);
+  const char *options = luaL_optstring(L, arg+2, "flnSu");
+  if (lua_isnumber(L, arg+1)) {
+    if (!lua_getstack(L1, (int)lua_tointeger(L, arg+1), &ar)) {
+      setnilV(L->top-1);  /* Level out of range: return nil. */
+      return 1;
+    }
+  } else if (L->base+arg < L->top && tvisfunc(L->base+arg)) {
+    options = lua_pushfstring(L, ">%s", options);  /* '>' = function on stack. */
+    setfuncV(L1, L1->top++, funcV(L->base+arg));
+  } else {
+    lj_err_arg(L, arg+1, LJ_ERR_NOFUNCL);
+  }
+  if (!lua_getinfo(L1, options, &ar))
+    lj_err_arg(L, arg+2, LJ_ERR_INVOPT);
+  lua_createtable(L, 0, 16);  /* Result table; hash sized for all fields. */
+  if (strchr(options, 'S')) {
+    settabss(L, "source", ar.source);
+    settabss(L, "short_src", ar.short_src);
+    settabsi(L, "linedefined", ar.linedefined);
+    settabsi(L, "lastlinedefined", ar.lastlinedefined);
+    settabss(L, "what", ar.what);
+  }
+  if (strchr(options, 'l'))
+    settabsi(L, "currentline", ar.currentline);
+  if (strchr(options, 'u'))
+    settabsi(L, "nups", ar.nups);
+  if (strchr(options, 'n')) {
+    settabss(L, "name", ar.name);
+    settabss(L, "namewhat", ar.namewhat);
+  }
+  if (strchr(options, 'L'))
+    treatstackoption(L, L1, "activelines");
+  if (strchr(options, 'f'))
+    treatstackoption(L, L1, "func");
+  return 1;  /* return table */
+}
+
+/* debug.getlocal([thread,] level, n): returns name, value or nil. */
+LJLIB_CF(debug_getlocal)
+{
+  int arg;
+  lua_State *L1 = getthread(L, &arg);
+  lua_Debug ar;
+  const char *name;
+  if (!lua_getstack(L1, lj_lib_checkint(L, arg+1), &ar))
+    lj_err_arg(L, arg+1, LJ_ERR_LVLRNG);
+  name = lua_getlocal(L1, &ar, lj_lib_checkint(L, arg+2));
+  if (name) {
+    lua_xmove(L1, L, 1);  /* Bring the value over from the target thread. */
+    lua_pushstring(L, name);
+    lua_pushvalue(L, -2);
+    return 2;  /* name, value */
+  } else {
+    setnilV(L->top-1);
+    return 1;
+  }
+}
+
+/* debug.setlocal([thread,] level, n, value): returns the local's name. */
+LJLIB_CF(debug_setlocal)
+{
+  int arg;
+  lua_State *L1 = getthread(L, &arg);
+  lua_Debug ar;
+  TValue *tv;
+  if (!lua_getstack(L1, lj_lib_checkint(L, arg+1), &ar))
+    lj_err_arg(L, arg+1, LJ_ERR_LVLRNG);
+  tv = lj_lib_checkany(L, arg+3);
+  copyTV(L1, L1->top++, tv);  /* lua_setlocal pops the value from L1. */
+  lua_pushstring(L, lua_setlocal(L1, &ar, lj_lib_checkint(L, arg+2)));
+  return 1;
+}
+
+/* Shared body for debug.getupvalue/setupvalue. Only Lua functions have
+** named upvalues here; returns 0 results otherwise. */
+static int debug_getupvalue(lua_State *L, int get)
+{
+  int32_t n = lj_lib_checkint(L, 2);
+  if (isluafunc(lj_lib_checkfunc(L, 1))) {
+    const char *name = get ? lua_getupvalue(L, 1, n) : lua_setupvalue(L, 1, n);
+    if (name) {
+      lua_pushstring(L, name);
+      if (!get) return 1;  /* set: only the name. */
+      copyTV(L, L->top, L->top-2);  /* get: also return the value. */
+      L->top++;
+      return 2;
+    }
+  }
+  return 0;
+}
+
+LJLIB_CF(debug_getupvalue)
+{
+  return debug_getupvalue(L, 1);
+}
+
+LJLIB_CF(debug_setupvalue)
+{
+  lj_lib_checkany(L, 3);
+  return debug_getupvalue(L, 0);
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Address of this variable is the unique registry key for the hook function. */
+static const char KEY_HOOK = 'h';
+
+/* C hook trampoline: looks up the Lua hook in the registry and calls it
+** with (event-name, currentline|nil). */
+static void hookf(lua_State *L, lua_Debug *ar)
+{
+  static const char *const hooknames[] =
+    {"call", "return", "line", "count", "tail return"};
+  lua_pushlightuserdata(L, (void *)&KEY_HOOK);
+  lua_rawget(L, LUA_REGISTRYINDEX);
+  if (lua_isfunction(L, -1)) {
+    lua_pushstring(L, hooknames[(int)ar->event]);
+    if (ar->currentline >= 0)
+      lua_pushinteger(L, ar->currentline);
+    else lua_pushnil(L);
+    lua_call(L, 2, 0);
+  }
+}
+
+/* Convert "crl"-style mask string plus count into a LUA_MASK* bitmask. */
+static int makemask(const char *smask, int count)
+{
+  int mask = 0;
+  if (strchr(smask, 'c')) mask |= LUA_MASKCALL;
+  if (strchr(smask, 'r')) mask |= LUA_MASKRET;
+  if (strchr(smask, 'l')) mask |= LUA_MASKLINE;
+  if (count > 0) mask |= LUA_MASKCOUNT;
+  return mask;
+}
+
+/* Inverse of makemask; smask needs room for at most "crl" + NUL (4 chars). */
+static char *unmakemask(int mask, char *smask)
+{
+  int i = 0;
+  if (mask & LUA_MASKCALL) smask[i++] = 'c';
+  if (mask & LUA_MASKRET) smask[i++] = 'r';
+  if (mask & LUA_MASKLINE) smask[i++] = 'l';
+  smask[i] = '\0';
+  return smask;
+}
+
+/* debug.sethook([thread,] hook, mask [,count]) or sethook() to clear.
+** Stores the Lua hook under KEY_HOOK in the registry and installs hookf. */
+LJLIB_CF(debug_sethook)
+{
+  int arg, mask, count;
+  lua_Hook func;
+  (void)getthread(L, &arg);
+  if (lua_isnoneornil(L, arg+1)) {
+    lua_settop(L, arg+1);
+    func = NULL; mask = 0; count = 0;  /* turn off hooks */
+  } else {
+    const char *smask = luaL_checkstring(L, arg+2);
+    luaL_checktype(L, arg+1, LUA_TFUNCTION);
+    count = luaL_optint(L, arg+3, 0);
+    func = hookf; mask = makemask(smask, count);
+  }
+  lua_pushlightuserdata(L, (void *)&KEY_HOOK);
+  lua_pushvalue(L, arg+1);
+  lua_rawset(L, LUA_REGISTRYINDEX);  /* Remember hook (or nil) for hookf. */
+  lua_sethook(L, func, mask, count);
+  return 0;
+}
+
+/* debug.gethook(): returns hook, mask-string, count. */
+LJLIB_CF(debug_gethook)
+{
+  char buff[5];  /* Holds at most "crl" + NUL. */
+  int mask = lua_gethookmask(L);
+  lua_Hook hook = lua_gethook(L);
+  if (hook != NULL && hook != hookf) {  /* external hook? */
+    lua_pushliteral(L, "external hook");
+  } else {
+    lua_pushlightuserdata(L, (void *)&KEY_HOOK);
+    lua_rawget(L, LUA_REGISTRYINDEX);   /* get hook */
+  }
+  lua_pushstring(L, unmakemask(mask, buff));
+  lua_pushinteger(L, lua_gethookcount(L));
+  return 3;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* debug.debug(): interactive REPL on stdin/stderr until "cont" or EOF.
+** Each line is compiled and run as a chunk; errors are printed, not raised. */
+LJLIB_CF(debug_debug)
+{
+  for (;;) {
+    char buffer[250];
+    fputs("lua_debug> ", stderr);
+    if (fgets(buffer, sizeof(buffer), stdin) == 0 ||
+	strcmp(buffer, "cont\n") == 0)
+      return 0;
+    if (luaL_loadbuffer(L, buffer, strlen(buffer), "=(debug command)") ||
+	lua_pcall(L, 0, 0, 0)) {
+      fputs(lua_tostring(L, -1), stderr);
+      fputs("\n", stderr);
+    }
+    lua_settop(L, 0);  /* remove eventual returns */
+  }
+}
+
+/* ------------------------------------------------------------------------ */
+
+#define LEVELS1	12	/* size of the first part of the stack */
+#define LEVELS2	10	/* size of the second part of the stack */
+
+/* debug.traceback([thread,] [msg [,level]]): build a textual backtrace.
+** Deep stacks are elided: LEVELS1 top frames, "...", LEVELS2 bottom frames. */
+LJLIB_CF(debug_traceback)
+{
+  int level;
+  int firstpart = 1;  /* still before eventual `...' */
+  int arg;
+  lua_State *L1 = getthread(L, &arg);
+  lua_Debug ar;
+  if (lua_isnumber(L, arg+2)) {
+    level = (int)lua_tointeger(L, arg+2);
+    lua_pop(L, 1);
+  }
+  else
+    level = (L == L1) ? 1 : 0;  /* level 0 may be this own function */
+  if (lua_gettop(L) == arg)
+    lua_pushliteral(L, "");
+  else if (!lua_isstring(L, arg+1)) return 1;  /* message is not a string */
+  else lua_pushliteral(L, "\n");
+  lua_pushliteral(L, "stack traceback:");
+  while (lua_getstack(L1, level++, &ar)) {
+    if (level > LEVELS1 && firstpart) {
+      /* no more than `LEVELS2' more levels? */
+      if (!lua_getstack(L1, level+LEVELS2, &ar)) {
+	level--;  /* keep going */
+      } else {
+	lua_pushliteral(L, "\n\t...");  /* too many levels */
+	/* This only works with LuaJIT 2.x. Avoids O(n^2) behaviour. */
+	lua_getstack(L1, -10, &ar);
+	level = ar.i_ci - LEVELS2;
+      }
+      firstpart = 0;
+      continue;
+    }
+    lua_pushliteral(L, "\n\t");
+    lua_getinfo(L1, "Snl", &ar);
+    lua_pushfstring(L, "%s:", ar.short_src);
+    if (ar.currentline > 0)
+      lua_pushfstring(L, "%d:", ar.currentline);
+    if (*ar.namewhat != '\0') {  /* is there a name? */
+      lua_pushfstring(L, " in function " LUA_QS, ar.name);
+    } else {
+      if (*ar.what == 'm')  /* main? */
+	lua_pushfstring(L, " in main chunk");
+      else if (*ar.what == 'C' || *ar.what == 't')
+	lua_pushliteral(L, " ?");  /* C function or tail call */
+      else
+	lua_pushfstring(L, " in function <%s:%d>",
+			ar.short_src, ar.linedefined);
+    }
+    lua_concat(L, lua_gettop(L) - arg);  /* Keep stack usage bounded per frame. */
+  }
+  lua_concat(L, lua_gettop(L) - arg);
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* Register the debug library. */
+LUALIB_API int luaopen_debug(lua_State *L)
+{
+  LJ_LIB_REG(L, debug);
+  return 1;
+}
+

+ 37 - 0
src/lib_init.c

@@ -0,0 +1,37 @@
+/*
+** Library initialization.
+** Major parts taken verbatim from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lib_init_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+/* All standard libraries, opened in order by luaL_openlibs(). */
+static const luaL_Reg lualibs[] = {
+  { "",			luaopen_base },
+  { LUA_LOADLIBNAME,	luaopen_package },
+  { LUA_TABLIBNAME,	luaopen_table },
+  { LUA_IOLIBNAME,	luaopen_io },
+  { LUA_OSLIBNAME,	luaopen_os },
+  { LUA_STRLIBNAME,	luaopen_string },
+  { LUA_MATHLIBNAME,	luaopen_math },
+  { LUA_DBLIBNAME,	luaopen_debug },
+  { LUA_BITLIBNAME,	luaopen_bit },
+  { LUA_JITLIBNAME,	luaopen_jit },
+  { NULL,		NULL }
+};
+
+/* Open all standard libraries. Each opener is called with the library
+** name as its single argument (per Lua 5.1 convention). */
+LUALIB_API void luaL_openlibs(lua_State *L)
+{
+  const luaL_Reg *lib = lualibs;
+  for (; lib->func; lib++) {
+    lua_pushcfunction(L, lib->func);
+    lua_pushstring(L, lib->name);
+    lua_call(L, 1, 0);
+  }
+}
+

+ 538 - 0
src/lib_io.c

@@ -0,0 +1,538 @@
+/*
+** I/O library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#include <errno.h>
+#include <stdio.h>
+
+#define lib_io_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_gc.h"
+#include "lj_ff.h"
+#include "lj_lib.h"
+
+/* Index of standard handles in function environment. */
+#define IO_INPUT	1
+#define IO_OUTPUT	2
+
+/* -- Error handling ------------------------------------------------------ */
+
+/* Standard Lua I/O result convention: true on success, or
+** nil, message, errno on failure. errno is captured first because
+** subsequent Lua API calls may change it. */
+static int io_pushresult(lua_State *L, int ok, const char *fname)
+{
+  if (ok) {
+    setboolV(L->top++, 1);
+    return 1;
+  } else {
+    int en = errno;  /* Lua API calls may change this value. */
+    lua_pushnil(L);
+    if (fname)
+      lua_pushfstring(L, "%s: %s", fname, strerror(en));
+    else
+      lua_pushfstring(L, "%s", strerror(en));
+    lua_pushinteger(L, en);
+    return 3;
+  }
+}
+
+/* Raise an argument error of the form "fname: strerror(errno)". */
+static void io_file_error(lua_State *L, int arg, const char *fname)
+{
+  lua_pushfstring(L, "%s: %s", fname, strerror(errno));
+  luaL_argerror(L, arg, lua_tostring(L, -1));
+}
+
+/* -- Open helpers -------------------------------------------------------- */
+
+/* File handles are userdata holding a FILE* (NULL once closed). */
+#define io_tofilep(L)	((FILE **)luaL_checkudata(L, 1, LUA_FILEHANDLE))
+
+/* Check arg 1 is an *open* file handle and return its FILE*. */
+static FILE *io_tofile(lua_State *L)
+{
+  FILE **f = io_tofilep(L);
+  if (*f == NULL)
+    lj_err_caller(L, LJ_ERR_IOCLFL);
+  return *f;
+}
+
+/* Push a fresh closed file-handle userdata with the FILEHANDLE metatable.
+** *pf is set to NULL first so a GC before fopen() won't close garbage. */
+static FILE **io_file_new(lua_State *L)
+{
+  FILE **pf = (FILE **)lua_newuserdata(L, sizeof(FILE *));
+  *pf = NULL;
+  luaL_getmetatable(L, LUA_FILEHANDLE);
+  lua_setmetatable(L, -2);
+  return pf;
+}
+
+/* -- Close helpers ------------------------------------------------------- */
+
+/* __close for stdin/stdout/stderr: refuse to close. */
+static int lj_cf_io_std_close(lua_State *L)
+{
+  lua_pushnil(L);
+  lua_pushliteral(L, "cannot close standard file");
+  return 2;
+}
+
+/* __close for popen() handles: pclose (platform-dependent). */
+static int lj_cf_io_pipe_close(lua_State *L)
+{
+  FILE **p = io_tofilep(L);
+#if defined(LUA_USE_POSIX)
+  int ok = (pclose(*p) != -1);
+#elif defined(LUA_USE_WIN)
+  int ok = (_pclose(*p) != -1);
+#else
+  int ok = 0;
+#endif
+  *p = NULL;  /* Mark closed regardless of pclose result. */
+  return io_pushresult(L, ok, NULL);
+}
+
+/* __close for regular files. */
+static int lj_cf_io_file_close(lua_State *L)
+{
+  FILE **p = io_tofilep(L);
+  int ok = (fclose(*p) == 0);
+  *p = NULL;
+  return io_pushresult(L, ok, NULL);
+}
+
+/* Dispatch to the handle's environment-specific __close function. */
+static int io_file_close(lua_State *L)
+{
+  lua_getfenv(L, 1);
+  lua_getfield(L, -1, "__close");
+  return (lua_tocfunction(L, -1))(L);
+}
+
+/* -- Read/write helpers -------------------------------------------------- */
+
+/* "*n": scan one number; returns 1 on success, 0 on failure. */
+static int io_file_readnum(lua_State *L, FILE *fp)
+{
+  lua_Number d;
+  if (fscanf(fp, LUA_NUMBER_SCAN, &d) == 1) {
+    lua_pushnumber(L, d);
+    return 1;
+  } else {
+    return 0;  /* read fails */
+  }
+}
+
+/* read(0): push "" and report whether the stream is at EOF. */
+static int test_eof(lua_State *L, FILE *fp)
+{
+  int c = getc(fp);
+  ungetc(c, fp);  /* Peek only; put the char back. */
+  lua_pushlstring(L, NULL, 0);
+  return (c != EOF);
+}
+
+/* "*l": read one line (without the trailing newline). Returns 1 if
+** anything (even an empty line) was read, 0 at EOF. */
+static int io_file_readline(lua_State *L, FILE *fp)
+{
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  for (;;) {
+    size_t len;
+    char *p = luaL_prepbuffer(&b);
+    if (fgets(p, LUAL_BUFFERSIZE, fp) == NULL) {  /* EOF? */
+      luaL_pushresult(&b);
+      return (strV(L->top-1)->len > 0);  /* Anything read? */
+    }
+    len = strlen(p);
+    if (len == 0 || p[len-1] != '\n') {  /* Partial line? */
+      luaL_addsize(&b, len);
+    } else {
+      luaL_addsize(&b, len - 1);  /* Don't include EOL. */
+      luaL_pushresult(&b);
+      return 1;  /* Got at least an EOL. */
+    }
+  }
+}
+
+/* read(n) / "*a": read up to n bytes in LUAL_BUFFERSIZE chunks. */
+static int io_file_readchars(lua_State *L, FILE *fp, size_t n)
+{
+  size_t rlen;  /* how much to read */
+  size_t nr;  /* number of chars actually read */
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  rlen = LUAL_BUFFERSIZE;  /* try to read that much each time */
+  do {
+    char *p = luaL_prepbuffer(&b);
+    if (rlen > n) rlen = n;  /* cannot read more than asked */
+    nr = fread(p, 1, rlen, fp);
+    luaL_addsize(&b, nr);
+    n -= nr;  /* still have to read `n' chars */
+  } while (n > 0 && nr == rlen);  /* until end of count or eof */
+  luaL_pushresult(&b);  /* close buffer */
+  return (n == 0 || lua_objlen(L, -1) > 0);
+}
+
+/* Shared body of f:read(...) and io.read(...). `start` is the index of the
+** first format argument. No formats means read one line. Returns the
+** number of results pushed. */
+static int io_file_read(lua_State *L, FILE *fp, int start)
+{
+  int ok, n, nargs = (L->top - L->base) - start;
+  clearerr(fp);
+  if (nargs == 0) {
+    ok = io_file_readline(L, fp);
+    n = start+1;  /* Return 1 result. */
+  } else {
+    /* The results plus the buffers go on top of the args. */
+    luaL_checkstack(L, nargs+LUA_MINSTACK, "too many arguments");
+    ok = 1;
+    for (n = start; nargs-- && ok; n++) {
+      if (tvisstr(L->base+n)) {
+	const char *p = strVdata(L->base+n);
+	if (p[0] != '*')
+	  lj_err_arg(L, n+1, LJ_ERR_INVOPT);
+	if (p[1] == 'n')
+	  ok = io_file_readnum(L, fp);
+	else if (p[1] == 'l')
+	  ok = io_file_readline(L, fp);
+	else if (p[1] == 'a')
+	  io_file_readchars(L, fp, ~((size_t)0));  /* "*a" never fails. */
+	else
+	  lj_err_arg(L, n+1, LJ_ERR_INVFMT);
+      } else if (tvisnum(L->base+n)) {
+	size_t len = (size_t)lj_lib_checkint(L, n+1);
+	ok = len ? io_file_readchars(L, fp, len) : test_eof(L, fp);
+      } else {
+	lj_err_arg(L, n+1, LJ_ERR_INVOPT);
+      }
+    }
+  }
+  if (ferror(fp))
+    return io_pushresult(L, 0, NULL);
+  if (!ok)
+    setnilV(L->top-1);  /* Replace last result with nil. */
+  return n - start;
+}
+
+/* Shared body of f:write(...) and io.write(...): write every argument
+** (strings verbatim, numbers via LUA_NUMBER_FMT); anything else errors. */
+static int io_file_write(lua_State *L, FILE *fp, int start)
+{
+  cTValue *tv;
+  int status = 1;
+  for (tv = L->base+start; tv < L->top; tv++) {
+    if (tvisstr(tv)) {
+      MSize len = strV(tv)->len;
+      status = status && (fwrite(strVdata(tv), 1, len, fp) == len);
+    } else if (tvisnum(tv)) {
+      status = status && (fprintf(fp, LUA_NUMBER_FMT, numV(tv)) > 0);
+    } else {
+      lj_lib_checkstr(L, tv-L->base+1);  /* Raises the type error. */
+    }
+  }
+  return io_pushresult(L, status, NULL);
+}
+
+/* -- I/O file methods ---------------------------------------------------- */
+
+#define LJLIB_MODULE_io_method
+
+/* f:close() — with no argument, closes the default output file. */
+LJLIB_CF(io_method_close)
+{
+  if (lua_isnone(L, 1))
+    lua_rawgeti(L, LUA_ENVIRONINDEX, IO_OUTPUT);
+  io_tofile(L);  /* Validate it's an open handle. */
+  return io_file_close(L);
+}
+
+/* f:read(...) */
+LJLIB_CF(io_method_read)
+{
+  return io_file_read(L, io_tofile(L), 1);
+}
+
+/* f:write(...) */
+LJLIB_CF(io_method_write)
+{
+  return io_file_write(L, io_tofile(L), 1);
+}
+
+/* f:flush() */
+LJLIB_CF(io_method_flush)
+{
+  return io_pushresult(L, fflush(io_tofile(L)) == 0, NULL);
+}
+
+/* f:seek([whence ["set"|"cur"|"end"] [,offset]]): returns new position.
+** Uses 64-bit seek/tell where the platform provides it. */
+LJLIB_CF(io_method_seek)
+{
+  FILE *fp = io_tofile(L);
+  int opt = lj_lib_checkopt(L, 2, 1, "\3set\3cur\3end");  /* Default "cur". */
+  lua_Number ofs;
+  int res;
+  if (opt == 0) opt = SEEK_SET;
+  else if (opt == 1) opt = SEEK_CUR;
+  else if (opt == 2) opt = SEEK_END;
+  lj_lib_opt(L, 3,
+    ofs = lj_lib_checknum(L, 3);
+    ,
+    ofs = 0;
+  )
+#if defined(LUA_USE_POSIX)
+  res = fseeko(fp, (int64_t)ofs, opt);
+#elif _MSC_VER >= 1400
+  res = _fseeki64(fp, (int64_t)ofs, opt);
+#elif defined(__MINGW32__)
+  res = fseeko64(fp, (int64_t)ofs, opt);
+#else
+  res = fseek(fp, (long)ofs, opt);
+#endif
+  if (res)
+    return io_pushresult(L, 0, NULL);
+#if defined(LUA_USE_POSIX)
+  ofs = cast_num(ftello(fp));
+#elif _MSC_VER >= 1400
+  ofs = cast_num(_ftelli64(fp));
+#elif defined(__MINGW32__)
+  ofs = cast_num(ftello64(fp));
+#else
+  ofs = cast_num(ftell(fp));
+#endif
+  setnumV(L->top-1, ofs);  /* Return the resulting offset. */
+  return 1;
+}
+
+/* f:setvbuf("full"|"line"|"no" [,size]) */
+LJLIB_CF(io_method_setvbuf)
+{
+  FILE *fp = io_tofile(L);
+  int opt = lj_lib_checkopt(L, 2, -1, "\4full\4line\2no");
+  size_t sz = (size_t)lj_lib_optint(L, 3, LUAL_BUFFERSIZE);
+  if (opt == 0) opt = _IOFBF;
+  else if (opt == 1) opt = _IOLBF;
+  else if (opt == 2) opt = _IONBF;
+  return io_pushresult(L, (setvbuf(fp, NULL, opt, sz) == 0), NULL);
+}
+
+/* Forward declaration. */
+static void io_file_lines(lua_State *L, int idx, int toclose);
+
+/* f:lines(): iterator over lines; does not close the file when done. */
+LJLIB_CF(io_method_lines)
+{
+  io_tofile(L);
+  io_file_lines(L, 1, 0);
+  return 1;
+}
+
+/* __gc: close the file if still open (finalizer-safe: NULL check first). */
+LJLIB_CF(io_method___gc)
+{
+  FILE *fp = *io_tofilep(L);
+  if (fp != NULL) io_file_close(L);
+  return 0;
+}
+
+/* __tostring: "file (0x...)" or "file (closed)". */
+LJLIB_CF(io_method___tostring)
+{
+  FILE *fp = *io_tofilep(L);
+  if (fp == NULL)
+    lua_pushliteral(L, "file (closed)");
+  else
+    lua_pushfstring(L, "file (%p)", fp);
+  return 1;
+}
+
+LJLIB_PUSH(top-1) LJLIB_SET(__index)
+
+#include "lj_libdef.h"
+
+/* -- I/O library functions ----------------------------------------------- */
+
+#define LJLIB_MODULE_io
+
+LJLIB_PUSH(top-2) LJLIB_SET(!)  /* Set environment. */
+
+/* Fetch the default input/output FILE* from slot findex of the current
+** function's environment table; error if it was closed. */
+static FILE *io_file_get(lua_State *L, int findex)
+{
+  GCtab *fenv = tabref(curr_func(L)->c.env);
+  GCudata *ud = udataV(&tvref(fenv->array)[findex]);
+  FILE *fp = *(FILE **)uddata(ud);
+  if (fp == NULL)
+    lj_err_caller(L, LJ_ERR_IOSTDCL);
+  return fp;
+}
+
+/* io.open(fname [,mode]): returns handle, or nil+message+errno. */
+LJLIB_CF(io_open)
+{
+  const char *fname = luaL_checkstring(L, 1);
+  const char *mode = luaL_optstring(L, 2, "r");
+  FILE **pf = io_file_new(L);  /* Userdata first: GC-safe if fopen fails. */
+  *pf = fopen(fname, mode);
+  return (*pf == NULL) ? io_pushresult(L, 0, fname) : 1;
+}
+
+/* io.tmpfile() */
+LJLIB_CF(io_tmpfile)
+{
+  FILE **pf = io_file_new(L);
+  *pf = tmpfile();
+  return (*pf == NULL) ? io_pushresult(L, 0, NULL) : 1;
+}
+
+/* io.close([f]): same as f:close(); defaults to the default output. */
+LJLIB_CF(io_close)
+{
+  return lj_cf_io_method_close(L);
+}
+
+/* io.read(...): read from the default input. */
+LJLIB_CF(io_read)
+{
+  return io_file_read(L, io_file_get(L, IO_INPUT), 0);
+}
+
+/* io.write(...): write to the default output. */
+LJLIB_CF(io_write)
+{
+  return io_file_write(L, io_file_get(L, IO_OUTPUT), 0);
+}
+
+/* io.flush(): flush the default output. */
+LJLIB_CF(io_flush)
+{
+  return io_pushresult(L, fflush(io_file_get(L, IO_OUTPUT)) == 0, NULL);
+}
+
+/* Iterator returned by io.lines()/f:lines(). Upvalue 1 = file userdata,
+** upvalue 2 = boolean "close at EOF" (true for io.lines(fname)). */
+LJLIB_NOREG LJLIB_CF(io_lines_iter)
+{
+  FILE *fp = *(FILE **)uddata(udataV(lj_lib_upvalue(L, 1)));
+  int ok;
+  if (fp == NULL)
+    lj_err_caller(L, LJ_ERR_IOCLFL);
+  ok = io_file_readline(L, fp);
+  if (ferror(fp))
+    return luaL_error(L, "%s", strerror(errno));
+  if (ok)
+    return 1;
+  if (tvistrue(lj_lib_upvalue(L, 2))) {  /* Need to close file? */
+    L->top = L->base+1;
+    setudataV(L, L->base, udataV(lj_lib_upvalue(L, 1)));
+    io_file_close(L);
+  }
+  return 0;  /* End of iteration. */
+}
+
+/* Build the lines iterator closure over the file at stack index idx. */
+static void io_file_lines(lua_State *L, int idx, int toclose)
+{
+  lua_pushvalue(L, idx);
+  lua_pushboolean(L, toclose);
+  lua_pushcclosure(L, lj_cf_io_lines_iter, 2);
+  funcV(L->top-1)->c.ffid = FF_io_lines_iter;  /* Mark for the VM/recorder. */
+}
+
+/* io.lines([fname]): iterate default input, or open fname and close it
+** automatically when the iterator reaches EOF. */
+LJLIB_CF(io_lines)
+{
+  if (lua_isnoneornil(L, 1)) {  /* no arguments? */
+    /* will iterate over default input */
+    lua_rawgeti(L, LUA_ENVIRONINDEX, IO_INPUT);
+    return lj_cf_io_method_lines(L);
+  } else {
+    const char *fname = luaL_checkstring(L, 1);
+    FILE **pf = io_file_new(L);
+    *pf = fopen(fname, "r");
+    if (*pf == NULL)
+      io_file_error(L, 1, fname);
+    io_file_lines(L, lua_gettop(L), 1);  /* toclose=1: close at EOF. */
+    return 1;
+  }
+}
+
+/* Shared body of io.input()/io.output(): with an argument (filename or
+** open handle) set the default stream; always return the current one. */
+static int io_std_get(lua_State *L, int fp, const char *mode)
+{
+  if (!lua_isnoneornil(L, 1)) {
+    const char *fname = lua_tostring(L, 1);
+    if (fname) {
+      FILE **pf = io_file_new(L);
+      *pf = fopen(fname, mode);
+      if (*pf == NULL)
+	io_file_error(L, 1, fname);
+    } else {
+      io_tofile(L);  /* check that it's a valid file handle */
+      lua_pushvalue(L, 1);
+    }
+    lua_rawseti(L, LUA_ENVIRONINDEX, fp);  /* Store as new default. */
+  }
+  /* return current value */
+  lua_rawgeti(L, LUA_ENVIRONINDEX, fp);
+  return 1;
+}
+
+LJLIB_CF(io_input)
+{
+  return io_std_get(L, IO_INPUT, "r");
+}
+
+LJLIB_CF(io_output)
+{
+  return io_std_get(L, IO_OUTPUT, "w");
+}
+
+/* io.type(o): "file", "closed file", or nil if not a file handle.
+** Identified by comparing metatables against LUA_FILEHANDLE. */
+LJLIB_CF(io_type)
+{
+  void *ud;
+  luaL_checkany(L, 1);
+  ud = lua_touserdata(L, 1);
+  lua_getfield(L, LUA_REGISTRYINDEX, LUA_FILEHANDLE);
+  if (ud == NULL || !lua_getmetatable(L, 1) || !lua_rawequal(L, -2, -1))
+    lua_pushnil(L);  /* not a file */
+  else if (*((FILE **)ud) == NULL)
+    lua_pushliteral(L, "closed file");
+  else
+    lua_pushliteral(L, "file");
+  return 1;
+}
+
+LJLIB_PUSH(top-3) LJLIB_SET(!)  /* Set environment. */
+
+/* io.popen(cmd [,mode]): pipe handle whose __close uses pclose.
+** Errors on platforms without popen support. */
+LJLIB_CF(io_popen)
+{
+#if defined(LUA_USE_POSIX) || defined(LUA_USE_WIN)
+  const char *fname = luaL_checkstring(L, 1);
+  const char *mode = luaL_optstring(L, 2, "r");
+  FILE **pf = io_file_new(L);
+#ifdef LUA_USE_POSIX
+  fflush(NULL);  /* Flush all streams before forking the child. */
+  *pf = popen(fname, mode);
+#else
+  *pf = _popen(fname, mode);
+#endif
+  return (*pf == NULL) ? io_pushresult(L, 0, fname) : 1;
+#else
+  luaL_error(L, LUA_QL("popen") " not supported");
+#endif
+}
+
+#include "lj_libdef.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* Wrap a C stdio stream as a file userdata, link it to the environment
+** table on the stack, optionally register it as default stream k, and
+** set it as field fname of the io table. */
+static void io_std_new(lua_State *L, FILE *fp, int k, const char *fname)
+{
+  FILE **pf = io_file_new(L);
+  GCudata *ud = udataV(L->top-1);
+  GCtab *envt = tabV(L->top-2);
+  *pf = fp;
+  setgcref(ud->env, obj2gco(envt));
+  lj_gc_objbarrier(L, obj2gco(ud), envt);  /* ud may be black already. */
+  if (k > 0) {
+    lua_pushvalue(L, -1);
+    lua_rawseti(L, -5, k);  /* Register as default input/output. */
+  }
+  lua_setfield(L, -3, fname);
+}
+
+/* Create a file environment table with the given __close function. */
+static void io_fenv_new(lua_State *L, int narr, lua_CFunction cls)
+{
+  lua_createtable(L, narr, 1);
+  lua_pushcfunction(L, cls);
+  lua_setfield(L, -2, "__close");
+}
+
+/* Open the io library: methods table, pipe/file/std environments,
+** and the stdin/stdout/stderr handles. */
+LUALIB_API int luaopen_io(lua_State *L)
+{
+  LJ_LIB_REG_(L, NULL, io_method);
+  lua_setfield(L, LUA_REGISTRYINDEX, LUA_FILEHANDLE);
+  io_fenv_new(L, 0, lj_cf_io_pipe_close);  /* top-3 */
+  io_fenv_new(L, 2, lj_cf_io_file_close);  /* top-2 */
+  LJ_LIB_REG(L, io);
+  io_fenv_new(L, 0, lj_cf_io_std_close);
+  io_std_new(L, stdin, IO_INPUT, "stdin");
+  io_std_new(L, stdout, IO_OUTPUT, "stdout");
+  io_std_new(L, stderr, 0, "stderr");
+  lua_pop(L, 1);
+  return 1;
+}
+

+ 589 - 0
src/lib_jit.c

@@ -0,0 +1,589 @@
+/*
+** JIT library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lib_jit_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_arch.h"
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#if LJ_HASJIT
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#endif
+#include "lj_dispatch.h"
+#include "lj_vm.h"
+#include "lj_vmevent.h"
+#include "lj_lib.h"
+
+#include "luajit.h"
+
+/* -- jit.* functions ----------------------------------------------------- */
+
+#define LJLIB_MODULE_jit
+
+/* Shared body of jit.on/off/flush: decode the (func|proto|true [,bool])
+** argument forms into a luaJIT_setmode() call. */
+static int setjitmode(lua_State *L, int mode)
+{
+  int idx = 0;
+  if (L->base == L->top || tvisnil(L->base)) {  /* jit.on/off/flush([nil]) */
+    mode |= LUAJIT_MODE_ENGINE;
+  } else {
+    /* jit.on/off/flush(func|proto, nil|true|false) */
+    if (tvisfunc(L->base) || tvisproto(L->base))
+      idx = 1;
+    else if (!tvistrue(L->base))  /* jit.on/off/flush(true, nil|true|false) */
+      goto err;
+    if (L->base+1 < L->top && tvisbool(L->base+1))
+      mode |= boolV(L->base+1) ? LUAJIT_MODE_ALLFUNC : LUAJIT_MODE_ALLSUBFUNC;
+    else
+      mode |= LUAJIT_MODE_FUNC;
+  }
+  if (luaJIT_setmode(L, idx, mode) != 1) {
+  err:
+#if LJ_HASJIT
+    lj_err_arg(L, 1, LJ_ERR_NOLFUNC);
+#else
+    lj_err_caller(L, LJ_ERR_NOJIT);  /* No JIT compiled in at all. */
+#endif
+  }
+  return 0;
+}
+
+LJLIB_CF(jit_on)
+{
+  return setjitmode(L, LUAJIT_MODE_ON);
+}
+
+LJLIB_CF(jit_off)
+{
+  return setjitmode(L, LUAJIT_MODE_OFF);
+}
+
+/* jit.flush([traceno]): flush one trace by number, or use setjitmode. */
+LJLIB_CF(jit_flush)
+{
+#if LJ_HASJIT
+  if (L->base < L->top && (tvisnum(L->base) || tvisstr(L->base))) {
+    int traceno = lj_lib_checkint(L, 1);
+    luaJIT_setmode(L, traceno, LUAJIT_MODE_FLUSH|LUAJIT_MODE_TRACE);
+    return 0;
+  }
+#endif
+  return setjitmode(L, LUAJIT_MODE_FLUSH);
+}
+
+#if LJ_HASJIT
+/* Push a string for every flag bit that is set. */
+/* str is a sequence of length-prefixed names: each entry is one length
+** byte followed by that many characters. */
+static void flagbits_to_strings(lua_State *L, uint32_t flags, uint32_t base,
+				const char *str)
+{
+  for (; *str; base <<= 1, str += 1+*str)
+    if (flags & base)
+      setstrV(L, L->top++, lj_str_new(L, str+1, *(uint8_t *)str));
+}
+#endif
+
+/* jit.status(): true/false plus one string per active CPU/opt flag. */
+LJLIB_CF(jit_status)
+{
+#if LJ_HASJIT
+  jit_State *J = L2J(L);
+  L->top = L->base;  /* Discard arguments. */
+  setboolV(L->top++, (J->flags & JIT_F_ON) ? 1 : 0);
+  flagbits_to_strings(L, J->flags, JIT_F_CPU_FIRST, JIT_F_CPUSTRING);
+  flagbits_to_strings(L, J->flags, JIT_F_OPT_FIRST, JIT_F_OPTSTRING);
+  return L->top - L->base;
+#else
+  setboolV(L->top++, 0);
+  return 1;
+#endif
+}
+
+/* jit.attach(func [,event]): attach func to a VM event, or detach func
+** from all events when no event name is given. */
+LJLIB_CF(jit_attach)
+{
+#ifdef LUAJIT_DISABLE_VMEVENT
+  luaL_error(L, "vmevent API disabled");
+#else
+  GCfunc *fn = lj_lib_checkfunc(L, 1);
+  GCstr *s = lj_lib_optstr(L, 2);
+  luaL_findtable(L, LUA_REGISTRYINDEX, LJ_VMEVENTS_REGKEY, LJ_VMEVENTS_HSIZE);
+  if (s) {  /* Attach to given event. */
+    lua_pushvalue(L, 1);
+    lua_rawseti(L, -2, VMEVENT_HASHIDX(s->hash));
+    G(L)->vmevmask = VMEVENT_NOCACHE;  /* Invalidate cache. */
+  } else {  /* Detach if no event given. */
+    setnilV(L->top++);
+    while (lua_next(L, -2)) {
+      L->top--;  /* Drop value, keep key for next iteration. */
+      if (tvisfunc(L->top) && funcV(L->top) == fn) {
+	setnilV(lj_tab_set(L, tabV(L->top-2), L->top-1));
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+LJLIB_PUSH(top-4) LJLIB_SET(arch)
+LJLIB_PUSH(top-3) LJLIB_SET(version_num)
+LJLIB_PUSH(top-2) LJLIB_SET(version)
+
+#include "lj_libdef.h"
+
+/* -- jit.util.* functions ------------------------------------------------ */
+
+#define LJLIB_MODULE_jit_util
+
+/* -- Reflection API for Lua functions ------------------------------------ */
+
+/* Return prototype of first argument (Lua function or prototype object) */
+/* Return prototype of first argument (Lua function or prototype object) */
+/* With nolua set, a C function yields NULL instead of an error. */
+static GCproto *check_Lproto(lua_State *L, int nolua)
+{
+  TValue *o = L->base;
+  if (L->top > o) {
+    if (tvisproto(o)) {
+      return protoV(o);
+    } else if (tvisfunc(o)) {
+      if (isluafunc(funcV(o)))
+	return funcproto(funcV(o));
+      else if (nolua)
+	return NULL;
+    }
+  }
+  lj_err_argt(L, 1, LUA_TFUNCTION);
+  return NULL;  /* unreachable */
+}
+
+/* Set an integer field of table t. */
+static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val)
+{
+  setintV(lj_tab_setstr(L, t, lj_str_newz(L, name)), val);
+}
+
+/* local info = jit.util.funcinfo(func [,pc]) */
+/* local info = jit.util.funcinfo(func [,pc]) */
+/* Lua functions get full prototype info; C functions only ffid/upvalues. */
+LJLIB_CF(jit_util_funcinfo)
+{
+  GCproto *pt = check_Lproto(L, 1);
+  if (pt) {
+    BCPos pc = (BCPos)lj_lib_optint(L, 2, 0);
+    GCtab *t;
+    lua_createtable(L, 0, 16);  /* Increment hash size if fields are added. */
+    t = tabV(L->top-1);
+    setintfield(L, t, "linedefined", pt->linedefined);
+    setintfield(L, t, "lastlinedefined", pt->lastlinedefined);
+    setintfield(L, t, "stackslots", pt->framesize);
+    setintfield(L, t, "params", pt->numparams);
+    setintfield(L, t, "bytecodes", (int32_t)pt->sizebc);
+    setintfield(L, t, "gcconsts", (int32_t)pt->sizekgc);
+    setintfield(L, t, "nconsts", (int32_t)pt->sizekn);
+    setintfield(L, t, "upvalues", (int32_t)pt->sizeuv);
+    if (pc > 0)
+      setintfield(L, t, "currentline", pt->lineinfo ? pt->lineinfo[pc-1] : 0);
+    lua_pushboolean(L, (pt->flags & PROTO_IS_VARARG));
+    lua_setfield(L, -2, "isvararg");
+    setstrV(L, L->top++, pt->chunkname);
+    lua_setfield(L, -2, "source");
+    lj_err_pushloc(L, pt, pc);
+    lua_setfield(L, -2, "loc");
+  } else {
+    GCfunc *fn = funcV(L->base);  /* C function: minimal info. */
+    GCtab *t;
+    lua_createtable(L, 0, 2);  /* Increment hash size if fields are added. */
+    t = tabV(L->top-1);
+    setintfield(L, t, "ffid", fn->c.ffid);
+    setintfield(L, t, "upvalues", fn->c.nupvalues);
+  }
+  return 1;
+}
+
+/* local ins, m = jit.util.funcbc(func, pc) */
+/* Returns the raw bytecode instruction and its operand-mode byte, or
+** nothing when pc is out of range (1-based from Lua). */
+LJLIB_CF(jit_util_funcbc)
+{
+  GCproto *pt = check_Lproto(L, 0);
+  BCPos pc = (BCPos)lj_lib_checkint(L, 2) - 1;
+  if (pc < pt->sizebc) {  /* Unsigned compare also rejects negative pc. */
+    BCIns ins = pt->bc[pc];
+    BCOp op = bc_op(ins);
+    lua_assert(op < BC__MAX);
+    setintV(L->top, ins);
+    setintV(L->top+1, lj_bc_mode[op]);
+    L->top += 2;
+    return 2;
+  }
+  return 0;
+}
+
+/* local k = jit.util.funck(func, idx) */
+/* Non-negative idx indexes number constants, negative (bitwise-inverted)
+** idx indexes GC constants. */
+LJLIB_CF(jit_util_funck)
+{
+  GCproto *pt = check_Lproto(L, 0);
+  MSize idx = (MSize)lj_lib_checkint(L, 2);
+  if ((int32_t)idx >= 0) {
+    if (idx < pt->sizekn) {
+      setnumV(L->top-1, pt->k.n[idx]);
+      return 1;
+    }
+  } else {
+    if (~idx < pt->sizekgc) {
+      GCobj *gc = gcref(pt->k.gc[idx]);  /* Negative index from the k array. */
+      setgcV(L, L->top-1, &gc->gch, ~gc->gch.gct);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* local name = jit.util.funcuvname(func, idx) */
+LJLIB_CF(jit_util_funcuvname)
+{
+  GCproto *pt = check_Lproto(L, 0);
+  uint32_t idx = (uint32_t)lj_lib_checkint(L, 2);
+  if (idx < pt->sizeuvname) {
+    setstrV(L, L->top-1, pt->uvname[idx]);
+    return 1;
+  }
+  return 0;
+}
+
+/* -- Reflection API for traces ------------------------------------------- */
+
+#if LJ_HASJIT
+
+/* Check trace argument. Must not throw for non-existent trace numbers. */
+static Trace *jit_checktrace(lua_State *L)
+{
+  TraceNo tr = (TraceNo)lj_lib_checkint(L, 1);
+  jit_State *J = L2J(L);
+  if (tr > 0 && tr < J->sizetrace)
+    return J->trace[tr];  /* May still be NULL for an empty trace slot. */
+  return NULL;
+}
+
+/* local info = jit.util.traceinfo(tr) */
+/* Return a table with basic statistics for an existing trace. */
+LJLIB_CF(jit_util_traceinfo)
+{
+  Trace *T = jit_checktrace(L);
+  if (T) {
+    GCtab *t;
+    lua_createtable(L, 0, 4);  /* Increment hash size if fields are added. */
+    t = tabV(L->top-1);
+    setintfield(L, t, "nins", (int32_t)T->nins - REF_BIAS - 1);  /* IR refs are biased by REF_BIAS. */
+    setintfield(L, t, "nk", REF_BIAS - (int32_t)T->nk);
+    setintfield(L, t, "link", T->link);
+    setintfield(L, t, "nexit", T->nsnap);
+    /* There are many more fields. Add them only when needed. */
+    return 1;
+  }
+  return 0;
+}
+
+/* local m, ot, op1, op2, prev = jit.util.traceir(tr, idx) */
+/*
+** Return mode, opcode/type and operands of IR instruction idx of a trace.
+** Operands with reference mode are de-biased by REF_BIAS. The two argument
+** stack slots are reused for the first two results, three more are pushed.
+*/
+LJLIB_CF(jit_util_traceir)
+{
+  Trace *T = jit_checktrace(L);
+  IRRef ref = (IRRef)lj_lib_checkint(L, 2) + REF_BIAS;
+  if (T && ref >= REF_BIAS && ref < T->nins) {
+    IRIns *ir = &T->ir[ref];
+    int32_t m = lj_ir_mode[ir->o];
+    setintV(L->top-2, m);
+    setintV(L->top-1, ir->ot);
+    setintV(L->top++, (int32_t)ir->op1 - (irm_op1(m)==IRMref ? REF_BIAS : 0));
+    setintV(L->top++, (int32_t)ir->op2 - (irm_op2(m)==IRMref ? REF_BIAS : 0));
+    setintV(L->top++, ir->prev);
+    return 5;
+  }
+  return 0;
+}
+
+/* local k, t [, slot] = jit.util.tracek(tr, idx) */
+/*
+** Return an IR constant of a trace. Constants live below the bias
+** (T->nk <= ref < REF_BIAS). A KSLOT constant is unwrapped to the
+** underlying constant plus its slot number as a third result.
+*/
+LJLIB_CF(jit_util_tracek)
+{
+  Trace *T = jit_checktrace(L);
+  IRRef ref = (IRRef)lj_lib_checkint(L, 2) + REF_BIAS;
+  if (T && ref >= T->nk && ref < REF_BIAS) {
+    IRIns *ir = &T->ir[ref];
+    int32_t slot = -1;  /* -1 means "not a KSLOT". */
+    if (ir->o == IR_KSLOT) {
+      slot = ir->op2;
+      ir = &T->ir[ir->op1];
+    }
+    lj_ir_kvalue(L, L->top-2, ir);
+    setintV(L->top-1, (int32_t)irt_type(ir->t));
+    if (slot == -1)
+      return 2;
+    setintV(L->top++, slot);
+    return 3;
+  }
+  return 0;
+}
+
+/* local snap = jit.util.tracesnap(tr, sn) */
+/*
+** Return snapshot sn of a trace as an array: [0] holds the de-biased IR
+** ref of the snapshot itself, [1..nslots] hold the de-biased ref per
+** stack slot, or false for unused slots.
+*/
+LJLIB_CF(jit_util_tracesnap)
+{
+  Trace *T = jit_checktrace(L);
+  SnapNo sn = (SnapNo)lj_lib_checkint(L, 2);
+  if (T && sn < T->nsnap) {
+    SnapShot *snap = &T->snap[sn];
+    IRRef2 *map = &T->snapmap[snap->mapofs];
+    BCReg s, nslots = snap->nslots;
+    GCtab *t;
+    lua_createtable(L, nslots ? (int)nslots : 1, 0);
+    t = tabV(L->top-1);
+    setintV(lj_tab_setint(L, t, 0), (int32_t)snap->ref - REF_BIAS);
+    for (s = 0; s < nslots; s++) {
+      TValue *o = lj_tab_setint(L, t, (int32_t)(s+1));
+      IRRef ref = snap_ref(map[s]);
+      if (ref)
+	setintV(o, (int32_t)ref - REF_BIAS);
+      else
+	setboolV(o, 0);  /* Unused slot. */
+    }
+    return 1;
+  }
+  return 0;
+}
+
+/* local mcode, addr, loop = jit.util.tracemc(tr) */
+/* Return machine code as a string, its start address and the loop offset. */
+LJLIB_CF(jit_util_tracemc)
+{
+  Trace *T = jit_checktrace(L);
+  if (T && T->mcode != NULL) {
+    setstrV(L, L->top-1, lj_str_new(L, (const char *)T->mcode, T->szmcode));
+    setnumV(L->top++, cast_num((intptr_t)T->mcode));  /* Address as a number. */
+    setintV(L->top++, T->mcloop);
+    return 3;
+  }
+  return 0;
+}
+
+/* local addr = jit.util.traceexitstub(idx) */
+/* Return the machine code address of exit stub idx. */
+LJLIB_CF(jit_util_traceexitstub)
+{
+  ExitNo exitno = (ExitNo)lj_lib_checkint(L, 1);
+  jit_State *J = L2J(L);
+  if (exitno < EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR) {
+    setnumV(L->top-1, cast_num((intptr_t)exitstub_addr(J, exitno)));
+    return 1;
+  }
+  return 0;
+}
+
+#else
+
+/* Shared stub for all trace reflection functions when the JIT is absent. */
+static int trace_nojit(lua_State *L)
+{
+  UNUSED(L);
+  return 0;  /* Return no results instead of throwing. */
+}
+#define lj_cf_jit_util_traceinfo	trace_nojit
+#define lj_cf_jit_util_traceir		trace_nojit
+#define lj_cf_jit_util_tracek		trace_nojit
+#define lj_cf_jit_util_tracesnap	trace_nojit
+#define lj_cf_jit_util_tracemc		trace_nojit
+#define lj_cf_jit_util_traceexitstub	trace_nojit
+
+#endif
+
+#include "lj_libdef.h"
+
+/* -- jit.opt module ------------------------------------------------------ */
+
+#define LJLIB_MODULE_jit_opt
+
+#if LJ_HASJIT
+/* Parse optimization level. */
+/* A single digit '0'..'9' selects a preset flag set ('3'..'9' map to -O3). */
+static int jitopt_level(jit_State *J, const char *str)
+{
+  if (str[0] >= '0' && str[0] <= '9' && str[1] == '\0') {
+    uint32_t flags;
+    if (str[0] == '0') flags = JIT_F_OPT_0;
+    else if (str[0] == '1') flags = JIT_F_OPT_1;
+    else if (str[0] == '2') flags = JIT_F_OPT_2;
+    else flags = JIT_F_OPT_3;
+    J->flags = (J->flags & ~JIT_F_OPT_MASK) | flags;
+    return 1;  /* Ok. */
+  }
+  return 0;  /* No match. */
+}
+
+/* Parse optimization flag. */
+/*
+** Accepts "name", "+name", "-name", "noname" and "no-name". JIT_F_OPTSTRING
+** is a concatenation of length-prefixed names, one per flag bit starting
+** at JIT_F_OPT_FIRST; a zero length byte terminates the list.
+*/
+static int jitopt_flag(jit_State *J, const char *str)
+{
+  const char *lst = JIT_F_OPTSTRING;
+  uint32_t opt;
+  int set = 1;
+  if (str[0] == '+') {
+    str++;
+  } else if (str[0] == '-') {
+    str++;
+    set = 0;
+  } else if (str[0] == 'n' && str[1] == 'o') {
+    str += str[2] == '-' ? 3 : 2;
+    set = 0;
+  }
+  for (opt = JIT_F_OPT_FIRST; ; opt <<= 1) {
+    size_t len = *(const uint8_t *)lst;  /* Length byte precedes each name. */
+    if (len == 0)
+      break;
+    if (strncmp(str, lst+1, len) == 0 && str[len] == '\0') {
+      if (set) J->flags |= opt; else J->flags &= ~opt;
+      return 1;  /* Ok. */
+    }
+    lst += 1+len;
+  }
+  return 0;  /* No match. */
+}
+
+/* Forward declaration. */
+static void jit_init_hotcount(jit_State *J);
+
+/* Parse optimization parameter. */
+/*
+** JIT_P_STRING is a concatenation of length-prefixed parameter names in
+** the same order as the J->param indices. Accepts "name=<number>".
+*/
+static int jitopt_param(jit_State *J, const char *str)
+{
+  const char *lst = JIT_P_STRING;
+  int i;
+  for (i = 0; i < JIT_P__MAX; i++) {
+    size_t len = *(const uint8_t *)lst;
+    TValue tv;
+    lua_assert(len != 0);
+    if (strncmp(str, lst+1, len) == 0 && str[len] == '=' &&
+	lj_str_numconv(&str[len+1], &tv)) {
+      J->param[i] = lj_num2int(tv.n);
+      if (i == JIT_P_hotloop)
+	jit_init_hotcount(J);  /* Hot counters must be reseeded right away. */
+      return 1;  /* Ok. */
+    }
+    lst += 1+len;
+  }
+  return 0;  /* No match. */
+}
+#endif
+
+/* jit.opt.start(flags...) */
+/*
+** Each argument is tried as an optimization level, flag or parameter.
+** Called with no arguments, the default optimization flags are restored.
+*/
+LJLIB_CF(jit_opt_start)
+{
+#if LJ_HASJIT
+  jit_State *J = L2J(L);
+  int nargs = (int)(L->top - L->base);
+  if (nargs == 0) {
+    J->flags = (J->flags & ~JIT_F_OPT_MASK) | JIT_F_OPT_DEFAULT;
+  } else {
+    int i;
+    for (i = 1; i <= nargs; i++) {
+      const char *str = strdata(lj_lib_checkstr(L, i));
+      if (!jitopt_level(J, str) &&
+	  !jitopt_flag(J, str) &&
+	  !jitopt_param(J, str))
+	lj_err_callerv(L, LJ_ERR_JITOPT, str);  /* Unknown option: error. */
+    }
+  }
+#else
+  lj_err_caller(L, LJ_ERR_NOJIT);
+#endif
+  return 0;
+}
+
+#include "lj_libdef.h"
+
+/* -- JIT compiler initialization ----------------------------------------- */
+
+#if LJ_HASJIT
+/* Default values for JIT parameters. Generated from JIT_PARAMDEF. */
+static const int32_t jit_param_default[JIT_P__MAX+1] = {
+#define JIT_PARAMINIT(len, name, value)	(value),
+JIT_PARAMDEF(JIT_PARAMINIT)
+#undef JIT_PARAMINIT
+  0
+};
+
+/* Initialize hotcount table. */
+/* Reset every hot counter to the current hotloop threshold. */
+static void jit_init_hotcount(jit_State *J)
+{
+  HotCount start = (HotCount)J->param[JIT_P_hotloop];
+  HotCount *hotcount = J2GG(J)->hotcount;
+  uint32_t i;
+  for (i = 0; i < HOTCOUNT_SIZE; i++)
+    hotcount[i] = start;
+}
+#endif
+
+/* Arch-dependent CPU detection. */
+/*
+** Uses CPUID leaf 0 (vendor string) and leaf 1 (feature bits): EDX bit 15
+** = CMOV, EDX bit 26 = SSE2, ECX bit 19 = SSE4.1. Vendor/family checks
+** tune code generation for specific microarchitectures.
+*/
+static uint32_t jit_cpudetect(lua_State *L)
+{
+  uint32_t flags = 0;
+#if LJ_TARGET_X86ORX64
+  uint32_t vendor[4];
+  uint32_t features[4];
+  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
+#if !LJ_HASJIT
+#define JIT_F_CMOV	1
+#endif
+    flags |= ((features[3] >> 15)&1) * JIT_F_CMOV;
+#if LJ_HASJIT
+    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
+    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
+    if (vendor[2] == 0x6c65746e) {  /* "ntel" tail of "GenuineIntel". */
+      if ((features[0] & 0x0ff00f00) == 0x00000f00)  /* P4. */
+	flags |= JIT_F_P4;  /* Currently unused. */
+      else if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
+	flags |= JIT_F_LEA_AGU;
+    } else if (vendor[2] == 0x444d4163) {  /* "cAMD" tail of "AuthenticAMD". */
+      uint32_t fam = (features[0] & 0x0ff00f00);
+      if (fam == 0x00000f00)  /* K8. */
+	flags |= JIT_F_SPLIT_XMM;
+      if (fam >= 0x00000f00)  /* K8, K10. */
+	flags |= JIT_F_PREFER_IMUL;
+    }
+#endif
+  }
+#ifndef LUAJIT_CPU_NOCMOV
+  if (!(flags & JIT_F_CMOV))
+    luaL_error(L, "Ancient CPU lacks CMOV support (recompile with -DLUAJIT_CPU_NOCMOV)");
+#endif
+#if LJ_HASJIT
+  if (!(flags & JIT_F_SSE2))
+    luaL_error(L, "Sorry, SSE2 CPU support required for this beta release");
+#endif
+  UNUSED(L);
+#else
+#error "Missing CPU detection for this architecture"
+#endif
+  return flags;
+}
+
+/* Initialize JIT compiler. */
+static void jit_init(lua_State *L)
+{
+  uint32_t flags = jit_cpudetect(L);  /* May throw on unsupported CPUs. */
+#if LJ_HASJIT
+  jit_State *J = L2J(L);
+  J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;  /* JIT on by default. */
+  memcpy(J->param, jit_param_default, sizeof(J->param));
+  jit_init_hotcount(J);
+  lj_dispatch_update(G(L));
+#else
+  UNUSED(flags);
+#endif
+}
+
+/* Open the jit library and register the jit, jit.util and jit.opt modules. */
+LUALIB_API int luaopen_jit(lua_State *L)
+{
+  lua_pushliteral(L, LJ_ARCH_NAME);  /* Pushed values are presumably consumed by LJ_LIB_REG as module fields -- confirm in lj_lib.h. */
+  lua_pushinteger(L, LUAJIT_VERSION_NUM);
+  lua_pushliteral(L, LUAJIT_VERSION);
+  LJ_LIB_REG(L, jit);
+#ifndef LUAJIT_DISABLE_JITUTIL
+  LJ_LIB_REG_(L, "jit.util", jit_util);
+#endif
+  LJ_LIB_REG_(L, "jit.opt", jit_opt);
+  L->top -= 2;
+  jit_init(L);
+  return 1;
+}
+

+ 188 - 0
src/lib_math.c

@@ -0,0 +1,188 @@
+/*
+** Math library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#include <math.h>
+
+#define lib_math_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_math
+
+/*
+** The functions below are implemented in the assembler VM (LJLIB_ASM);
+** the C bodies only type-check arguments and return FFH_RETRY to retry
+** the fast function. LJLIB_REC(...) tells the JIT recorder how to record
+** each function. These declarations are processed by the buildvm library
+** generator -- keep the exact LJLIB_* line format.
+*/
+LJLIB_ASM(math_abs)		LJLIB_REC(.)
+{
+  lj_lib_checknum(L, 1);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(math_floor)		LJLIB_REC(math_round IRFPM_FLOOR)
+LJLIB_ASM_(math_ceil)		LJLIB_REC(math_round IRFPM_CEIL)
+LJLIB_ASM_(math_sqrt)		LJLIB_REC(math_unary IRFPM_SQRT)
+LJLIB_ASM_(math_log)		LJLIB_REC(math_unary IRFPM_LOG)
+LJLIB_ASM_(math_log10)		LJLIB_REC(math_unary IRFPM_LOG10)
+LJLIB_ASM_(math_exp)		LJLIB_REC(math_unary IRFPM_EXP)
+LJLIB_ASM_(math_sin)		LJLIB_REC(math_unary IRFPM_SIN)
+LJLIB_ASM_(math_cos)		LJLIB_REC(math_unary IRFPM_COS)
+LJLIB_ASM_(math_tan)		LJLIB_REC(math_unary IRFPM_TAN)
+LJLIB_ASM_(math_asin)		LJLIB_REC(math_atrig FF_math_asin)
+LJLIB_ASM_(math_acos)		LJLIB_REC(math_atrig FF_math_acos)
+LJLIB_ASM_(math_atan)		LJLIB_REC(math_atrig FF_math_atan)
+LJLIB_ASM_(math_sinh)
+LJLIB_ASM_(math_cosh)
+LJLIB_ASM_(math_tanh)
+LJLIB_ASM_(math_frexp)
+LJLIB_ASM_(math_modf)		LJLIB_REC(.)
+
+LJLIB_PUSH(57.29577951308232)  /* 180/pi: upvalue for math.deg. */
+LJLIB_ASM_(math_deg)		LJLIB_REC(math_degrad)
+
+LJLIB_PUSH(0.017453292519943295)  /* pi/180: upvalue for math.rad. */
+LJLIB_ASM_(math_rad)		LJLIB_REC(math_degrad)
+
+LJLIB_ASM(math_atan2)		LJLIB_REC(math_binary IR_ATAN2)
+{
+  lj_lib_checknum(L, 1);
+  lj_lib_checknum(L, 2);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(math_ldexp)		LJLIB_REC(math_binary IR_LDEXP)
+LJLIB_ASM_(math_pow)		LJLIB_REC(.)
+LJLIB_ASM_(math_fmod)
+
+/* math.min/max take a variable number of arguments; check all of them. */
+LJLIB_ASM(math_min)		LJLIB_REC(math_minmax IR_MIN)
+{
+  int i = 0;
+  do { lj_lib_checknum(L, ++i); } while (L->base+i < L->top);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(math_max)		LJLIB_REC(math_minmax IR_MAX)
+
+LJLIB_PUSH(3.14159265358979323846) LJLIB_SET(pi)
+LJLIB_PUSH(1e310) LJLIB_SET(huge)  /* 1e310 overflows to +inf. */
+
+#ifdef __MACH__
+/* OSX wrappers for the assembler VM -- presumably needed for Mach-O symbol
+** resolution; confirm against the VM build files.
+*/
+LJ_FUNCA double lj_wrapper_sinh(double x) { return sinh(x); }
+LJ_FUNCA double lj_wrapper_cosh(double x) { return cosh(x); }
+LJ_FUNCA double lj_wrapper_tanh(double x) { return tanh(x); }
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+/* This implements a Tausworthe PRNG with period 2^223. Based on:
+**   Tables of maximally-equidistributed combined LFSR generators,
+**   Pierre L'Ecuyer, 1991, table 3, 1st entry.
+** Full-period ME-CF generator with L=64, J=4, k=223, N1=49.
+*/
+
+/* PRNG state. */
+typedef struct TW223State {
+  uint64_t gen[4];	/* State of the 4 LFSR generators. */
+  int valid;		/* State is valid. */
+} TW223State;
+
+/* Union needed for bit-pattern conversion between uint64_t and double. */
+typedef union { uint64_t u64; double d; } U64double;
+
+/* Update generator i and compute a running xor of all states. */
+/* k = effective state bits, q and s = shift amounts of LFSR i. */
+#define TW223_GEN(i, k, q, s) \
+  z = tw->gen[i]; \
+  z = (((z<<q)^z) >> (k-s)) ^ ((z&((uint64_t)(int64_t)-1 << (64-k)))<<s); \
+  r ^= z; tw->gen[i] = z;
+
+/* PRNG step function. Returns a double in the range 0.0 <= d < 1.0. */
+static double tw223_step(TW223State *tw)
+{
+  uint64_t z, r = 0;
+  U64double u;
+  TW223_GEN(0, 63, 31, 18)
+  TW223_GEN(1, 58, 19, 28)
+  TW223_GEN(2, 55, 24,  7)
+  TW223_GEN(3, 47, 21,  8)
+  /* Stuff 52 random mantissa bits under biased exponent 0x3ff: yields a
+  ** double in [1.0, 2.0). Subtracting 1.0 maps it to [0.0, 1.0).
+  */
+  u.u64 = (r & (((uint64_t)1 << 52)-1)) | ((uint64_t)0x3ff << 52);
+#if defined(__GNUC__) && LJ_TARGET_X86 && __pic__
+  /* Compensate for unbelievable GCC pessimization. */
+  {
+    volatile U64double u1;
+    u1.u64 = (uint64_t)0x3ff << 52;  /* Fix: was 0x3f8 (encodes 2^-7); must be 1.0 to match the #else branch and the contract above. */
+    return u.d - u1.d;
+  }
+#else
+  return u.d - 1.0;
+#endif
+}
+
+/* PRNG initialization function. */
+/*
+** Derives the 4 generator states from the numeric seed d via a simple
+** recurrence, forces the significant MSBs to be non-zero, then discards
+** 10 warm-up steps to mix the state.
+*/
+static void tw223_init(TW223State *tw, double d)
+{
+  uint32_t r = 0x11090601;  /* 64-k[i] as four 8 bit constants. */
+  int i;
+  for (i = 0; i < 4; i++) {
+    U64double u;
+    uint32_t m = 1u << (r&255);
+    r >>= 8;
+    u.d = d = d * 3.14159265358979323846 + 2.7182818284590452354;
+    if (u.u64 < m) u.u64 += m;  /* Ensure k[i] MSB of gen[i] are non-zero. */
+    tw->gen[i] = u.u64;
+  }
+  tw->valid = 1;
+  for (i = 0; i < 10; i++)
+    tw223_step(tw);  /* Discard initial outputs. */
+}
+
+/* PRNG extract function. */
+/*
+** math.random([m [,n]]): no args -> double in [0,1); one arg -> integer
+** in [1,m]; two args -> integer in [m,n]. State is seeded lazily.
+*/
+LJLIB_PUSH(top-2)  /* Upvalue holds userdata with TW223State. */
+LJLIB_CF(math_random)
+{
+  int n = cast_int(L->top - L->base);
+  TW223State *tw = (TW223State *)(uddata(udataV(lj_lib_upvalue(L, 1))));
+  double d;
+  if (LJ_UNLIKELY(!tw->valid)) tw223_init(tw, 0.0);  /* Lazy default seed. */
+  d = tw223_step(tw);
+  if (n > 0) {
+    double r1 = lj_lib_checknum(L, 1);
+    if (n == 1) {
+      d = floor(d*r1) + 1.0;  /* d is an int in range [1, r1] */
+    } else {
+      double r2 = lj_lib_checknum(L, 2);
+      d = floor(d*(r2-r1+1.0)) + r1;  /* d is an int in range [r1, r2] */
+    }
+  }  /* else: d is a double in range [0, 1] */
+  setnumV(L->top++, d);
+  return 1;
+}
+
+/* PRNG seed function. */
+LJLIB_PUSH(top-2)  /* Upvalue holds userdata with TW223State. */
+LJLIB_CF(math_randomseed)
+{
+  TW223State *tw = (TW223State *)(uddata(udataV(lj_lib_upvalue(L, 1))));
+  tw223_init(tw, lj_lib_checknum(L, 1));  /* Re-seed and mark state valid. */
+  return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* Open the math library. The PRNG state userdata is created first, so that
+** LJLIB_PUSH(top-2) can reference it as an upvalue of random/randomseed.
+*/
+LUALIB_API int luaopen_math(lua_State *L)
+{
+  TW223State *tw;
+  tw = (TW223State *)lua_newuserdata(L, sizeof(TW223State));
+  tw->valid = 0;  /* Use lazy initialization to save some time on startup. */
+  LJ_LIB_REG(L, math);
+#if defined(LUA_COMPAT_MOD)
+  lua_getfield(L, -1, "fmod");  /* math.mod is the Lua 5.0 name for fmod. */
+  lua_setfield(L, -2, "mod");
+#endif
+  return 1;
+}
+

+ 249 - 0
src/lib_os.c

@@ -0,0 +1,249 @@
+/*
+** OS library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#include <errno.h>
+#include <locale.h>
+#include <time.h>
+
+#define lib_os_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#ifdef LUA_USE_POSIX
+#include <unistd.h>
+#else
+#include <stdio.h>
+#endif
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_os
+
+/* Push result of an OS call: true, or nil + "file: strerror" + errno. */
+static int os_pushresult(lua_State *L, int i, const char *filename)
+{
+  int en = errno;  /* calls to Lua API may change this value */
+  if (i) {
+    setboolV(L->top-1, 1);  /* Overwrites the top stack slot in place. */
+    return 1;
+  } else {
+    setnilV(L->top-1);
+    lua_pushfstring(L, "%s: %s", filename, strerror(en));
+    lua_pushinteger(L, en);
+    return 3;
+  }
+}
+
+/* os.execute([cmd]): raw system() status; system(NULL) probes for a shell. */
+LJLIB_CF(os_execute)
+{
+  lua_pushinteger(L, system(luaL_optstring(L, 1, NULL)));
+  return 1;
+}
+
+/* os.remove(filename) */
+LJLIB_CF(os_remove)
+{
+  const char *filename = luaL_checkstring(L, 1);
+  return os_pushresult(L, remove(filename) == 0, filename);
+}
+
+/* os.rename(from, to): error message (if any) names the source file. */
+LJLIB_CF(os_rename)
+{
+  const char *fromname = luaL_checkstring(L, 1);
+  const char *toname = luaL_checkstring(L, 2);
+  return os_pushresult(L, rename(fromname, toname) == 0, fromname);
+}
+
+/* os.tmpname(): return a fresh temporary file name. */
+LJLIB_CF(os_tmpname)
+{
+#ifdef LUA_USE_POSIX
+  char buf[15+1];  /* Length of the template below plus terminator. */
+  int fp;
+  strcpy(buf, "/tmp/lua_XXXXXX");
+  fp = mkstemp(buf);
+  if (fp != -1)
+    close(fp);  /* NOTE(review): file is closed again, so reuse of the name by the caller is still racy. */
+  else
+    lj_err_caller(L, LJ_ERR_OSUNIQF);
+#else
+  char buf[L_tmpnam];
+  if (tmpnam(buf) == NULL)  /* NOTE(review): tmpnam is inherently race-prone. */
+    lj_err_caller(L, LJ_ERR_OSUNIQF);
+#endif
+  lua_pushstring(L, buf);
+  return 1;
+}
+
+/* os.getenv(name) */
+LJLIB_CF(os_getenv)
+{
+  lua_pushstring(L, getenv(luaL_checkstring(L, 1)));  /* if NULL push nil */
+  return 1;
+}
+
+/* os.exit([status]): terminates the process; the Lua state is not closed. */
+LJLIB_CF(os_exit)
+{
+  exit(lj_lib_optint(L, 1, EXIT_SUCCESS));
+  return 0;  /* to avoid warnings */
+}
+
+/* os.clock(): CPU time used by the program, in seconds. */
+LJLIB_CF(os_clock)
+{
+  setnumV(L->top++, ((lua_Number)clock())*(1.0/(lua_Number)CLOCKS_PER_SEC));
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Set an integer field t[key] = value in the table at the stack top. */
+static void setfield(lua_State *L, const char *key, int value)
+{
+  lua_pushinteger(L, value);
+  lua_setfield(L, -2, key);
+}
+
+/* Set a boolean field; value < 0 means "undefined" and leaves it unset. */
+static void setboolfield(lua_State *L, const char *key, int value)
+{
+  if (value < 0)  /* undefined? */
+    return;  /* does not set field */
+  lua_pushboolean(L, value);
+  lua_setfield(L, -2, key);
+}
+
+/* Get a boolean field from the table at the stack top; -1 if absent. */
+static int getboolfield(lua_State *L, const char *key)
+{
+  int res;
+  lua_getfield(L, -1, key);
+  res = lua_isnil(L, -1) ? -1 : lua_toboolean(L, -1);
+  lua_pop(L, 1);
+  return res;
+}
+
+/* Get an integer field; d is the default, d < 0 marks the field mandatory. */
+static int getfield(lua_State *L, const char *key, int d)
+{
+  int res;
+  lua_getfield(L, -1, key);
+  if (lua_isnumber(L, -1)) {
+    res = (int)lua_tointeger(L, -1);
+  } else {
+    if (d < 0)
+      lj_err_callerv(L, LJ_ERR_OSDATEF, key);  /* Missing mandatory field. */
+    res = d;
+  }
+  lua_pop(L, 1);
+  return res;
+}
+
+/* os.date([format [,time]]): a leading '!' selects UTC; format "*t"
+** returns a table of broken-down fields; anything else is expanded via
+** strftime, one %-specifier at a time.
+*/
+LJLIB_CF(os_date)
+{
+  const char *s = luaL_optstring(L, 1, "%c");
+  time_t t = luaL_opt(L, (time_t)luaL_checknumber, 2, time(NULL));
+  struct tm *stm;
+  if (*s == '!') {  /* UTC? */
+    stm = gmtime(&t);
+    s++;  /* skip `!' */
+  } else {
+    stm = localtime(&t);
+  }
+  if (stm == NULL) {  /* invalid date? */
+    setnilV(L->top-1);
+  } else if (strcmp(s, "*t") == 0) {
+    lua_createtable(L, 0, 9);  /* 9 = number of fields */
+    setfield(L, "sec", stm->tm_sec);
+    setfield(L, "min", stm->tm_min);
+    setfield(L, "hour", stm->tm_hour);
+    setfield(L, "day", stm->tm_mday);
+    setfield(L, "month", stm->tm_mon+1);  /* tm months are 0-based. */
+    setfield(L, "year", stm->tm_year+1900);
+    setfield(L, "wday", stm->tm_wday+1);
+    setfield(L, "yday", stm->tm_yday+1);
+    setboolfield(L, "isdst", stm->tm_isdst);
+  } else {
+    char cc[3];
+    luaL_Buffer b;
+    cc[0] = '%'; cc[2] = '\0';  /* Two-char format "%X" passed to strftime. */
+    luaL_buffinit(L, &b);
+    for (; *s; s++) {
+      if (*s != '%' || *(s + 1) == '\0') {  /* no conversion specifier? */
+	luaL_addchar(&b, *s);
+      } else {
+	size_t reslen;
+	char buff[200];  /* should be big enough for any conversion result */
+	cc[1] = *(++s);
+	reslen = strftime(buff, sizeof(buff), cc, stm);
+	luaL_addlstring(&b, buff, reslen);
+      }
+    }
+    luaL_pushresult(&b);
+  }
+  return 1;
+}
+
+/* os.time([table]): current time, or the time described by a date table
+** ("day", "month", "year" are mandatory; see getfield). Returns nil if
+** mktime cannot represent the date.
+*/
+LJLIB_CF(os_time)
+{
+  time_t t;
+  if (lua_isnoneornil(L, 1)) {  /* called without args? */
+    t = time(NULL);  /* get current time */
+  } else {
+    struct tm ts;
+    luaL_checktype(L, 1, LUA_TTABLE);
+    lua_settop(L, 1);  /* make sure table is at the top */
+    ts.tm_sec = getfield(L, "sec", 0);
+    ts.tm_min = getfield(L, "min", 0);
+    ts.tm_hour = getfield(L, "hour", 12);
+    ts.tm_mday = getfield(L, "day", -1);
+    ts.tm_mon = getfield(L, "month", -1) - 1;
+    ts.tm_year = getfield(L, "year", -1) - 1900;
+    ts.tm_isdst = getboolfield(L, "isdst");  /* -1 lets mktime decide. */
+    t = mktime(&ts);
+  }
+  if (t == (time_t)(-1))
+    lua_pushnil(L);
+  else
+    lua_pushnumber(L, (lua_Number)t);
+  return 1;
+}
+
+/* os.difftime(t2 [,t1]): difference in seconds. */
+LJLIB_CF(os_difftime)
+{
+  lua_pushnumber(L, difftime((time_t)(luaL_checknumber(L, 1)),
+			     (time_t)(luaL_optnumber(L, 2, (lua_Number)0))));
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* os.setlocale([locale [,category]]) */
+LJLIB_CF(os_setlocale)
+{
+  GCstr *s = lj_lib_optstr(L, 1);
+  const char *str = s ? strdata(s) : NULL;  /* NULL queries the current locale. */
+  int opt = lj_lib_checkopt(L, 2, 6,  /* Length-prefixed option names; default index 6 = "all". */
+    "\5ctype\7numeric\4time\7collate\10monetary\1\377\3all");
+  if (opt == 0) opt = LC_CTYPE;
+  else if (opt == 1) opt = LC_NUMERIC;
+  else if (opt == 2) opt = LC_TIME;
+  else if (opt == 3) opt = LC_COLLATE;
+  else if (opt == 4) opt = LC_MONETARY;
+  else if (opt == 6) opt = LC_ALL;  /* Index 5 is the "\1\377" filler entry, which can never match. */
+  lua_pushstring(L, setlocale(opt, str));
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* Open the os library. */
+LUALIB_API int luaopen_os(lua_State *L)
+{
+  LJ_LIB_REG(L, os);
+  return 1;
+}

+ 508 - 0
src/lib_package.c

@@ -0,0 +1,508 @@
+/*
+** Package library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lib_package_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* Error codes for ll_loadfunc. */
+#define PACKAGE_ERR_LIB		1
+#define PACKAGE_ERR_FUNC	2
+
+/* Redefined in platform specific part. */
+#define PACKAGE_LIB_FAIL	"open"
+#define setprogdir(L)		((void)0)
+
+#if defined(LUA_DL_DLOPEN)
+
+#include <dlfcn.h>
+
+/* Unload a shared library (POSIX dlopen backend). */
+static void ll_unloadlib(void *lib)
+{
+  dlclose(lib);
+}
+
+/* Load a shared library; on failure pushes dlerror() and returns NULL. */
+static void *ll_load(lua_State *L, const char *path)
+{
+  void *lib = dlopen(path, RTLD_NOW);
+  if (lib == NULL) lua_pushstring(L, dlerror());
+  return lib;
+}
+
+/* Look up a symbol; on failure pushes dlerror() and returns NULL. */
+static lua_CFunction ll_sym(lua_State *L, void *lib, const char *sym)
+{
+  lua_CFunction f = (lua_CFunction)dlsym(lib, sym);
+  if (f == NULL) lua_pushstring(L, dlerror());
+  return f;
+}
+
+#elif defined(LUA_DL_DLL)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+#undef setprogdir
+
+/* Substitute the executable's directory for LUA_EXECDIR in the string
+** at the stack top (Windows backend).
+*/
+static void setprogdir(lua_State *L)
+{
+  char buff[MAX_PATH + 1];
+  char *lb;
+  DWORD nsize = sizeof(buff);
+  DWORD n = GetModuleFileNameA(NULL, buff, nsize);
+  if (n == 0 || n == nsize || (lb = strrchr(buff, '\\')) == NULL) {
+    luaL_error(L, "unable to get ModuleFileName");
+  } else {
+    *lb = '\0';  /* Truncate to the directory part. */
+    luaL_gsub(L, lua_tostring(L, -1), LUA_EXECDIR, buff);
+    lua_remove(L, -2);  /* remove original string */
+  }
+}
+
+/* Push a message for the last Windows API error. */
+static void pusherror(lua_State *L)
+{
+  DWORD error = GetLastError();
+  char buffer[128];
+  if (FormatMessageA(FORMAT_MESSAGE_IGNORE_INSERTS | FORMAT_MESSAGE_FROM_SYSTEM,
+      NULL, error, 0, buffer, sizeof(buffer), NULL))
+    lua_pushstring(L, buffer);
+  else
+    lua_pushfstring(L, "system error %d\n", error);
+}
+
+/* Unload a DLL (Windows backend). */
+static void ll_unloadlib(void *lib)
+{
+  FreeLibrary((HINSTANCE)lib);
+}
+
+/* Load a DLL; on failure pushes an error message and returns NULL. */
+static void *ll_load(lua_State *L, const char *path)
+{
+  HINSTANCE lib = LoadLibraryA(path);
+  if (lib == NULL) pusherror(L);
+  return lib;
+}
+
+/* Look up an exported symbol; on failure pushes an error, returns NULL. */
+static lua_CFunction ll_sym(lua_State *L, void *lib, const char *sym)
+{
+  lua_CFunction f = (lua_CFunction)GetProcAddress((HINSTANCE)lib, sym);
+  if (f == NULL) pusherror(L);
+  return f;
+}
+
+#else
+
+#undef PACKAGE_LIB_FAIL
+#define PACKAGE_LIB_FAIL	"absent"
+
+#define DLMSG	"dynamic libraries not enabled; check your Lua installation"
+
+/* Fallback stubs when dynamic loading is not available: loading always
+** fails with the fixed DLMSG message.
+*/
+static void ll_unloadlib(void *lib)
+{
+  (void)lib;
+}
+
+static void *ll_load(lua_State *L, const char *path)
+{
+  (void)path;
+  lua_pushliteral(L, DLMSG);
+  return NULL;
+}
+
+static lua_CFunction ll_sym(lua_State *L, void *lib, const char *sym)
+{
+  (void)lib; (void)sym;
+  lua_pushliteral(L, DLMSG);
+  return NULL;
+}
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+/* Get or create the registry entry "LOADLIB: path" holding the library
+** handle. The userdata gets the _LOADLIB metatable -- presumably so a
+** __gc metamethod can unload the library; confirm in the library setup.
+*/
+static void **ll_register(lua_State *L, const char *path)
+{
+  void **plib;
+  lua_pushfstring(L, "LOADLIB: %s", path);
+  lua_gettable(L, LUA_REGISTRYINDEX);  /* check library in registry? */
+  if (!lua_isnil(L, -1)) {  /* is there an entry? */
+    plib = (void **)lua_touserdata(L, -1);
+  } else {  /* no entry yet; create one */
+    lua_pop(L, 1);
+    plib = (void **)lua_newuserdata(L, sizeof(void *));
+    *plib = NULL;
+    luaL_getmetatable(L, "_LOADLIB");
+    lua_setmetatable(L, -2);
+    lua_pushfstring(L, "LOADLIB: %s", path);
+    lua_pushvalue(L, -2);
+    lua_settable(L, LUA_REGISTRYINDEX);
+  }
+  return plib;
+}
+
+/* Load a library and push the function sym from it. Returns 0 on success,
+** else PACKAGE_ERR_LIB/PACKAGE_ERR_FUNC with the message on the stack.
+*/
+static int ll_loadfunc(lua_State *L, const char *path, const char *sym)
+{
+  void **reg = ll_register(L, path);
+  if (*reg == NULL) *reg = ll_load(L, path);  /* Load once, then cache. */
+  if (*reg == NULL) {
+    return PACKAGE_ERR_LIB;  /* unable to load library */
+  } else {
+    lua_CFunction f = ll_sym(L, *reg, sym);
+    if (f == NULL)
+      return PACKAGE_ERR_FUNC;  /* unable to find function */
+    lua_pushcfunction(L, f);
+    return 0;  /* return function */
+  }
+}
+
+/* package.loadlib(path, initfunc) */
+static int lj_cf_package_loadlib(lua_State *L)
+{
+  const char *path = luaL_checkstring(L, 1);
+  const char *init = luaL_checkstring(L, 2);
+  int stat = ll_loadfunc(L, path, init);
+  if (stat == 0) {  /* no errors? */
+    return 1;  /* return the loaded function */
+  } else {  /* error; error message is on stack top */
+    lua_pushnil(L);
+    lua_insert(L, -2);
+    lua_pushstring(L, (stat == PACKAGE_ERR_LIB) ?  PACKAGE_LIB_FAIL : "init");
+    return 3;  /* return nil, error message, and where */
+  }
+}
+
+/* Unload the library held by a _LOADLIB userdata and mark it closed.
+** Presumably registered as the __gc metamethod -- confirm in luaopen.
+*/
+static int lj_cf_package_unloadlib(lua_State *L)
+{
+  void **lib = (void **)luaL_checkudata(L, 1, "_LOADLIB");
+  if (*lib) ll_unloadlib(*lib);
+  *lib = NULL;  /* mark library as closed */
+  return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Check whether a file exists and is readable. */
+static int readable(const char *filename)
+{
+  FILE *f = fopen(filename, "r");  /* try to open file */
+  if (f == NULL) return 0;  /* open failed */
+  fclose(f);
+  return 1;
+}
+
+/* Push the next separator-delimited template of path, or NULL at the end. */
+static const char *pushnexttemplate(lua_State *L, const char *path)
+{
+  const char *l;
+  while (*path == *LUA_PATHSEP) path++;  /* skip separators */
+  if (*path == '\0') return NULL;  /* no more templates */
+  l = strchr(path, *LUA_PATHSEP);  /* find next separator */
+  if (l == NULL) l = path + strlen(path);
+  lua_pushlstring(L, path, (size_t)(l - path));  /* template */
+  return l;
+}
+
+/* Search the templates of package.<pname> for module name ('.' mapped to
+** the directory separator). Returns the first readable file name (left on
+** the stack), or NULL with an accumulated "no file ..." message on the
+** stack.
+*/
+static const char *findfile(lua_State *L, const char *name,
+			    const char *pname)
+{
+  const char *path;
+  name = luaL_gsub(L, name, ".", LUA_DIRSEP);
+  lua_getfield(L, LUA_ENVIRONINDEX, pname);
+  path = lua_tostring(L, -1);
+  if (path == NULL)
+    luaL_error(L, LUA_QL("package.%s") " must be a string", pname);
+  lua_pushliteral(L, "");  /* error accumulator */
+  while ((path = pushnexttemplate(L, path)) != NULL) {
+    const char *filename;
+    filename = luaL_gsub(L, lua_tostring(L, -1), LUA_PATH_MARK, name);
+    lua_remove(L, -2);  /* remove path template */
+    if (readable(filename))  /* does file exist and is readable? */
+      return filename;  /* return that file name */
+    lua_pushfstring(L, "\n\tno file " LUA_QS, filename);
+    lua_remove(L, -2);  /* remove file name */
+    lua_concat(L, 2);  /* add entry to possible error message */
+  }
+  return NULL;  /* not found */
+}
+
+/* Raise a loader error for the module (arg 1) loaded from filename. */
+static void loaderror(lua_State *L, const char *filename)
+{
+  luaL_error(L, "error loading module " LUA_QS " from file " LUA_QS ":\n\t%s",
+	     lua_tostring(L, 1), filename, lua_tostring(L, -1));
+}
+
+/* Loader: find a Lua source file via package.path and compile it. */
+static int lj_cf_package_loader_lua(lua_State *L)
+{
+  const char *filename;
+  const char *name = luaL_checkstring(L, 1);
+  filename = findfile(L, name, "path");
+  if (filename == NULL) return 1;  /* library not found in this path */
+  if (luaL_loadfile(L, filename) != 0)
+    loaderror(L, filename);
+  return 1;  /* library loaded successfully */
+}
+
+/* Build "luaopen_<modname>" with '.' mapped to '_'; everything up to and
+** including a LUA_IGMARK character is ignored.
+*/
+static const char *mkfuncname(lua_State *L, const char *modname)
+{
+  const char *funcname;
+  const char *mark = strchr(modname, *LUA_IGMARK);
+  if (mark) modname = mark + 1;
+  funcname = luaL_gsub(L, modname, ".", "_");
+  funcname = lua_pushfstring(L, "luaopen_%s", funcname);
+  lua_remove(L, -2);  /* remove 'gsub' result */
+  return funcname;
+}
+
+/* Loader: find a C library via package.cpath, resolve its init function. */
+static int lj_cf_package_loader_c(lua_State *L)
+{
+  const char *funcname;
+  const char *name = luaL_checkstring(L, 1);
+  const char *filename = findfile(L, name, "cpath");
+  if (filename == NULL) return 1;  /* library not found in this path */
+  funcname = mkfuncname(L, name);
+  if (ll_loadfunc(L, filename, funcname) != 0)
+    loaderror(L, filename);
+  return 1;  /* library loaded successfully */
+}
+
+/* Loader: for "a.b.c", load the library of root "a" and look up the
+** submodule's init function luaopen_a_b_c inside it.
+*/
+static int lj_cf_package_loader_croot(lua_State *L)
+{
+  const char *funcname;
+  const char *filename;
+  const char *name = luaL_checkstring(L, 1);
+  const char *p = strchr(name, '.');
+  int stat;
+  if (p == NULL) return 0;  /* is root */
+  lua_pushlstring(L, name, (size_t)(p - name));
+  filename = findfile(L, lua_tostring(L, -1), "cpath");
+  if (filename == NULL) return 1;  /* root not found */
+  funcname = mkfuncname(L, name);
+  if ((stat = ll_loadfunc(L, filename, funcname)) != 0) {
+    if (stat != PACKAGE_ERR_FUNC) loaderror(L, filename);  /* real error */
+    lua_pushfstring(L, "\n\tno module " LUA_QS " in file " LUA_QS,
+		    name, filename);
+    return 1;  /* function not found */
+  }
+  return 1;
+}
+
+/* Loader: look up the module in the package.preload table. */
+static int lj_cf_package_loader_preload(lua_State *L)
+{
+  const char *name = luaL_checkstring(L, 1);
+  lua_getfield(L, LUA_ENVIRONINDEX, "preload");
+  if (!lua_istable(L, -1))
+    luaL_error(L, LUA_QL("package.preload") " must be a table");
+  lua_getfield(L, -1, name);
+  if (lua_isnil(L, -1))  /* not found? */
+    lua_pushfstring(L, "\n\tno field package.preload['%s']", name);
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Unique address used to mark modules that are currently being loaded. */
+static const int sentinel_ = 0;
+#define sentinel	((void *)&sentinel_)
+
+/* require(name): standard Lua require with recursion detection via the
+** sentinel value in _LOADED.
+*/
+static int lj_cf_package_require(lua_State *L)
+{
+  const char *name = luaL_checkstring(L, 1);
+  int i;
+  lua_settop(L, 1);  /* _LOADED table will be at index 2 */
+  lua_getfield(L, LUA_REGISTRYINDEX, "_LOADED");
+  lua_getfield(L, 2, name);
+  if (lua_toboolean(L, -1)) {  /* is it there? */
+    if (lua_touserdata(L, -1) == sentinel)  /* check loops */
+      luaL_error(L, "loop or previous error loading module " LUA_QS, name);
+    return 1;  /* package is already loaded */
+  }
+  /* else must load it; iterate over available loaders */
+  lua_getfield(L, LUA_ENVIRONINDEX, "loaders");
+  if (!lua_istable(L, -1))
+    luaL_error(L, LUA_QL("package.loaders") " must be a table");
+  lua_pushliteral(L, "");  /* error message accumulator */
+  for (i = 1; ; i++) {
+    lua_rawgeti(L, -2, i);  /* get a loader */
+    if (lua_isnil(L, -1))  /* ran out of loaders: report all messages */
+      luaL_error(L, "module " LUA_QS " not found:%s",
+		 name, lua_tostring(L, -2));
+    lua_pushstring(L, name);
+    lua_call(L, 1, 1);  /* call it */
+    if (lua_isfunction(L, -1))  /* did it find module? */
+      break;  /* module loaded successfully */
+    else if (lua_isstring(L, -1))  /* loader returned error message? */
+      lua_concat(L, 2);  /* accumulate it */
+    else
+      lua_pop(L, 1);
+  }
+  lua_pushlightuserdata(L, sentinel);
+  lua_setfield(L, 2, name);  /* _LOADED[name] = sentinel */
+  lua_pushstring(L, name);  /* pass name as argument to module */
+  lua_call(L, 1, 1);  /* run loaded module */
+  if (!lua_isnil(L, -1))  /* non-nil return? */
+    lua_setfield(L, 2, name);  /* _LOADED[name] = returned value */
+  lua_getfield(L, 2, name);
+  if (lua_touserdata(L, -1) == sentinel) {   /* module did not set a value? */
+    lua_pushboolean(L, 1);  /* use true as result */
+    lua_pushvalue(L, -1);  /* extra copy to be returned */
+    lua_setfield(L, 2, name);  /* _LOADED[name] = true */
+  }
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Set the environment of the Lua function that called module() to the
+** module table at stack slot -2. Errors out for C callers, since their
+** environment cannot meaningfully be redirected.
+*/
+static void setfenv(lua_State *L)
+{
+  lua_Debug ar;
+  if (lua_getstack(L, 1, &ar) == 0 ||
+      lua_getinfo(L, "f", &ar) == 0 ||  /* get calling function */
+      lua_iscfunction(L, -1))
+    luaL_error(L, LUA_QL("module") " not called from a Lua function");
+  lua_pushvalue(L, -2);
+  lua_setfenv(L, -2);
+  lua_pop(L, 1);
+}
+
+/* Apply the option functions (e.g. package.seeall) passed as extra
+** arguments 2..n of module() to the module table on top of the stack.
+*/
+static void dooptions(lua_State *L, int n)
+{
+  int i;
+  for (i = 2; i <= n; i++) {
+    lua_pushvalue(L, i);  /* get option (a function) */
+    lua_pushvalue(L, -2);  /* module */
+    lua_call(L, 1, 0);
+  }
+}
+
+/* Initialize a fresh module table (on top of the stack) with the
+** standard fields _M, _NAME and _PACKAGE.
+*/
+static void modinit(lua_State *L, const char *modname)
+{
+  const char *dot;
+  lua_pushvalue(L, -1);
+  lua_setfield(L, -2, "_M");  /* module._M = module */
+  lua_pushstring(L, modname);
+  lua_setfield(L, -2, "_NAME");
+  dot = strrchr(modname, '.');  /* look for last dot in module name */
+  if (dot == NULL) dot = modname; else dot++;
+  /* set _PACKAGE as package name (full module name minus last part) */
+  lua_pushlstring(L, modname, (size_t)(dot - modname));
+  lua_setfield(L, -2, "_PACKAGE");
+}
+
+/* module(name, ...): find or create the module table, register it in
+** _LOADED and as a global, initialize it on first use, redirect the
+** caller's environment to it and apply any option functions.
+*/
+static int lj_cf_package_module(lua_State *L)
+{
+  const char *modname = luaL_checkstring(L, 1);
+  int loaded = lua_gettop(L) + 1;  /* index of _LOADED table */
+  lua_getfield(L, LUA_REGISTRYINDEX, "_LOADED");
+  lua_getfield(L, loaded, modname);  /* get _LOADED[modname] */
+  if (!lua_istable(L, -1)) {  /* not found? */
+    lua_pop(L, 1);  /* remove previous result */
+    /* try global variable (and create one if it does not exist) */
+    if (luaL_findtable(L, LUA_GLOBALSINDEX, modname, 1) != NULL)
+      lj_err_callerv(L, LJ_ERR_BADMODN, modname);
+    lua_pushvalue(L, -1);
+    lua_setfield(L, loaded, modname);  /* _LOADED[modname] = new table */
+  }
+  /* check whether table already has a _NAME field */
+  lua_getfield(L, -1, "_NAME");
+  if (!lua_isnil(L, -1)) {  /* is table an initialized module? */
+    lua_pop(L, 1);
+  } else {  /* no; initialize it */
+    lua_pop(L, 1);
+    modinit(L, modname);
+  }
+  lua_pushvalue(L, -1);
+  setfenv(L);
+  dooptions(L, loaded - 1);
+  return 0;
+}
+
+/* package.seeall(module): make globals visible inside the module by
+** pointing the module metatable's __index at _G (creating the
+** metatable if the module table has none).
+*/
+static int lj_cf_package_seeall(lua_State *L)
+{
+  luaL_checktype(L, 1, LUA_TTABLE);
+  if (!lua_getmetatable(L, 1)) {
+    lua_createtable(L, 0, 1); /* create new metatable */
+    lua_pushvalue(L, -1);
+    lua_setmetatable(L, 1);
+  }
+  lua_pushvalue(L, LUA_GLOBALSINDEX);
+  lua_setfield(L, -2, "__index");  /* mt.__index = _G */
+  return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Temporary marker used while substituting ";;" with the default path. */
+#define AUXMARK		"\1"
+
+/* Set package.<fieldname> from environment variable envname, falling
+** back to def. Any ";;" in the environment value is expanded to
+** ";<default path>;" via the AUXMARK two-step substitution.
+*/
+static void setpath(lua_State *L, const char *fieldname, const char *envname,
+		    const char *def)
+{
+  const char *path = getenv(envname);
+  if (path == NULL) {
+    lua_pushstring(L, def);
+  } else {
+    path = luaL_gsub(L, path, LUA_PATHSEP LUA_PATHSEP,
+			      LUA_PATHSEP AUXMARK LUA_PATHSEP);
+    luaL_gsub(L, path, AUXMARK, def);
+    lua_remove(L, -2);
+  }
+  setprogdir(L);
+  lua_setfield(L, -2, fieldname);
+}
+
+/* Functions registered in the package table itself. */
+static const luaL_Reg package_lib[] = {
+  { "loadlib",	lj_cf_package_loadlib },
+  { "seeall",	lj_cf_package_seeall },
+  { NULL, NULL }
+};
+
+/* Functions registered as globals. */
+static const luaL_Reg package_global[] = {
+  { "module",	lj_cf_package_module },
+  { "require",	lj_cf_package_require },
+  { NULL, NULL }
+};
+
+/* Default loader chain for require(), tried in order. */
+static const lua_CFunction package_loaders[] =
+{
+  lj_cf_package_loader_preload,
+  lj_cf_package_loader_lua,
+  lj_cf_package_loader_c,
+  lj_cf_package_loader_croot,
+  NULL
+};
+
+/* Open the package library: set up the _LOADLIB metatable (with __gc
+** for library handles), the loaders array, path/cpath from the
+** environment, and the loaded/preload tables.
+*/
+LUALIB_API int luaopen_package(lua_State *L)
+{
+  int i;
+  luaL_newmetatable(L, "_LOADLIB");
+  lua_pushcfunction(L, lj_cf_package_unloadlib);
+  lua_setfield(L, -2, "__gc");
+  luaL_register(L, LUA_LOADLIBNAME, package_lib);
+  lua_pushvalue(L, -1);
+  lua_replace(L, LUA_ENVIRONINDEX);
+  lua_createtable(L, sizeof(package_loaders)/sizeof(package_loaders[0])-1, 0);
+  for (i = 0; package_loaders[i] != NULL; i++) {
+    lua_pushcfunction(L, package_loaders[i]);
+    lua_rawseti(L, -2, i+1);
+  }
+  lua_setfield(L, -2, "loaders");
+  setpath(L, "path", LUA_PATH, LUA_PATH_DEFAULT);
+  setpath(L, "cpath", LUA_CPATH, LUA_CPATH_DEFAULT);
+  lua_pushliteral(L, LUA_PATH_CONFIG);
+  lua_setfield(L, -2, "config");
+  luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 16);
+  lua_setfield(L, -2, "loaded");
+  lua_newtable(L);
+  lua_setfield(L, -2, "preload");
+  lua_pushvalue(L, LUA_GLOBALSINDEX);
+  luaL_register(L, NULL, package_global);
+  lua_pop(L, 1);
+  return 1;
+}
+

+ 790 - 0
src/lib_string.c

@@ -0,0 +1,790 @@
+/*
+** String library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#include <stdio.h>
+
+#define lib_string_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_state.h"
+#include "lj_ff.h"
+#include "lj_ctype.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_string
+
+/* C fallback for string.len: only validates the argument, then retries
+** the assembler fast path. */
+LJLIB_ASM(string_len)		LJLIB_REC(.)
+{
+  lj_lib_checkstr(L, 1);
+  return FFH_RETRY;
+}
+
+/* C fallback for string.byte: clamps [start,stop] to the string and
+** pushes one integer per byte, starting at L->base-1 (the FFH result
+** slot convention). */
+LJLIB_ASM(string_byte)		LJLIB_REC(string_range 0)
+{
+  GCstr *s = lj_lib_checkstr(L, 1);
+  int32_t len = (int32_t)s->len;
+  int32_t start = lj_lib_optint(L, 2, 1);
+  int32_t stop = lj_lib_optint(L, 3, start);
+  int32_t n, i;
+  const unsigned char *p;
+  if (stop < 0) stop += len+1;
+  if (start < 0) start += len+1;
+  if (start <= 0) start = 1;
+  if (stop > len) stop = len;
+  if (start > stop) return FFH_RES(0);  /* Empty interval: return no results. */
+  start--;
+  n = stop - start;
+  if ((uint32_t)n > LUAI_MAXCSTACK)
+    lj_err_caller(L, LJ_ERR_STRSLC);
+  lj_state_checkstack(L, (MSize)n);
+  p = (const unsigned char *)strdata(s) + start;
+  for (i = 0; i < n; i++)
+    setintV(L->base + i-1, p[i]);
+  return FFH_RES(n);
+}
+
+/* C fallback for string.char: builds the string in the global temp
+** buffer; each argument must be an integer in [0,255]. */
+LJLIB_ASM(string_char)
+{
+  int i, nargs = cast_int(L->top - L->base);
+  char *buf = lj_str_needbuf(L, &G(L)->tmpbuf, (size_t)nargs);
+  for (i = 1; i <= nargs; i++) {
+    int32_t k = lj_lib_checkint(L, i);
+    if (!checku8(k))
+      lj_err_arg(L, i, LJ_ERR_BADVAL);
+    buf[i-1] = (char)k;
+  }
+  setstrV(L, L->base-1, lj_str_new(L, buf, (size_t)nargs));
+  return FFH_RES(1);
+}
+
+/* C fallback for string.sub: normalizes the arguments (default end
+** index -1) and retries the assembler fast path. */
+LJLIB_ASM(string_sub)		LJLIB_REC(string_range 1)
+{
+  lj_lib_checkstr(L, 1);
+  lj_lib_checkint(L, 2);
+  setintV(L->base+2, lj_lib_optint(L, 3, -1));
+  return FFH_RETRY;
+}
+
+/* C fallback for string.rep: the single-char / empty cases are handled
+** by the assembler code after the buffer has been resized here. */
+LJLIB_ASM(string_rep)
+{
+  GCstr *s = lj_lib_checkstr(L, 1);
+  int32_t len = (int32_t)s->len;
+  int32_t k = lj_lib_checkint(L, 2);
+  int64_t tlen = (int64_t)k * len;  /* 64 bit product avoids overflow. */
+  const char *src;
+  char *buf;
+  if (k <= 0) return FFH_RETRY;
+  if (tlen > LJ_MAX_STR)
+    lj_err_caller(L, LJ_ERR_STROV);
+  buf = lj_str_needbuf(L, &G(L)->tmpbuf, (MSize)tlen);
+  if (len <= 1) return FFH_RETRY;  /* ASM code only needed buffer resize. */
+  src = strdata(s);
+  do {
+    int32_t i = 0;
+    do { *buf++ = src[i++]; } while (i < len);
+  } while (--k > 0);
+  setstrV(L, L->base-1, lj_str_new(L, G(L)->tmpbuf.buf, (size_t)tlen));
+  return FFH_RES(1);
+}
+
+/* C fallback for string.reverse: only grows the temp buffer, the
+** actual reversal happens in the assembler fast path. */
+LJLIB_ASM(string_reverse)
+{
+  GCstr *s = lj_lib_checkstr(L, 1);
+  lj_str_needbuf(L, &G(L)->tmpbuf, s->len);
+  return FFH_RETRY;
+}
+LJLIB_ASM_(string_lower)
+LJLIB_ASM_(string_upper)
+
+/* ------------------------------------------------------------------------ */
+
+/* string.dump is not supported in this release: always raises an error. */
+LJLIB_CF(string_dump)
+{
+  lj_err_caller(L, LJ_ERR_STRDUMP);
+  return 0;  /* unreachable */
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* macro to `unsign' a character */
+#define uchar(c)        ((unsigned char)(c))
+
+/* Special values for capture[].len: capture still open / position capture. */
+#define CAP_UNFINISHED	(-1)
+#define CAP_POSITION	(-2)
+
+/* State threaded through the pattern-matching engine (adapted from the
+** Lua 5.1 lstrlib.c matcher). */
+typedef struct MatchState {
+  const char *src_init;  /* init of source string */
+  const char *src_end;  /* end (`\0') of source string */
+  lua_State *L;
+  int level;  /* total number of captures (finished or unfinished) */
+  struct {
+    const char *init;
+    ptrdiff_t len;  /* length, or CAP_UNFINISHED/CAP_POSITION */
+  } capture[LUA_MAXCAPTURES];
+} MatchState;
+
+#define L_ESC		'%'
+/* Characters that make a pattern non-plain (force the full matcher). */
+#define SPECIALS	"^$*+?.([%-"
+
+/* Validate a back-reference digit ('1'..'9') and return its 0-based
+** capture index; errors on out-of-range or still-open captures. */
+static int check_capture(MatchState *ms, int l)
+{
+  l -= '1';
+  if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
+    lj_err_caller(ms->L, LJ_ERR_STRCAPI);
+  return l;
+}
+
+/* Find the innermost (highest-index) unfinished capture to close. */
+static int capture_to_close(MatchState *ms)
+{
+  int level = ms->level;
+  for (level--; level>=0; level--)
+    if (ms->capture[level].len == CAP_UNFINISHED) return level;
+  lj_err_caller(ms->L, LJ_ERR_STRPATC);
+  return 0;  /* unreachable */
+}
+
+/* Return a pointer just past the single pattern item starting at p
+** (plain char, %-escape or [set]); errors on malformed patterns. */
+static const char *classend(MatchState *ms, const char *p)
+{
+  switch (*p++) {
+  case L_ESC:
+    if (*p == '\0')
+      lj_err_caller(ms->L, LJ_ERR_STRPATE);
+    return p+1;
+  case '[':
+    if (*p == '^') p++;
+    do {  /* look for a `]' */
+      if (*p == '\0')
+	lj_err_caller(ms->L, LJ_ERR_STRPATM);
+      if (*(p++) == L_ESC && *p != '\0')
+	p++;  /* skip escapes (e.g. `%]') */
+    } while (*p != ']');
+    return p+1;
+  default:
+    return p;
+  }
+}
+
+/* Map from (class letter & 0x1f) to the lj_ctype class bit, e.g.
+** 'a'/'A' -> LJ_CTYPE_ALPHA, 'd'/'D' -> LJ_CTYPE_DIGIT, etc. */
+static const unsigned char match_class_map[32] = {
+  0, LJ_CTYPE_ALPHA, 0, LJ_CTYPE_CNTRL, LJ_CTYPE_DIGIT, 0,0,0,0,0,0,0,
+  LJ_CTYPE_LOWER, 0,0,0, LJ_CTYPE_PUNCT, 0,0, LJ_CTYPE_SPACE, 0,
+  LJ_CTYPE_UPPER, 0, LJ_CTYPE_ALNUM, LJ_CTYPE_XDIGIT, 0,0,0,0,0,0,0
+};
+
+/* Test character c against %-class letter cl. Letters (0x40 bit set)
+** are looked up in the map; a lowercase letter (0x20 set) matches the
+** class, uppercase matches the complement. 'z'/'Z' test for NUL.
+** Anything else matches only itself. */
+static int match_class(int c, int cl)
+{
+  if ((cl & 0xc0) == 0x40) {
+    int t = match_class_map[(cl&0x1f)];
+    if (t) {
+      t = lj_ctype_isa(c, t);
+      return (cl & 0x20) ? t : !t;
+    }
+    if (cl == 'z') return c == 0;
+    if (cl == 'Z') return c != 0;
+  }
+  return (cl == c);
+}
+
+/* Test c against a bracket set [..]; p points at '[', ec at the
+** closing ']'. A leading '^' complements the result. */
+static int matchbracketclass(int c, const char *p, const char *ec)
+{
+  int sig = 1;
+  if (*(p+1) == '^') {
+    sig = 0;
+    p++;  /* skip the `^' */
+  }
+  while (++p < ec) {
+    if (*p == L_ESC) {
+      p++;
+      if (match_class(c, uchar(*p)))
+	return sig;
+    }
+    else if ((*(p+1) == '-') && (p+2 < ec)) {  /* range a-b */
+      p+=2;
+      if (uchar(*(p-2)) <= c && c <= uchar(*p))
+	return sig;
+    }
+    else if (uchar(*p) == c) return sig;
+  }
+  return !sig;
+}
+
+/* Test c against the single pattern item [p,ep). */
+static int singlematch(int c, const char *p, const char *ep)
+{
+  switch (*p) {
+  case '.': return 1;  /* matches any char */
+  case L_ESC: return match_class(c, uchar(*(p+1)));
+  case '[': return matchbracketclass(c, p, ep-1);
+  default:  return (uchar(*p) == c);
+  }
+}
+
+static const char *match(MatchState *ms, const char *s, const char *p);
+
+/* %bxy: match a balanced run starting with x and ending with the
+** matching y; returns the position after the closing char or NULL. */
+static const char *matchbalance(MatchState *ms, const char *s, const char *p)
+{
+  if (*p == 0 || *(p+1) == 0)
+    lj_err_caller(ms->L, LJ_ERR_STRPATU);
+  if (*s != *p) {
+    return NULL;
+  } else {
+    int b = *p;   /* opening char */
+    int e = *(p+1);  /* closing char */
+    int cont = 1;  /* nesting depth */
+    while (++s < ms->src_end) {
+      if (*s == e) {
+	if (--cont == 0) return s+1;
+      } else if (*s == b) {
+	cont++;
+      }
+    }
+  }
+  return NULL;  /* string ends out of balance */
+}
+
+/* Greedy repetition ('*'/'+'): expand as far as possible, then back
+** off one repetition at a time until the rest of the pattern matches. */
+static const char *max_expand(MatchState *ms, const char *s,
+			      const char *p, const char *ep)
+{
+  ptrdiff_t i = 0;  /* counts maximum expand for item */
+  while ((s+i)<ms->src_end && singlematch(uchar(*(s+i)), p, ep))
+    i++;
+  /* keeps trying to match with the maximum repetitions */
+  while (i>=0) {
+    const char *res = match(ms, (s+i), ep+1);
+    if (res) return res;
+    i--;  /* else didn't match; reduce 1 repetition to try again */
+  }
+  return NULL;
+}
+
+/* Lazy repetition ('-'): try the rest of the pattern first, then
+** consume one more item on each failure. */
+static const char *min_expand(MatchState *ms, const char *s,
+			      const char *p, const char *ep)
+{
+  for (;;) {
+    const char *res = match(ms, s, ep+1);
+    if (res != NULL)
+      return res;
+    else if (s<ms->src_end && singlematch(uchar(*s), p, ep))
+      s++;  /* try with one more repetition */
+    else
+      return NULL;
+  }
+}
+
+/* Open a new capture (value capture or position capture per 'what')
+** and continue matching; undone on failure. */
+static const char *start_capture(MatchState *ms, const char *s,
+				 const char *p, int what)
+{
+  const char *res;
+  int level = ms->level;
+  if (level >= LUA_MAXCAPTURES) lj_err_caller(ms->L, LJ_ERR_STRCAPN);
+  ms->capture[level].init = s;
+  ms->capture[level].len = what;
+  ms->level = level+1;
+  if ((res=match(ms, s, p)) == NULL)  /* match failed? */
+    ms->level--;  /* undo capture */
+  return res;
+}
+
+/* Close the innermost open capture and continue matching; reopened on
+** failure. */
+static const char *end_capture(MatchState *ms, const char *s,
+			       const char *p)
+{
+  int l = capture_to_close(ms);
+  const char *res;
+  ms->capture[l].len = s - ms->capture[l].init;  /* close capture */
+  if ((res = match(ms, s, p)) == NULL)  /* match failed? */
+    ms->capture[l].len = CAP_UNFINISHED;  /* undo capture */
+  return res;
+}
+
+/* %1..%9 back-reference: match the literal text of capture l at s. */
+static const char *match_capture(MatchState *ms, const char *s, int l)
+{
+  size_t len;
+  l = check_capture(ms, l);
+  len = (size_t)ms->capture[l].len;
+  if ((size_t)(ms->src_end-s) >= len &&
+      memcmp(ms->capture[l].init, s, len) == 0)
+    return s+len;
+  else
+    return NULL;
+}
+
+/* Core recursive matcher: try to match pattern p against subject s.
+** Returns the position after the match or NULL. Tail calls are turned
+** into goto's back to `init' to bound C stack growth. */
+static const char *match(MatchState *ms, const char *s, const char *p)
+{
+  init: /* using goto's to optimize tail recursion */
+  switch (*p) {
+  case '(':  /* start capture */
+    if (*(p+1) == ')')  /* position capture? */
+      return start_capture(ms, s, p+2, CAP_POSITION);
+    else
+      return start_capture(ms, s, p+1, CAP_UNFINISHED);
+  case ')':  /* end capture */
+    return end_capture(ms, s, p+1);
+  case L_ESC:
+    switch (*(p+1)) {
+    case 'b':  /* balanced string? */
+      s = matchbalance(ms, s, p+2);
+      if (s == NULL) return NULL;
+      p+=4;
+      goto init;  /* else return match(ms, s, p+4); */
+    case 'f': {  /* frontier? */
+      const char *ep; char previous;
+      p += 2;
+      if (*p != '[')
+	lj_err_caller(ms->L, LJ_ERR_STRPATB);
+      ep = classend(ms, p);  /* points to what is next */
+      previous = (s == ms->src_init) ? '\0' : *(s-1);
+      if (matchbracketclass(uchar(previous), p, ep-1) ||
+	 !matchbracketclass(uchar(*s), p, ep-1)) return NULL;
+      p=ep;
+      goto init;  /* else return match(ms, s, ep); */
+      }
+    default:
+      if (lj_ctype_isdigit(uchar(*(p+1)))) {  /* capture results (%0-%9)? */
+	s = match_capture(ms, s, uchar(*(p+1)));
+	if (s == NULL) return NULL;
+	p+=2;
+	goto init;  /* else return match(ms, s, p+2) */
+      }
+      goto dflt;  /* case default */
+    }
+  case '\0':  /* end of pattern */
+    return s;  /* match succeeded */
+  case '$':
+    if (*(p+1) == '\0')  /* is the `$' the last char in pattern? */
+      return (s == ms->src_end) ? s : NULL;  /* check end of string */
+    else
+      goto dflt;
+  default: dflt: {  /* it is a pattern item */
+    const char *ep = classend(ms, p);  /* points to what is next */
+    int m = s<ms->src_end && singlematch(uchar(*s), p, ep);
+    switch (*ep) {
+    case '?': {  /* optional */
+      const char *res;
+      if (m && ((res=match(ms, s+1, ep+1)) != NULL))
+	return res;
+      p=ep+1;
+      goto init;  /* else return match(ms, s, ep+1); */
+      }
+    case '*':  /* 0 or more repetitions */
+      return max_expand(ms, s, p, ep);
+    case '+':  /* 1 or more repetitions */
+      return (m ? max_expand(ms, s+1, p, ep) : NULL);
+    case '-':  /* 0 or more repetitions (minimum) */
+      return min_expand(ms, s, p, ep);
+    default:
+      if (!m) return NULL;
+      s++; p=ep;
+      goto init;  /* else return match(ms, s+1, ep); */
+    }
+    }
+  }
+}
+
+/* Plain (pattern-free) substring search: find s2 (length l2) inside
+** s1 (length l1), using memchr to skip to candidate first bytes.
+** Returns the start of the first occurrence or NULL. */
+static const char *lmemfind(const char *s1, size_t l1,
+			    const char *s2, size_t l2)
+{
+  if (l2 == 0) {
+    return s1;  /* empty strings are everywhere */
+  } else if (l2 > l1) {
+    return NULL;  /* avoids a negative `l1' */
+  } else {
+    const char *init;  /* to search for a `*s2' inside `s1' */
+    l2--;  /* 1st char will be checked by `memchr' */
+    l1 = l1-l2;  /* `s2' cannot be found after that */
+    while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
+      init++;   /* 1st char is already checked */
+      if (memcmp(init, s2+1, l2) == 0) {
+	return init-1;
+      } else {  /* correct `l1' and `s1' to try again */
+	l1 -= (size_t)(init-s1);
+	s1 = init;
+      }
+    }
+    return NULL;  /* not found */
+  }
+}
+
+/* Push capture i as a Lua value: the whole match [s,e) when there are
+** no explicit captures, the 1-based position for position captures,
+** otherwise the captured substring. */
+static void push_onecapture(MatchState *ms, int i, const char *s, const char *e)
+{
+  if (i >= ms->level) {
+    if (i == 0)  /* ms->level == 0, too */
+      lua_pushlstring(ms->L, s, (size_t)(e - s));  /* add whole match */
+    else
+      lj_err_caller(ms->L, LJ_ERR_STRCAPI);
+  } else {
+    ptrdiff_t l = ms->capture[i].len;
+    if (l == CAP_UNFINISHED) lj_err_caller(ms->L, LJ_ERR_STRCAPU);
+    if (l == CAP_POSITION)
+      lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
+    else
+      lua_pushlstring(ms->L, ms->capture[i].init, (size_t)l);
+  }
+}
+
+/* Push all captures (or the whole match [s,e) if none) and return the
+** number of values pushed. */
+static int push_captures(MatchState *ms, const char *s, const char *e)
+{
+  int i;
+  int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
+  luaL_checkstack(ms->L, nlevels, "too many captures");
+  for (i = 0; i < nlevels; i++)
+    push_onecapture(ms, i, s, e);
+  return nlevels;  /* number of strings pushed */
+}
+
+/* Convert a possibly-negative 1-based position to a non-negative one. */
+static ptrdiff_t posrelat(ptrdiff_t pos, size_t len)
+{
+  /* relative string position: negative means back from end */
+  if (pos < 0) pos += (ptrdiff_t)len + 1;
+  return (pos >= 0) ? pos : 0;
+}
+
+/* Shared implementation of string.find (find=1) and string.match
+** (find=0). find with plain=true or a pattern without specials uses a
+** plain substring search; otherwise the matcher is run at successive
+** start positions (once only for anchored patterns). */
+static int str_find_aux(lua_State *L, int find)
+{
+  size_t l1, l2;
+  const char *s = luaL_checklstring(L, 1, &l1);
+  const char *p = luaL_checklstring(L, 2, &l2);
+  ptrdiff_t init = posrelat(luaL_optinteger(L, 3, 1), l1) - 1;
+  if (init < 0)
+    init = 0;
+  else if ((size_t)(init) > l1)
+    init = (ptrdiff_t)l1;
+  if (find && (lua_toboolean(L, 4) ||  /* explicit request? */
+      strpbrk(p, SPECIALS) == NULL)) {  /* or no special characters? */
+    /* do a plain search */
+    const char *s2 = lmemfind(s+init, l1-(size_t)init, p, l2);
+    if (s2) {
+      lua_pushinteger(L, s2-s+1);
+      lua_pushinteger(L, s2-s+(ptrdiff_t)l2);
+      return 2;
+    }
+  } else {
+    MatchState ms;
+    int anchor = (*p == '^') ? (p++, 1) : 0;
+    const char *s1=s+init;
+    ms.L = L;
+    ms.src_init = s;
+    ms.src_end = s+l1;
+    do {
+      const char *res;
+      ms.level = 0;
+      if ((res=match(&ms, s1, p)) != NULL) {
+	if (find) {
+	  lua_pushinteger(L, s1-s+1);  /* start */
+	  lua_pushinteger(L, res-s);   /* end */
+	  return push_captures(&ms, NULL, 0) + 2;
+	} else {
+	  return push_captures(&ms, s1, res);
+	}
+      }
+    } while (s1++ < ms.src_end && !anchor);
+  }
+  lua_pushnil(L);  /* not found */
+  return 1;
+}
+
+LJLIB_CF(string_find)
+{
+  return str_find_aux(L, 1);
+}
+
+LJLIB_CF(string_match)
+{
+  return str_find_aux(L, 0);
+}
+
+/* Iterator returned by string.gmatch. Upvalues: 1 = subject string,
+** 2 = pattern, 3 = current scan offset (kept in u32.lo). */
+LJLIB_NOREG LJLIB_CF(string_gmatch_aux)
+{
+  const char *p = strVdata(lj_lib_upvalue(L, 2));
+  GCstr *str = strV(lj_lib_upvalue(L, 1));
+  const char *s = strdata(str);
+  TValue *tvpos = lj_lib_upvalue(L, 3);
+  const char *src = s + tvpos->u32.lo;
+  MatchState ms;
+  ms.L = L;
+  ms.src_init = s;
+  ms.src_end = s + str->len;
+  for (; src <= ms.src_end; src++) {
+    const char *e;
+    ms.level = 0;
+    if ((e = match(&ms, src, p)) != NULL) {
+      int32_t pos = (int32_t)(e - s);
+      if (e == src) pos++;  /* Ensure progress for empty match. */
+      tvpos->u32.lo = (uint32_t)pos;
+      return push_captures(&ms, src, e);
+    }
+  }
+  return 0;  /* not found */
+}
+
+/* string.gmatch(s, p): build the iterator closure with the subject,
+** the pattern and a zero start offset as upvalues. */
+LJLIB_CF(string_gmatch)
+{
+  lj_lib_checkstr(L, 1);
+  lj_lib_checkstr(L, 2);
+  L->top = L->base+3;
+  (L->top-1)->u64 = 0;  /* Initial scan offset = 0. */
+  lua_pushcclosure(L, lj_cf_string_gmatch_aux, 3);
+  funcV(L->top-1)->c.ffid = FF_string_gmatch_aux;
+  return 1;
+}
+
+/* gsub with a string replacement: copy it to the buffer, expanding
+** %0 (whole match) and %1-%9 (captures); %% and %<non-digit> emit the
+** following character verbatim. */
+static void add_s(MatchState *ms, luaL_Buffer *b, const char *s, const char *e)
+{
+  size_t l, i;
+  const char *news = lua_tolstring(ms->L, 3, &l);
+  for (i = 0; i < l; i++) {
+    if (news[i] != L_ESC) {
+      luaL_addchar(b, news[i]);
+    } else {
+      i++;  /* skip ESC */
+      if (!lj_ctype_isdigit(uchar(news[i]))) {
+	luaL_addchar(b, news[i]);
+      } else if (news[i] == '0') {
+	luaL_addlstring(b, s, (size_t)(e - s));
+      } else {
+	push_onecapture(ms, news[i] - '1', s, e);
+	luaL_addvalue(b);  /* add capture to accumulated result */
+      }
+    }
+  }
+}
+
+/* gsub dispatch on the replacement argument (stack slot 3): string or
+** number -> template expansion, function -> call with captures,
+** table -> index by first capture. A nil/false result keeps the
+** original matched text; other non-string results are errors. */
+static void add_value(MatchState *ms, luaL_Buffer *b,
+		      const char *s, const char *e)
+{
+  lua_State *L = ms->L;
+  switch (lua_type(L, 3)) {
+    case LUA_TNUMBER:
+    case LUA_TSTRING: {
+      add_s(ms, b, s, e);
+      return;
+    }
+    case LUA_TFUNCTION: {
+      int n;
+      lua_pushvalue(L, 3);
+      n = push_captures(ms, s, e);
+      lua_call(L, n, 1);
+      break;
+    }
+    case LUA_TTABLE: {
+      push_onecapture(ms, 0, s, e);
+      lua_gettable(L, 3);
+      break;
+    }
+  }
+  if (!lua_toboolean(L, -1)) {  /* nil or false? */
+    lua_pop(L, 1);
+    lua_pushlstring(L, s, (size_t)(e - s));  /* keep original text */
+  } else if (!lua_isstring(L, -1)) {
+    lj_err_callerv(L, LJ_ERR_STRGSRV, luaL_typename(L, -1));
+  }
+  luaL_addvalue(b);  /* add result to accumulator */
+}
+
+/* string.gsub(s, p, repl [, max]): replace up to max matches of p in s.
+** Advances by one char after an empty match to guarantee progress.
+** Returns the result string and the number of substitutions. */
+LJLIB_CF(string_gsub)
+{
+  size_t srcl;
+  const char *src = luaL_checklstring(L, 1, &srcl);
+  const char *p = luaL_checkstring(L, 2);
+  int  tr = lua_type(L, 3);
+  int max_s = luaL_optint(L, 4, (int)(srcl+1));
+  int anchor = (*p == '^') ? (p++, 1) : 0;
+  int n = 0;
+  MatchState ms;
+  luaL_Buffer b;
+  if (!(tr == LUA_TNUMBER || tr == LUA_TSTRING ||
+	tr == LUA_TFUNCTION || tr == LUA_TTABLE))
+    lj_err_arg(L, 3, LJ_ERR_NOSFT);
+  luaL_buffinit(L, &b);
+  ms.L = L;
+  ms.src_init = src;
+  ms.src_end = src+srcl;
+  while (n < max_s) {
+    const char *e;
+    ms.level = 0;
+    e = match(&ms, src, p);
+    if (e) {
+      n++;
+      add_value(&ms, &b, src, e);
+    }
+    if (e && e>src) /* non empty match? */
+      src = e;  /* skip it */
+    else if (src < ms.src_end)
+      luaL_addchar(&b, *src++);
+    else
+      break;
+    if (anchor)
+      break;
+  }
+  luaL_addlstring(&b, src, (size_t)(ms.src_end-src));
+  luaL_pushresult(&b);
+  lua_pushinteger(L, n);  /* number of substitutions */
+  return 2;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
+#define MAX_FMTITEM	512
+/* valid flags in a format specification */
+#define FMT_FLAGS	"-+ #0"
+/*
+** maximum size of each format specification (such as '%-099.99d')
+** (+10 accounts for %99.99x plus margin of error)
+*/
+#define MAX_FMTSPEC	(sizeof(FMT_FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
+
+/* %q: append string argument `arg' as a quoted, Lua-readable literal,
+** escaping quote, backslash, newline, CR and embedded NULs. */
+static void addquoted(lua_State *L, luaL_Buffer *b, int arg)
+{
+  GCstr *str = lj_lib_checkstr(L, arg);
+  int32_t len = (int32_t)str->len;
+  const char *s = strdata(str);
+  luaL_addchar(b, '"');
+  while (len--) {
+    switch (*s) {
+    case '"': case '\\': case '\n':
+      luaL_addchar(b, '\\');
+      luaL_addchar(b, *s);
+      break;
+    case '\r':
+      luaL_addlstring(b, "\\r", 2);
+      break;
+    case '\0':
+      luaL_addlstring(b, "\\000", 4);
+      break;
+    default:
+      luaL_addchar(b, *s);
+      break;
+    }
+    s++;
+  }
+  luaL_addchar(b, '"');
+}
+
+/* Copy one format spec (flags, width, precision) from strfrmt into
+** form as "%...", validating the bounds; returns a pointer to the
+** conversion character. */
+static const char *scanformat(lua_State *L, const char *strfrmt, char *form)
+{
+  const char *p = strfrmt;
+  while (*p != '\0' && strchr(FMT_FLAGS, *p) != NULL) p++;  /* skip flags */
+  if ((size_t)(p - strfrmt) >= sizeof(FMT_FLAGS))
+    lj_err_caller(L, LJ_ERR_STRFMTR);
+  if (lj_ctype_isdigit(uchar(*p))) p++;  /* skip width */
+  if (lj_ctype_isdigit(uchar(*p))) p++;  /* (2 digits at most) */
+  if (*p == '.') {
+    p++;
+    if (lj_ctype_isdigit(uchar(*p))) p++;  /* skip precision */
+    if (lj_ctype_isdigit(uchar(*p))) p++;  /* (2 digits at most) */
+  }
+  if (lj_ctype_isdigit(uchar(*p)))
+    lj_err_caller(L, LJ_ERR_STRFMTW);
+  *(form++) = '%';
+  strncpy(form, strfrmt, (size_t)(p - strfrmt + 1));
+  form += p - strfrmt + 1;
+  *form = '\0';
+  return p;
+}
+
+/* Insert LUA_INTFRMLEN (e.g. "l") before the conversion char of form,
+** so integer conversions use the wider C integer type. */
+static void addintlen(char *form)
+{
+  size_t l = strlen(form);
+  char spec = form[l - 1];
+  strcpy(form + l - 1, LUA_INTFRMLEN);
+  form[l + sizeof(LUA_INTFRMLEN) - 2] = spec;
+  form[l + sizeof(LUA_INTFRMLEN) - 1] = '\0';
+}
+
+/* string.format(fmt, ...): printf-style formatting. Each directive is
+** re-emitted through sprintf with a validated, bounded spec; %q and %p
+** are handled directly; long %s without precision bypasses sprintf. */
+LJLIB_CF(string_format)
+{
+  int arg = 1;
+  GCstr *fmt = lj_lib_checkstr(L, arg);
+  const char *strfrmt = strdata(fmt);
+  const char *strfrmt_end = strfrmt + fmt->len;
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  while (strfrmt < strfrmt_end) {
+    if (*strfrmt != L_ESC) {
+      luaL_addchar(&b, *strfrmt++);
+    } else if (*++strfrmt == L_ESC) {
+      luaL_addchar(&b, *strfrmt++);  /* %% */
+    } else { /* format item */
+      char form[MAX_FMTSPEC];  /* to store the format (`%...') */
+      char buff[MAX_FMTITEM];  /* to store the formatted item */
+      arg++;
+      strfrmt = scanformat(L, strfrmt, form);
+      switch (*strfrmt++) {
+      case 'c':
+	sprintf(buff, form, lj_lib_checkint(L, arg));
+	break;
+      case 'd':  case 'i':
+	addintlen(form);
+	sprintf(buff, form, (LUA_INTFRM_T)lj_lib_checknum(L, arg));
+	break;
+      case 'o':  case 'u':  case 'x':  case 'X':
+	addintlen(form);
+	sprintf(buff, form, (unsigned LUA_INTFRM_T)lj_lib_checknum(L, arg));
+	break;
+      case 'e':  case 'E': case 'f': case 'g': case 'G':
+	sprintf(buff, form, (double)lj_lib_checknum(L, arg));
+	break;
+      case 'q':
+	addquoted(L, &b, arg);
+	continue;
+      case 'p':
+	lj_str_pushf(L, "%p", lua_topointer(L, arg));
+	luaL_addvalue(&b);
+	continue;
+      case 's': {
+	GCstr *str = lj_lib_checkstr(L, arg);
+	if (!strchr(form, '.') && str->len >= 100) {
+	  /* no precision and string is too long to be formatted;
+	     keep original string */
+	  setstrV(L, L->top++, str);
+	  luaL_addvalue(&b);
+	  continue;
+	}
+	sprintf(buff, form, strdata(str));
+	break;
+	}
+      default:
+	lj_err_callerv(L, LJ_ERR_STRFMTO, *(strfrmt -1));
+	break;
+      }
+      luaL_addlstring(&b, buff, strlen(buff));
+    }
+  }
+  luaL_pushresult(&b);
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* Open the string library and install the string table as __index of
+** the string base metatable (so s:method() works). */
+LUALIB_API int luaopen_string(lua_State *L)
+{
+  GCtab *mt;
+  LJ_LIB_REG(L, string);
+#if defined(LUA_COMPAT_GFIND)
+  lua_getfield(L, -1, "gmatch");
+  lua_setfield(L, -2, "gfind");  /* 5.0 compatibility alias */
+#endif
+  mt = lj_tab_new(L, 0, 1);
+  /* NOBARRIER: G(L)->mmname[] is a GC root. */
+  setgcref(G(L)->basemt[~LJ_TSTR], obj2gco(mt));
+  settabV(L, lj_tab_setstr(L, mt, strref(G(L)->mmname[MM_index])),
+	      tabV(L->top-1));
+  mt->nomm = cast_byte(~(1u<<MM_index));  /* Only __index is defined. */
+  return 1;
+}
+

+ 276 - 0
src/lib_table.c

@@ -0,0 +1,276 @@
+/*
+** Table library.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lib_table_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_tab.h"
+#include "lj_lib.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_table
+
+/* table.foreachi(t, f): call f(i, t[i]) for i = 1..#t; stop and return
+** the first non-nil result. */
+LJLIB_CF(table_foreachi)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  GCfunc *func = lj_lib_checkfunc(L, 2);
+  MSize i, n = lj_tab_len(t);
+  for (i = 1; i <= n; i++) {
+    cTValue *val;
+    setfuncV(L, L->top, func);
+    setintV(L->top+1, i);
+    val = lj_tab_getint(t, (int32_t)i);
+    if (val) { copyTV(L, L->top+2, val); } else { setnilV(L->top+2); }
+    L->top += 3;
+    lua_call(L, 2, 1);
+    if (!tvisnil(L->top-1))
+      return 1;
+    L->top--;
+  }
+  return 0;
+}
+
+/* table.foreach(t, f): call f(k, v) for every pair of t; stop and
+** return the first non-nil result. The current key is kept at
+** L->top-1 between iterations for lj_tab_next. */
+LJLIB_CF(table_foreach)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  GCfunc *func = lj_lib_checkfunc(L, 2);
+  L->top = L->base+3;
+  setnilV(L->top-1);  /* Start traversal at the first key. */
+  while (lj_tab_next(L, t, L->top-1)) {
+    copyTV(L, L->top+2, L->top);
+    copyTV(L, L->top+1, L->top-1);
+    setfuncV(L, L->top, func);
+    L->top += 3;
+    lua_call(L, 2, 1);
+    if (!tvisnil(L->top-1))
+      return 1;
+    L->top--;
+  }
+  return 0;
+}
+
+/* table.getn: argument check only; the result is produced by the
+** assembler fast path. */
+LJLIB_ASM(table_getn)		LJLIB_REC(.)
+{
+  lj_lib_checktab(L, 1);
+  return FFH_UNREACHABLE;
+}
+
+/* table.maxn(t): return the largest positive numerical key of t.
+** Checks the highest non-nil slot of the array part, then every
+** numeric key in the hash part.
+*/
+LJLIB_CF(table_maxn)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  TValue *array = tvref(t->array);
+  Node *node;
+  lua_Number m = 0;
+  uint32_t i;
+  /* Scan the array part backwards: the *highest* non-nil index is the
+  ** candidate maximum. (A forward scan with an early break would find
+  ** the lowest non-nil index instead, which is wrong for maxn.)
+  */
+  for (i = t->asize; i > 0; i--)
+    if (!tvisnil(&array[i-1])) {
+      m = (lua_Number)(i-1);
+      break;
+    }
+  node = noderef(t->node);
+  for (i = 0; i <= t->hmask; i++)
+    if (tvisnum(&node[i].key) && numV(&node[i].key) > m)
+      m = numV(&node[i].key);
+  setnumV(L->top-1, m);
+  return 1;
+}
+
+/* table.insert(t, [pos,] v): append v at #t+1, or shift elements up
+** and insert at pos when three arguments are given. */
+LJLIB_CF(table_insert)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  int32_t n, i = (int32_t)lj_tab_len(t) + 1;
+  int nargs = (int)((char *)L->top - (char *)L->base);  /* byte distance */
+  if (nargs != 2*sizeof(TValue)) {
+    if (nargs != 3*sizeof(TValue))
+      lj_err_caller(L, LJ_ERR_TABINS);
+    /* NOBARRIER: This just moves existing elements around. */
+    for (n = lj_lib_checkint(L, 2); i > n; i--) {
+      /* The set may invalidate the get pointer, so need to do it first! */
+      TValue *dst = lj_tab_setint(L, t, i);
+      cTValue *src = lj_tab_getint(t, i-1);
+      if (src) {
+	copyTV(L, dst, src);
+      } else {
+	setnilV(dst);
+      }
+    }
+    i = n;
+  }
+  {
+    TValue *dst = lj_tab_setint(L, t, i);
+    copyTV(L, dst, L->top-1);  /* Set the new value and apply a write barrier. */
+    lj_gc_barriert(L, t, dst);
+  }
+  return 0;
+}
+
+/* table.remove(t [, pos]): remove and return t[pos] (default the last
+** element), shifting later elements down. */
+LJLIB_CF(table_remove)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  int32_t e = (int32_t)lj_tab_len(t);
+  int32_t pos = lj_lib_optint(L, 2, e);
+  if (!(1 <= pos && pos <= e))  /* position is outside bounds? */
+    return 0;  /* nothing to remove */
+  lua_rawgeti(L, 1, pos);  /* Result: the removed element. */
+  /* NOBARRIER: This just moves existing elements around. */
+  for (; pos < e; pos++) {
+    cTValue *src = lj_tab_getint(t, pos+1);
+    TValue *dst = lj_tab_setint(L, t, pos);
+    if (src) {
+      copyTV(L, dst, src);
+    } else {
+      setnilV(dst);
+    }
+  }
+  setnilV(lj_tab_setint(L, t, e));  /* Clear the vacated last slot. */
+  return 1;
+}
+
+/* table.concat(t [, sep [, i [, j]]]): concatenate t[i]..t[j] (which
+** must be strings or numbers) with sep between elements. */
+LJLIB_CF(table_concat)
+{
+  luaL_Buffer b;
+  GCtab *t = lj_lib_checktab(L, 1);
+  GCstr *sep = lj_lib_optstr(L, 2);
+  MSize seplen = sep ? sep->len : 0;
+  int32_t i = lj_lib_optint(L, 3, 1);
+  int32_t e = L->base+3 < L->top ? lj_lib_checkint(L, 4) :
+				   (int32_t)lj_tab_len(t);
+  luaL_buffinit(L, &b);
+  if (i <= e) {
+    for (;;) {
+      cTValue *o;
+      lua_rawgeti(L, 1, i);
+      o = L->top-1;
+      if (!(tvisstr(o) || tvisnum(o)))
+	lj_err_callerv(L, LJ_ERR_TABCAT, typename(o), i);
+      luaL_addvalue(&b);
+      if (i++ == e) break;
+      if (seplen)
+	luaL_addlstring(&b, strdata(sep), seplen);
+    }
+  }
+  luaL_pushresult(&b);
+  return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Pop the two values on top of the stack into t[i] and t[j]. */
+static void set2(lua_State *L, int i, int j)
+{
+  lua_rawseti(L, 1, i);
+  lua_rawseti(L, 1, j);
+}
+
+/* Compare the values at stack indexes a and b, using the comparator at
+** slot 2 if given, otherwise the < operator. */
+static int sort_comp(lua_State *L, int a, int b)
+{
+  if (!lua_isnil(L, 2)) {  /* function? */
+    int res;
+    lua_pushvalue(L, 2);
+    lua_pushvalue(L, a-1);  /* -1 to compensate function */
+    lua_pushvalue(L, b-2);  /* -2 to compensate function and `a' */
+    lua_call(L, 2, 1);
+    res = lua_toboolean(L, -1);
+    lua_pop(L, 1);
+    return res;
+  } else {  /* a < b? */
+    return lua_lessthan(L, a, b);
+  }
+}
+
+/* Quicksort of t[l..u] (t at stack slot 1), with median-of-three pivot
+** selection. Recurses only on the smaller partition and loops on the
+** larger one to bound stack depth. Errors if the order function is
+** inconsistent (not a strict weak order). */
+static void auxsort(lua_State *L, int l, int u)
+{
+  while (l < u) {  /* for tail recursion */
+    int i, j;
+    /* sort elements a[l], a[(l+u)/2] and a[u] */
+    lua_rawgeti(L, 1, l);
+    lua_rawgeti(L, 1, u);
+    if (sort_comp(L, -1, -2))  /* a[u] < a[l]? */
+      set2(L, l, u);  /* swap a[l] - a[u] */
+    else
+      lua_pop(L, 2);
+    if (u-l == 1) break;  /* only 2 elements */
+    i = (l+u)/2;
+    lua_rawgeti(L, 1, i);
+    lua_rawgeti(L, 1, l);
+    if (sort_comp(L, -2, -1)) {  /* a[i]<a[l]? */
+      set2(L, i, l);
+    } else {
+      lua_pop(L, 1);  /* remove a[l] */
+      lua_rawgeti(L, 1, u);
+      if (sort_comp(L, -1, -2))  /* a[u]<a[i]? */
+	set2(L, i, u);
+      else
+	lua_pop(L, 2);
+    }
+    if (u-l == 2) break;  /* only 3 elements */
+    lua_rawgeti(L, 1, i);  /* Pivot */
+    lua_pushvalue(L, -1);
+    lua_rawgeti(L, 1, u-1);
+    set2(L, i, u-1);
+    /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
+    i = l; j = u-1;
+    for (;;) {  /* invariant: a[l..i] <= P <= a[j..u] */
+      /* repeat ++i until a[i] >= P */
+      while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
+	if (i>u) lj_err_caller(L, LJ_ERR_TABSORT);
+	lua_pop(L, 1);  /* remove a[i] */
+      }
+      /* repeat --j until a[j] <= P */
+      while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
+	if (j<l) lj_err_caller(L, LJ_ERR_TABSORT);
+	lua_pop(L, 1);  /* remove a[j] */
+      }
+      if (j<i) {
+	lua_pop(L, 3);  /* pop pivot, a[i], a[j] */
+	break;
+      }
+      set2(L, i, j);
+    }
+    lua_rawgeti(L, 1, u-1);
+    lua_rawgeti(L, 1, i);
+    set2(L, u-1, i);  /* swap pivot (a[u-1]) with a[i] */
+    /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
+    /* adjust so that smaller half is in [j..i] and larger one in [l..u] */
+    if (i-l < u-i) {
+      j=l; i=i-1; l=i+2;
+    } else {
+      j=i+1; i=u; u=j-2;
+    }
+    auxsort(L, j, i);  /* call recursively the smaller one */
+  }  /* repeat the routine for the larger one */
+}
+
+/* table.sort(t [, comp]): in-place sort of t[1..#t] via auxsort. */
+LJLIB_CF(table_sort)
+{
+  GCtab *t = lj_lib_checktab(L, 1);
+  int32_t n = (int32_t)lj_tab_len(t);
+  lua_settop(L, 2);  /* Comparator (or nil) fixed at slot 2. */
+  if (!tvisnil(L->base+1))
+    lj_lib_checkfunc(L, 2);
+  auxsort(L, 1, n);
+  return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* Open the table library. */
+LUALIB_API int luaopen_table(lua_State *L)
+{
+  LJ_LIB_REG(L, table);
+  return 1;
+}

+ 6 - 0
src/lj.supp

@@ -0,0 +1,6 @@
+# Valgrind suppression file for LuaJIT 2.x.
+{
+   Optimized string compare
+   Memcheck:Addr4
+   fun:lj_str_cmp
+}

+ 1232 - 0
src/lj_alloc.c

@@ -0,0 +1,1232 @@
+/*
+** Bundled memory allocator.
+**
+** Beware: this is a HEAVILY CUSTOMIZED version of dlmalloc.
+** The original bears the following remark:
+**
+**   This is a version (aka dlmalloc) of malloc/free/realloc written by
+**   Doug Lea and released to the public domain, as explained at
+**   http://creativecommons.org/licenses/publicdomain.
+**
+**   * Version pre-2.8.4 Wed Mar 29 19:46:29 2006    (dl at gee)
+**
+** No additional copyright is claimed over the customizations.
+** Please do NOT bother the original author about this version here!
+**
+** If you want to use dlmalloc in another project, you should get
+** the original from: ftp://gee.cs.oswego.edu/pub/misc/
+** For thread-safe derivatives, take a look at:
+** - ptmalloc: http://www.malloc.de/
+** - nedmalloc: http://www.nedprod.com/programs/portable/nedmalloc/
+*/
+
+#define lj_alloc_c
+#define LUA_CORE
+
+/* To get the mremap prototype. Must be defined before any system includes. */
+#if defined(__linux__) && !defined(_GNU_SOURCE)
+#define _GNU_SOURCE
+#endif
+
+#include "lj_def.h"
+#include "lj_arch.h"
+#include "lj_alloc.h"
+
+#ifndef LUAJIT_USE_SYSMALLOC
+
+#define MAX_SIZE_T		(~(size_t)0)
+#define MALLOC_ALIGNMENT	((size_t)8U)
+
+#define DEFAULT_GRANULARITY	((size_t)128U * (size_t)1024U)
+#define DEFAULT_TRIM_THRESHOLD	((size_t)2U * (size_t)1024U * (size_t)1024U)
+#define DEFAULT_MMAP_THRESHOLD	((size_t)128U * (size_t)1024U)
+#define MAX_RELEASE_CHECK_RATE	255
+
+/* ------------------- size_t and alignment properties -------------------- */
+
+/* The byte and bit size of a size_t */
+#define SIZE_T_SIZE		(sizeof(size_t))
+#define SIZE_T_BITSIZE		(sizeof(size_t) << 3)
+
+/* Some constants coerced to size_t */
+/* Annoying but necessary to avoid errors on some platforms */
+#define SIZE_T_ZERO		((size_t)0)
+#define SIZE_T_ONE		((size_t)1)
+#define SIZE_T_TWO		((size_t)2)
+#define TWO_SIZE_T_SIZES	(SIZE_T_SIZE<<1)
+#define FOUR_SIZE_T_SIZES	(SIZE_T_SIZE<<2)
+#define SIX_SIZE_T_SIZES	(FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
+
+/* The bit mask value corresponding to MALLOC_ALIGNMENT */
+#define CHUNK_ALIGN_MASK	(MALLOC_ALIGNMENT - SIZE_T_ONE)
+
+/* the number of bytes to offset an address to align it */
+#define align_offset(A)\
+ ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
+  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
+
+/* -------------------------- MMAP support ------------------------------- */
+
+#define MFAIL			((void *)(MAX_SIZE_T))
+#define CMFAIL			((char *)(MFAIL)) /* defined for convenience */
+
+#define IS_DIRECT_BIT		(SIZE_T_ONE)
+
+#ifdef LUA_USE_WIN
+
+#if LJ_64
+#error "missing support for WIN64 to allocate in lower 2G"
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+/* Win32 MMAP via VirtualAlloc */
+static LJ_AINLINE void *CALL_MMAP(size_t size)
+{
+  /* Reserve and commit in one call; committed pages are zero-filled. */
+  void *ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;  /* Mimic mmap(): MFAIL on failure. */
+}
+
+/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
+static LJ_AINLINE void *DIRECT_MMAP(size_t size)
+{
+  /* Top-down placement keeps large direct allocs away from segment growth. */
+  void *ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
+			   PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;  /* Mimic mmap(): MFAIL on failure. */
+}
+
+/* This function supports releasing coalesced segments */
+static LJ_AINLINE int CALL_MUNMAP(void *ptr, size_t size)
+{
+  MEMORY_BASIC_INFORMATION minfo;
+  char *cptr = (char *)ptr;
+  /* VirtualFree(MEM_RELEASE) only accepts whole VirtualAlloc regions, so a
+  ** coalesced range must be walked and released region by region.
+  */
+  while (size) {
+    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
+      return -1;
+    /* Refuse partial regions or anything we did not allocate at cptr. */
+    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
+	minfo.State != MEM_COMMIT || minfo.RegionSize > size)
+      return -1;
+    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
+      return -1;
+    cptr += minfo.RegionSize;
+    size -= minfo.RegionSize;
+  }
+  return 0;  /* munmap() convention: 0 on success, -1 on failure. */
+}
+
+#else
+
+#include <sys/mman.h>
+
+#define MMAP_PROT		(PROT_READ|PROT_WRITE)
+#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
+#define MAP_ANONYMOUS		MAP_ANON
+#endif /* MAP_ANON */
+
+#if LJ_64
+#define MMAP_FLAGS		(MAP_PRIVATE|MAP_ANONYMOUS|MAP_32BIT)
+#else
+#define MMAP_FLAGS		(MAP_PRIVATE|MAP_ANONYMOUS)
+#endif
+
+#define CALL_MMAP(s)		mmap(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
+#define DIRECT_MMAP(s)		CALL_MMAP(s)
+#define CALL_MUNMAP(a, s)	munmap((a), (s))
+
+#ifdef __linux__
+/* Need to define _GNU_SOURCE to get the mremap prototype. */
+#define CALL_MREMAP(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
+#endif
+
+#endif
+
+#ifndef CALL_MREMAP
+#define CALL_MREMAP(addr, osz, nsz, mv) ((void)osz, MFAIL)
+#endif
+
+/* -----------------------  Chunk representations ------------------------ */
+
+struct malloc_chunk {
+  size_t               prev_foot;  /* Size of previous chunk (if free).  */
+  size_t               head;       /* Size and inuse bits. */
+  struct malloc_chunk *fd;         /* double links -- used only if free. */
+  struct malloc_chunk *bk;
+};
+
+typedef struct malloc_chunk  mchunk;
+typedef struct malloc_chunk *mchunkptr;
+typedef struct malloc_chunk *sbinptr;  /* The type of bins of chunks */
+typedef unsigned int bindex_t;         /* Described below */
+typedef unsigned int binmap_t;         /* Described below */
+typedef unsigned int flag_t;           /* The type of various bit flag sets */
+
+/* ------------------- Chunks sizes and alignments ----------------------- */
+
+#define MCHUNK_SIZE		(sizeof(mchunk))
+
+#define CHUNK_OVERHEAD		(SIZE_T_SIZE)
+
+/* Direct chunks need a second word of overhead ... */
+#define DIRECT_CHUNK_OVERHEAD	(TWO_SIZE_T_SIZES)
+/* ... and additional padding for fake next-chunk at foot */
+#define DIRECT_FOOT_PAD		(FOUR_SIZE_T_SIZES)
+
+/* The smallest size we can malloc is an aligned minimal chunk */
+#define MIN_CHUNK_SIZE\
+  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* conversion from malloc headers to user pointers, and back */
+#define chunk2mem(p)		((void *)((char *)(p) + TWO_SIZE_T_SIZES))
+#define mem2chunk(mem)		((mchunkptr)((char *)(mem) - TWO_SIZE_T_SIZES))
+/* chunk associated with aligned address A */
+#define align_as_chunk(A)	(mchunkptr)((A) + align_offset(chunk2mem(A)))
+
+/* Bounds on request (not chunk) sizes. */
+#define MAX_REQUEST		((~MIN_CHUNK_SIZE+1) << 2)
+#define MIN_REQUEST		(MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
+
+/* pad request bytes into a usable size */
+#define pad_request(req) \
+   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* pad request, checking for minimum (but not maximum) */
+#define request2size(req) \
+  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
+
+/* ------------------ Operations on head and foot fields ----------------- */
+
+#define PINUSE_BIT		(SIZE_T_ONE)
+#define CINUSE_BIT		(SIZE_T_TWO)
+#define INUSE_BITS		(PINUSE_BIT|CINUSE_BIT)
+
+/* Head value for fenceposts */
+#define FENCEPOST_HEAD		(INUSE_BITS|SIZE_T_SIZE)
+
+/* extraction of fields from head words */
+#define cinuse(p)		((p)->head & CINUSE_BIT)
+#define pinuse(p)		((p)->head & PINUSE_BIT)
+#define chunksize(p)		((p)->head & ~(INUSE_BITS))
+
+#define clear_pinuse(p)		((p)->head &= ~PINUSE_BIT)
+#define clear_cinuse(p)		((p)->head &= ~CINUSE_BIT)
+
+/* Treat space at ptr +/- offset as a chunk */
+#define chunk_plus_offset(p, s)		((mchunkptr)(((char *)(p)) + (s)))
+#define chunk_minus_offset(p, s)	((mchunkptr)(((char *)(p)) - (s)))
+
+/* Ptr to next or previous physical malloc_chunk. */
+#define next_chunk(p)	((mchunkptr)(((char *)(p)) + ((p)->head & ~INUSE_BITS)))
+#define prev_chunk(p)	((mchunkptr)(((char *)(p)) - ((p)->prev_foot) ))
+
+/* extract next chunk's pinuse bit */
+#define next_pinuse(p)	((next_chunk(p)->head) & PINUSE_BIT)
+
+/* Get/set size at footer */
+#define get_foot(p, s)	(((mchunkptr)((char *)(p) + (s)))->prev_foot)
+#define set_foot(p, s)	(((mchunkptr)((char *)(p) + (s)))->prev_foot = (s))
+
+/* Set size, pinuse bit, and foot */
+#define set_size_and_pinuse_of_free_chunk(p, s)\
+  ((p)->head = (s|PINUSE_BIT), set_foot(p, s))
+
+/* Set size, pinuse bit, foot, and clear next pinuse */
+#define set_free_with_pinuse(p, s, n)\
+  (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s))
+
+#define is_direct(p)\
+  (!((p)->head & PINUSE_BIT) && ((p)->prev_foot & IS_DIRECT_BIT))
+
+/* Get the internal overhead associated with chunk p */
+#define overhead_for(p)\
+ (is_direct(p)? DIRECT_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
+
+/* ---------------------- Overlaid data structures ----------------------- */
+
+struct malloc_tree_chunk {
+  /* The first four fields must be compatible with malloc_chunk */
+  size_t                    prev_foot;
+  size_t                    head;
+  struct malloc_tree_chunk *fd;
+  struct malloc_tree_chunk *bk;
+
+  struct malloc_tree_chunk *child[2];
+  struct malloc_tree_chunk *parent;
+  bindex_t                  index;
+};
+
+typedef struct malloc_tree_chunk  tchunk;
+typedef struct malloc_tree_chunk *tchunkptr;
+typedef struct malloc_tree_chunk *tbinptr; /* The type of bins of trees */
+
+/* A little helper macro for trees */
+#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1])
+
+/* ----------------------------- Segments -------------------------------- */
+
+struct malloc_segment {
+  char        *base;             /* base address */
+  size_t       size;             /* allocated size */
+  struct malloc_segment *next;   /* ptr to next segment */
+};
+
+typedef struct malloc_segment  msegment;
+typedef struct malloc_segment *msegmentptr;
+
+/* ---------------------------- malloc_state ----------------------------- */
+
+/* Bin types, widths and sizes */
+#define NSMALLBINS		(32U)
+#define NTREEBINS		(32U)
+#define SMALLBIN_SHIFT		(3U)
+#define SMALLBIN_WIDTH		(SIZE_T_ONE << SMALLBIN_SHIFT)
+#define TREEBIN_SHIFT		(8U)
+#define MIN_LARGE_SIZE		(SIZE_T_ONE << TREEBIN_SHIFT)
+#define MAX_SMALL_SIZE		(MIN_LARGE_SIZE - SIZE_T_ONE)
+#define MAX_SMALL_REQUEST  (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
+
+/* Per-allocator state. One instance is embedded in the first segment. */
+struct malloc_state {
+  binmap_t   smallmap;  /* Bitmap of non-empty smallbins. */
+  binmap_t   treemap;   /* Bitmap of non-empty treebins. */
+  size_t     dvsize;    /* Size of the designated-victim chunk (dv). */
+  size_t     topsize;   /* Size of the top (wilderness) chunk. */
+  mchunkptr  dv;        /* Designated victim: preferred split source. */
+  mchunkptr  top;       /* Topmost chunk of the current segment. */
+  size_t     trim_check;      /* topsize threshold that triggers a trim. */
+  size_t     release_checks;  /* Frees left until next segment-release scan. */
+  mchunkptr  smallbins[(NSMALLBINS+1)*2];  /* Packed smallbin list heads. */
+  tbinptr    treebins[NTREEBINS];          /* Roots of large-chunk trees. */
+  msegment   seg;       /* Head of the segment list (embedded record). */
+};
+
+typedef struct malloc_state *mstate;
+
+#define is_initialized(M)	((M)->top != 0)
+
+/* -------------------------- system alloc setup ------------------------- */
+
+/* page-align a size */
+#define page_align(S)\
+ (((S) + (LJ_PAGESIZE - SIZE_T_ONE)) & ~(LJ_PAGESIZE - SIZE_T_ONE))
+
+/* granularity-align a size */
+#define granularity_align(S)\
+  (((S) + (DEFAULT_GRANULARITY - SIZE_T_ONE))\
+   & ~(DEFAULT_GRANULARITY - SIZE_T_ONE))
+
+#ifdef LUA_USE_WIN
+#define mmap_align(S)	granularity_align(S)
+#else
+#define mmap_align(S)	page_align(S)
+#endif
+
+/*  True if segment S holds address A */
+#define segment_holds(S, A)\
+  ((char *)(A) >= S->base && (char *)(A) < S->base + S->size)
+
+/* Return the segment record whose [base, base+size) range contains addr,
+** or NULL if no segment of this allocator holds it.
+*/
+static msegmentptr segment_holding(mstate m, char *addr)
+{
+  msegmentptr seg = &m->seg;
+  while (seg != 0) {
+    if (addr >= seg->base && addr < seg->base + seg->size)
+      return seg;
+    seg = seg->next;
+  }
+  return 0;
+}
+
+/* Return 1 if any segment record of m is stored inside segment ss
+** (which would pin ss and prevent shrinking/unmapping it), else 0.
+*/
+static int has_segment_link(mstate m, msegmentptr ss)
+{
+  msegmentptr seg;
+  for (seg = &m->seg; seg != 0; seg = seg->next) {
+    if ((char *)seg >= ss->base && (char *)seg < ss->base + ss->size)
+      return 1;
+  }
+  return 0;
+}
+
+/*
+  TOP_FOOT_SIZE is padding at the end of a segment, including space
+  that may be needed to place segment records and fenceposts when new
+  noncontiguous segments are added.
+*/
+#define TOP_FOOT_SIZE\
+  (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE)
+
+/* ---------------------------- Indexing Bins ---------------------------- */
+
+#define is_small(s)		(((s) >> SMALLBIN_SHIFT) < NSMALLBINS)
+#define small_index(s)		((s)  >> SMALLBIN_SHIFT)
+#define small_index2size(i)	((i)  << SMALLBIN_SHIFT)
+#define MIN_SMALL_INDEX		(small_index(MIN_CHUNK_SIZE))
+
+/* addressing by index. See above about smallbin repositioning */
+#define smallbin_at(M, i)	((sbinptr)((char *)&((M)->smallbins[(i)<<1])))
+#define treebin_at(M,i)		(&((M)->treebins[i]))
+
+/* assign tree index for size S to variable I */
+#define compute_tree_index(S, I)\
+{\
+  unsigned int X = S >> TREEBIN_SHIFT;\
+  if (X == 0) {\
+    I = 0;\
+  } else if (X > 0xFFFF) {\
+    I = NTREEBINS-1;\
+  } else {\
+    unsigned int K = lj_fls(X);\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+/* Bit representing maximum resolved size in a treebin at i */
+#define bit_for_tree_index(i) \
+   (i == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2)
+
+/* Shift placing maximum resolved bit in a treebin at i as sign bit */
+#define leftshift_for_tree_index(i) \
+   ((i == NTREEBINS-1)? 0 : \
+    ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2)))
+
+/* The size of the smallest chunk held in bin with index i */
+#define minsize_for_tree_index(i) \
+   ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) |  \
+   (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1)))
+
+/* ------------------------ Operations on bin maps ----------------------- */
+
+/* bit corresponding to given index */
+#define idx2bit(i)		((binmap_t)(1) << (i))
+
+/* Mark/Clear bits with given index */
+#define mark_smallmap(M,i)	((M)->smallmap |=  idx2bit(i))
+#define clear_smallmap(M,i)	((M)->smallmap &= ~idx2bit(i))
+#define smallmap_is_marked(M,i)	((M)->smallmap &   idx2bit(i))
+
+#define mark_treemap(M,i)	((M)->treemap  |=  idx2bit(i))
+#define clear_treemap(M,i)	((M)->treemap  &= ~idx2bit(i))
+#define treemap_is_marked(M,i)	((M)->treemap  &   idx2bit(i))
+
+/* mask with all bits to left of least bit of x on */
+#define left_bits(x)		((x<<1) | (~(x<<1)+1))
+
+/* Set cinuse bit and pinuse bit of next chunk */
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\
+  ((mchunkptr)(((char *)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set cinuse and pinuse of this chunk and pinuse of next chunk */
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\
+  ((mchunkptr)(((char *)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set size, cinuse and pinuse bit of this chunk */
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = (s|PINUSE_BIT|CINUSE_BIT))
+
+/* ----------------------- Operations on smallbins ----------------------- */
+
+/* Link a free chunk into a smallbin  */
+#define insert_small_chunk(M, P, S) {\
+  bindex_t I = small_index(S);\
+  mchunkptr B = smallbin_at(M, I);\
+  mchunkptr F = B;\
+  if (!smallmap_is_marked(M, I))\
+    mark_smallmap(M, I);\
+  else\
+    F = B->fd;\
+  B->fd = P;\
+  F->bk = P;\
+  P->fd = F;\
+  P->bk = B;\
+}
+
+/* Unlink a chunk from a smallbin  */
+#define unlink_small_chunk(M, P, S) {\
+  mchunkptr F = P->fd;\
+  mchunkptr B = P->bk;\
+  bindex_t I = small_index(S);\
+  if (F == B) {\
+    clear_smallmap(M, I);\
+  } else {\
+    F->bk = B;\
+    B->fd = F;\
+  }\
+}
+
+/* Unlink the first chunk from a smallbin */
+#define unlink_first_small_chunk(M, B, P, I) {\
+  mchunkptr F = P->fd;\
+  if (B == F) {\
+    clear_smallmap(M, I);\
+  } else {\
+    B->fd = F;\
+    F->bk = B;\
+  }\
+}
+
+/* Replace dv node, binning the old one */
+/* Used only when dvsize known to be small */
+#define replace_dv(M, P, S) {\
+  size_t DVS = M->dvsize;\
+  if (DVS != 0) {\
+    mchunkptr DV = M->dv;\
+    insert_small_chunk(M, DV, DVS);\
+  }\
+  M->dvsize = S;\
+  M->dv = P;\
+}
+
+/* ------------------------- Operations on trees ------------------------- */
+
+/* Insert chunk into tree */
+#define insert_large_chunk(M, X, S) {\
+  tbinptr *H;\
+  bindex_t I;\
+  compute_tree_index(S, I);\
+  H = treebin_at(M, I);\
+  X->index = I;\
+  X->child[0] = X->child[1] = 0;\
+  if (!treemap_is_marked(M, I)) {\
+    mark_treemap(M, I);\
+    *H = X;\
+    X->parent = (tchunkptr)H;\
+    X->fd = X->bk = X;\
+  } else {\
+    tchunkptr T = *H;\
+    size_t K = S << leftshift_for_tree_index(I);\
+    for (;;) {\
+      if (chunksize(T) != S) {\
+	tchunkptr *C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
+	K <<= 1;\
+	if (*C != 0) {\
+	  T = *C;\
+	} else {\
+	  *C = X;\
+	  X->parent = T;\
+	  X->fd = X->bk = X;\
+	  break;\
+	}\
+      } else {\
+	tchunkptr F = T->fd;\
+	T->fd = F->bk = X;\
+	X->fd = F;\
+	X->bk = T;\
+	X->parent = 0;\
+	break;\
+      }\
+    }\
+  }\
+}
+
+#define unlink_large_chunk(M, X) {\
+  tchunkptr XP = X->parent;\
+  tchunkptr R;\
+  if (X->bk != X) {\
+    tchunkptr F = X->fd;\
+    R = X->bk;\
+    F->bk = R;\
+    R->fd = F;\
+  } else {\
+    tchunkptr *RP;\
+    if (((R = *(RP = &(X->child[1]))) != 0) ||\
+	((R = *(RP = &(X->child[0]))) != 0)) {\
+      tchunkptr *CP;\
+      while ((*(CP = &(R->child[1])) != 0) ||\
+	     (*(CP = &(R->child[0])) != 0)) {\
+	R = *(RP = CP);\
+      }\
+      *RP = 0;\
+    }\
+  }\
+  if (XP != 0) {\
+    tbinptr *H = treebin_at(M, X->index);\
+    if (X == *H) {\
+      if ((*H = R) == 0) \
+	clear_treemap(M, X->index);\
+    } else {\
+      if (XP->child[0] == X) \
+	XP->child[0] = R;\
+      else \
+	XP->child[1] = R;\
+    }\
+    if (R != 0) {\
+      tchunkptr C0, C1;\
+      R->parent = XP;\
+      if ((C0 = X->child[0]) != 0) {\
+	R->child[0] = C0;\
+	C0->parent = R;\
+      }\
+      if ((C1 = X->child[1]) != 0) {\
+	R->child[1] = C1;\
+	C1->parent = R;\
+      }\
+    }\
+  }\
+}
+
+/* Relays to large vs small bin operations */
+
+#define insert_chunk(M, P, S)\
+  if (is_small(S)) { insert_small_chunk(M, P, S)\
+  } else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
+
+#define unlink_chunk(M, P, S)\
+  if (is_small(S)) { unlink_small_chunk(M, P, S)\
+  } else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
+
+/* -----------------------  Direct-mmapping chunks ----------------------- */
+
+/* Allocate a large request (nb payload bytes) in its own private mapping. */
+static void *direct_alloc(size_t nb)
+{
+  size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  if (LJ_LIKELY(mmsize > nb)) {     /* Check for wrap around 0 */
+    char *mm = (char *)(DIRECT_MMAP(mmsize));
+    if (mm != CMFAIL) {
+      size_t offset = align_offset(chunk2mem(mm));
+      size_t psize = mmsize - offset - DIRECT_FOOT_PAD;
+      mchunkptr p = (mchunkptr)(mm + offset);
+      /* prev_foot holds the alignment offset, tagged as a direct chunk. */
+      p->prev_foot = offset | IS_DIRECT_BIT;
+      p->head = psize|CINUSE_BIT;
+      /* Fake fencepost chunks terminate the mapped region. */
+      chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD;
+      chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0;
+      return chunk2mem(p);
+    }
+  }
+  return NULL;
+}
+
+/* Resize a direct-mmapped chunk to hold nb payload bytes.
+** Returns the (possibly moved) chunk, or NULL if the caller must
+** fall back to allocate-copy-free.
+*/
+static mchunkptr direct_resize(mchunkptr oldp, size_t nb)
+{
+  size_t oldsize = chunksize(oldp);
+  if (is_small(nb)) /* Can't shrink direct regions below small size */
+    return NULL;
+  /* Keep old chunk if big enough but not too big */
+  if (oldsize >= nb + SIZE_T_SIZE &&
+      (oldsize - nb) <= (DEFAULT_GRANULARITY << 1)) {
+    return oldp;
+  } else {
+    size_t offset = oldp->prev_foot & ~IS_DIRECT_BIT;
+    size_t oldmmsize = oldsize + offset + DIRECT_FOOT_PAD;
+    size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+    /* CALL_MREMAP expands to MFAIL on platforms without mremap(). */
+    char *cp = (char *)CALL_MREMAP((char *)oldp - offset,
+				   oldmmsize, newmmsize, 1);
+    if (cp != CMFAIL) {
+      mchunkptr newp = (mchunkptr)(cp + offset);
+      size_t psize = newmmsize - offset - DIRECT_FOOT_PAD;
+      newp->head = psize|CINUSE_BIT;
+      /* Re-create the trailing fenceposts for the new size. */
+      chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD;
+      chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0;
+      return newp;
+    }
+  }
+  return NULL;
+}
+
+/* -------------------------- mspace management -------------------------- */
+
+/* Initialize top chunk and its size */
+static void init_top(mstate m, mchunkptr p, size_t psize)
+{
+  /* Ensure alignment */
+  size_t offset = align_offset(chunk2mem(p));
+  p = (mchunkptr)((char *)p + offset);
+  psize -= offset;
+
+  m->top = p;
+  m->topsize = psize;
+  /* Top is always free and its predecessor always in use. */
+  p->head = psize | PINUSE_BIT;
+  /* set size of fake trailing chunk holding overhead space only once */
+  chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE;
+  m->trim_check = DEFAULT_TRIM_THRESHOLD; /* reset on each update */
+}
+
+/* Initialize bins for a new mstate that is otherwise zeroed out. */
+static void init_bins(mstate m)
+{
+  /* Make every smallbin header point at itself (empty circular list). */
+  bindex_t idx = 0;
+  while (idx < NSMALLBINS) {
+    sbinptr b = smallbin_at(m, idx);
+    b->bk = b;
+    b->fd = b;
+    idx++;
+  }
+}
+
+/* Allocate chunk and prepend remainder with chunk in successor base. */
+static void *prepend_alloc(mstate m, char *newbase, char *oldbase, size_t nb)
+{
+  mchunkptr p = align_as_chunk(newbase);
+  mchunkptr oldfirst = align_as_chunk(oldbase);
+  size_t psize = (size_t)((char *)oldfirst - (char *)p);
+  mchunkptr q = chunk_plus_offset(p, nb);  /* Remainder after the alloc. */
+  size_t qsize = psize - nb;
+  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+
+  /* consolidate remainder with first chunk of old base */
+  if (oldfirst == m->top) {
+    /* Remainder merges into top; top grows and moves back to q. */
+    size_t tsize = m->topsize += qsize;
+    m->top = q;
+    q->head = tsize | PINUSE_BIT;
+  } else if (oldfirst == m->dv) {
+    /* Remainder merges into the designated victim. */
+    size_t dsize = m->dvsize += qsize;
+    m->dv = q;
+    set_size_and_pinuse_of_free_chunk(q, dsize);
+  } else {
+    if (!cinuse(oldfirst)) {
+      /* Old first chunk is free: unlink it and absorb it into q. */
+      size_t nsize = chunksize(oldfirst);
+      unlink_chunk(m, oldfirst, nsize);
+      oldfirst = chunk_plus_offset(oldfirst, nsize);
+      qsize += nsize;
+    }
+    set_free_with_pinuse(q, qsize, oldfirst);
+    insert_chunk(m, q, qsize);
+  }
+
+  return chunk2mem(p);
+}
+
+/* Add a segment to hold a new noncontiguous region */
+static void add_segment(mstate m, char *tbase, size_t tsize)
+{
+  /* Determine locations and sizes of segment, fenceposts, old top */
+  char *old_top = (char *)m->top;
+  msegmentptr oldsp = segment_holding(m, old_top);
+  char *old_end = oldsp->base + oldsp->size;
+  size_t ssize = pad_request(sizeof(struct malloc_segment));
+  /* Place the new segment record near the end of the old segment. */
+  char *rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  size_t offset = align_offset(chunk2mem(rawsp));
+  char *asp = rawsp + offset;
+  /* If the record would overlap old top, put it at old top instead. */
+  char *csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp;
+  mchunkptr sp = (mchunkptr)csp;
+  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
+  mchunkptr tnext = chunk_plus_offset(sp, ssize);
+  mchunkptr p = tnext;
+
+  /* reset top to new space */
+  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+
+  /* Set up segment record */
+  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
+  *ss = m->seg; /* Push current record */
+  m->seg.base = tbase;
+  m->seg.size = tsize;
+  m->seg.next = ss;
+
+  /* Insert trailing fenceposts */
+  for (;;) {
+    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
+    p->head = FENCEPOST_HEAD;
+    if ((char *)(&(nextp->head)) < old_end)
+      p = nextp;
+    else
+      break;
+  }
+
+  /* Insert the rest of old top into a bin as an ordinary free chunk */
+  if (csp != old_top) {
+    mchunkptr q = (mchunkptr)old_top;
+    size_t psize = (size_t)(csp - old_top);
+    mchunkptr tn = chunk_plus_offset(q, psize);
+    set_free_with_pinuse(q, psize, tn);
+    insert_chunk(m, q, psize);
+  }
+}
+
+/* -------------------------- System allocation -------------------------- */
+
+/* Get memory from the system to satisfy a request of nb bytes.
+** Tries direct mmap for large requests, otherwise maps a new region and
+** merges, prepends or adds it as a segment before carving from top.
+*/
+static void *alloc_sys(mstate m, size_t nb)
+{
+  char *tbase = CMFAIL;
+  size_t tsize = 0;
+
+  /* Directly map large chunks */
+  if (LJ_UNLIKELY(nb >= DEFAULT_MMAP_THRESHOLD)) {
+    void *mem = direct_alloc(nb);
+    if (mem != 0)
+      return mem;
+  }
+
+  {
+    size_t req = nb + TOP_FOOT_SIZE + SIZE_T_ONE;
+    size_t rsize = granularity_align(req);
+    if (LJ_LIKELY(rsize > nb)) { /* Fail if wraps around zero */
+      char *mp = (char *)(CALL_MMAP(rsize));
+      if (mp != CMFAIL) {
+	tbase = mp;
+	tsize = rsize;
+      }
+    }
+  }
+
+  if (tbase != CMFAIL) {
+    msegmentptr sp = &m->seg;
+    /* Try to merge with an existing segment */
+    while (sp != 0 && tbase != sp->base + sp->size)
+      sp = sp->next;
+    if (sp != 0 && segment_holds(sp, m->top)) { /* append */
+      sp->size += tsize;
+      init_top(m, m->top, m->topsize + tsize);
+    } else {
+      /* Check if the new region directly precedes an existing segment. */
+      sp = &m->seg;
+      while (sp != 0 && sp->base != tbase + tsize)
+	sp = sp->next;
+      if (sp != 0) {
+	char *oldbase = sp->base;
+	sp->base = tbase;
+	sp->size += tsize;
+	return prepend_alloc(m, tbase, oldbase, nb);
+      } else {
+	add_segment(m, tbase, tsize);
+      }
+    }
+
+    if (nb < m->topsize) { /* Allocate from new or extended top space */
+      size_t rsize = m->topsize -= nb;
+      mchunkptr p = m->top;
+      mchunkptr r = m->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+      return chunk2mem(p);
+    }
+  }
+
+  return NULL;
+}
+
+/* -----------------------  system deallocation -------------------------- */
+
+/* Unmap and unlink any mmapped segments that don't contain used chunks */
+static size_t release_unused_segments(mstate m)
+{
+  size_t released = 0;
+  size_t nsegs = 0;
+  msegmentptr pred = &m->seg;
+  msegmentptr sp = pred->next;
+  while (sp != 0) {
+    char *base = sp->base;
+    size_t size = sp->size;
+    msegmentptr next = sp->next;
+    nsegs++;
+    {
+      mchunkptr p = align_as_chunk(base);
+      size_t psize = chunksize(p);
+      /* Can unmap if first chunk holds entire segment and not pinned */
+      if (!cinuse(p) && (char *)p + psize >= base + size - TOP_FOOT_SIZE) {
+	tchunkptr tp = (tchunkptr)p;
+	/* Such a chunk is large, so it is either the dv or tree-binned. */
+	if (p == m->dv) {
+	  m->dv = 0;
+	  m->dvsize = 0;
+	} else {
+	  unlink_large_chunk(m, tp);
+	}
+	if (CALL_MUNMAP(base, size) == 0) {
+	  released += size;
+	  /* unlink obsoleted record */
+	  sp = pred;
+	  sp->next = next;
+	} else { /* back out if cannot unmap */
+	  insert_large_chunk(m, tp, psize);
+	}
+      }
+    }
+    pred = sp;
+    sp = next;
+  }
+  /* Reset check counter */
+  m->release_checks = nsegs > MAX_RELEASE_CHECK_RATE ?
+		      nsegs : MAX_RELEASE_CHECK_RATE;
+  return released;
+}
+
+/* Give back unused memory to the system, keeping at least pad bytes of
+** headroom in top. Returns 1 if any memory was released, else 0.
+*/
+static int alloc_trim(mstate m, size_t pad)
+{
+  size_t released = 0;
+  if (pad < MAX_REQUEST && is_initialized(m)) {
+    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
+
+    if (m->topsize > pad) {
+      /* Shrink top space in granularity-size units, keeping at least one */
+      size_t unit = DEFAULT_GRANULARITY;
+      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
+		      SIZE_T_ONE) * unit;
+      msegmentptr sp = segment_holding(m, (char *)m->top);
+
+      if (sp->size >= extra &&
+	  !has_segment_link(m, sp)) { /* can't shrink if pinned */
+	size_t newsize = sp->size - extra;
+	/* Prefer mremap, fall back to munmap */
+	if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
+	    (CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
+	  released = extra;
+	}
+      }
+
+      if (released != 0) {
+	sp->size -= released;
+	init_top(m, m->top, m->topsize - released);
+      }
+    }
+
+    /* Unmap any unused mmapped segments */
+    released += release_unused_segments(m);
+
+    /* On failure, disable autotrim to avoid repeated failed future calls */
+    if (released == 0 && m->topsize > m->trim_check)
+      m->trim_check = MAX_SIZE_T;
+  }
+
+  return (released != 0)? 1 : 0;
+}
+
+/* ---------------------------- malloc support --------------------------- */
+
+/* allocate a large request from the best fitting chunk in a treebin */
+static void *tmalloc_large(mstate m, size_t nb)
+{
+  tchunkptr v = 0;           /* Best-fit candidate found so far. */
+  size_t rsize = ~nb+1;      /* Unsigned negation: max possible remainder. */
+  tchunkptr t;
+  bindex_t idx;
+  compute_tree_index(nb, idx);
+
+  if ((t = *treebin_at(m, idx)) != 0) {
+    /* Traverse tree for this bin looking for node with size == nb */
+    size_t sizebits = nb << leftshift_for_tree_index(idx);
+    tchunkptr rst = 0;  /* The deepest untaken right subtree */
+    for (;;) {
+      tchunkptr rt;
+      size_t trem = chunksize(t) - nb;
+      if (trem < rsize) {
+	v = t;
+	if ((rsize = trem) == 0)
+	  break;
+      }
+      rt = t->child[1];
+      /* Descend left/right by the next size bit (MSB first). */
+      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+      if (rt != 0 && rt != t)
+	rst = rt;
+      if (t == 0) {
+	t = rst; /* set t to least subtree holding sizes > nb */
+	break;
+      }
+      sizebits <<= 1;
+    }
+  }
+
+  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
+    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
+    if (leftbits != 0)
+      t = *treebin_at(m, lj_ffs(leftbits));
+  }
+
+  while (t != 0) { /* find smallest of tree or subtree */
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+    t = leftmost_child(t);
+  }
+
+  /*  If dv is a better fit, return NULL so malloc will use it */
+  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
+    mchunkptr r = chunk_plus_offset(v, nb);
+    unlink_large_chunk(m, v);
+    if (rsize < MIN_CHUNK_SIZE) {
+      /* Remainder too small to split off: give the whole chunk away. */
+      set_inuse_and_pinuse(m, v, (rsize + nb));
+    } else {
+      set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+      set_size_and_pinuse_of_free_chunk(r, rsize);
+      insert_chunk(m, r, rsize);
+    }
+    return chunk2mem(v);
+  }
+  return NULL;
+}
+
+/* allocate a small request from the best fitting chunk in a treebin */
+static void *tmalloc_small(mstate m, size_t nb)
+{
+  tchunkptr t, v;
+  mchunkptr r;
+  size_t rsize;
+  /* Caller guarantees treemap != 0; pick the smallest non-empty treebin. */
+  bindex_t i = lj_ffs(m->treemap);
+
+  v = t = *treebin_at(m, i);
+  rsize = chunksize(t) - nb;
+
+  /* Walk down the leftmost path to find the smallest fitting chunk. */
+  while ((t = leftmost_child(t)) != 0) {
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+  }
+
+  r = chunk_plus_offset(v, nb);
+  unlink_large_chunk(m, v);
+  if (rsize < MIN_CHUNK_SIZE) {
+    /* Remainder too small to split off: give the whole chunk away. */
+    set_inuse_and_pinuse(m, v, (rsize + nb));
+  } else {
+    set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+    set_size_and_pinuse_of_free_chunk(r, rsize);
+    replace_dv(m, r, rsize);  /* Remainder becomes the new dv. */
+  }
+  return chunk2mem(v);
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* Create a new allocator instance. The malloc_state itself is embedded
+** at the start of the first mapped segment. Returns NULL on OOM.
+*/
+void *lj_alloc_create(void)
+{
+  size_t tsize = DEFAULT_GRANULARITY;
+  char *tbase = (char *)(CALL_MMAP(tsize));
+  if (tbase != CMFAIL) {
+    size_t msize = pad_request(sizeof(struct malloc_state));
+    mchunkptr mn;
+    mchunkptr msp = align_as_chunk(tbase);
+    mstate m = (mstate)(chunk2mem(msp));
+    memset(m, 0, msize);
+    /* The state occupies the first in-use chunk of the segment. */
+    msp->head = (msize|PINUSE_BIT|CINUSE_BIT);
+    m->seg.base = tbase;
+    m->seg.size = tsize;
+    m->release_checks = MAX_RELEASE_CHECK_RATE;
+    init_bins(m);
+    mn = next_chunk(mem2chunk(m));
+    /* Everything after the state chunk becomes the initial top. */
+    init_top(m, mn, (size_t)((tbase + tsize) - (char *)mn) - TOP_FOOT_SIZE);
+    return m;
+  }
+  return NULL;
+}
+
+/* Destroy an allocator instance: unmap every segment it owns. */
+void lj_alloc_destroy(void *msp)
+{
+  msegmentptr seg = &((mstate)msp)->seg;
+  /* The state (and segment records) live inside the segments being
+  ** unmapped, so capture base/size and advance before each munmap.
+  */
+  while (seg != 0) {
+    char *b = seg->base;
+    size_t sz = seg->size;
+    seg = seg->next;
+    CALL_MUNMAP(b, sz);
+  }
+}
+
+/* Allocate nsize bytes from allocator msp. Returns NULL on failure.
+** Strategy: exact/near smallbin fit, then dv, then treebins, then
+** split top, finally the system (alloc_sys).
+*/
+static LJ_NOINLINE void *lj_alloc_malloc(void *msp, size_t nsize)
+{
+  mstate ms = (mstate)msp;
+  void *mem;
+  size_t nb;
+  if (nsize <= MAX_SMALL_REQUEST) {
+    bindex_t idx;
+    binmap_t smallbits;
+    nb = (nsize < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(nsize);
+    idx = small_index(nb);
+    smallbits = ms->smallmap >> idx;
+
+    if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+      mchunkptr b, p;
+      idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+      b = smallbin_at(ms, idx);
+      p = b->fd;
+      unlink_first_small_chunk(ms, b, p, idx);
+      set_inuse_and_pinuse(ms, p, small_index2size(idx));
+      mem = chunk2mem(p);
+      return mem;
+    } else if (nb > ms->dvsize) {
+      if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+	mchunkptr b, p, r;
+	size_t rsize;
+	binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+	bindex_t i = lj_ffs(leftbits);
+	b = smallbin_at(ms, i);
+	p = b->fd;
+	unlink_first_small_chunk(ms, b, p, i);
+	rsize = small_index2size(i) - nb;
+	/* Fit here cannot be remainderless if 4byte sizes */
+	if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) {
+	  set_inuse_and_pinuse(ms, p, small_index2size(i));
+	} else {
+	  set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+	  r = chunk_plus_offset(p, nb);
+	  set_size_and_pinuse_of_free_chunk(r, rsize);
+	  replace_dv(ms, r, rsize);
+	}
+	mem = chunk2mem(p);
+	return mem;
+      } else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) {
+	return mem;
+      }
+    }
+  } else if (nsize >= MAX_REQUEST) {
+    nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+  } else {
+    nb = pad_request(nsize);
+    if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
+      return mem;
+    }
+  }
+
+  /* Fall through: try the designated victim, then top. */
+  if (nb <= ms->dvsize) {
+    size_t rsize = ms->dvsize - nb;
+    mchunkptr p = ms->dv;
+    if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+      mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
+      ms->dvsize = rsize;
+      set_size_and_pinuse_of_free_chunk(r, rsize);
+      set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+    } else { /* exhaust dv */
+      size_t dvs = ms->dvsize;
+      ms->dvsize = 0;
+      ms->dv = 0;
+      set_inuse_and_pinuse(ms, p, dvs);
+    }
+    mem = chunk2mem(p);
+    return mem;
+  } else if (nb < ms->topsize) { /* Split top */
+    size_t rsize = ms->topsize -= nb;
+    mchunkptr p = ms->top;
+    mchunkptr r = ms->top = chunk_plus_offset(p, nb);
+    r->head = rsize | PINUSE_BIT;
+    set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+    mem = chunk2mem(p);
+    return mem;
+  }
+  return alloc_sys(ms, nb);  /* Ask the OS for more memory. */
+}
+
+/* Free a block previously returned by lj_alloc_malloc/realloc.
+** Coalesces with free neighbors, special-cases direct (mmap'd) chunks,
+** the dv chunk and the top chunk, and occasionally releases unused
+** segments. Always returns NULL (so it can serve as the nsize==0 path
+** of lj_alloc_f). ptr == NULL is a no-op.
+*/
+static LJ_NOINLINE void *lj_alloc_free(void *msp, void *ptr)
+{
+  if (ptr != 0) {
+    mchunkptr p = mem2chunk(ptr);
+    mstate fm = (mstate)msp;
+    size_t psize = chunksize(p);
+    mchunkptr next = chunk_plus_offset(p, psize);
+    if (!pinuse(p)) {
+      size_t prevsize = p->prev_foot;
+      if ((prevsize & IS_DIRECT_BIT) != 0) {
+	/* Directly mmap'd chunk: just unmap it, nothing to coalesce. */
+	prevsize &= ~IS_DIRECT_BIT;
+	psize += prevsize + DIRECT_FOOT_PAD;
+	CALL_MUNMAP((char *)p - prevsize, psize);
+	return NULL;
+      } else {
+	mchunkptr prev = chunk_minus_offset(p, prevsize);
+	psize += prevsize;
+	p = prev;
+	/* consolidate backward */
+	if (p != fm->dv) {
+	  unlink_chunk(fm, p, prevsize);
+	} else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+	  /* Merged into dv and next is in use: just grow dv and stop. */
+	  fm->dvsize = psize;
+	  set_free_with_pinuse(p, psize, next);
+	  return NULL;
+	}
+      }
+    }
+    if (!cinuse(next)) {  /* consolidate forward */
+      if (next == fm->top) {
+	size_t tsize = fm->topsize += psize;
+	fm->top = p;
+	p->head = tsize | PINUSE_BIT;
+	if (p == fm->dv) {
+	  fm->dv = 0;
+	  fm->dvsize = 0;
+	}
+	/* Top grew large: consider giving memory back to the system. */
+	if (tsize > fm->trim_check)
+	  alloc_trim(fm, 0);
+	return NULL;
+      } else if (next == fm->dv) {
+	size_t dsize = fm->dvsize += psize;
+	fm->dv = p;
+	set_size_and_pinuse_of_free_chunk(p, dsize);
+	return NULL;
+      } else {
+	size_t nsize = chunksize(next);
+	psize += nsize;
+	unlink_chunk(fm, next, nsize);
+	set_size_and_pinuse_of_free_chunk(p, psize);
+	if (p == fm->dv) {
+	  fm->dvsize = psize;
+	  return NULL;
+	}
+      }
+    } else {
+      set_free_with_pinuse(p, psize, next);
+    }
+
+    /* Insert the coalesced chunk into the matching bin. */
+    if (is_small(psize)) {
+      insert_small_chunk(fm, p, psize);
+    } else {
+      tchunkptr tp = (tchunkptr)p;
+      insert_large_chunk(fm, tp, psize);
+      /* Amortized segment release: only every MAX_RELEASE_CHECK_RATE frees. */
+      if (--fm->release_checks == 0)
+	release_unused_segments(fm);
+    }
+  }
+  return NULL;
+}
+
+/* Resize an existing block. Tries in-place shrink, direct-chunk resize or
+** expansion into the adjacent top chunk; otherwise falls back to
+** malloc + memcpy + free. Returns the new pointer or NULL on failure
+** (in which case the original block is left untouched).
+*/
+static LJ_NOINLINE void *lj_alloc_realloc(void *msp, void *ptr, size_t nsize)
+{
+  if (nsize >= MAX_REQUEST) {
+    return NULL;
+  } else {
+    mstate m = (mstate)msp;
+    mchunkptr oldp = mem2chunk(ptr);
+    size_t oldsize = chunksize(oldp);
+    mchunkptr next = chunk_plus_offset(oldp, oldsize);
+    mchunkptr newp = 0;
+    size_t nb = request2size(nsize);
+
+    /* Try to either shrink or extend into top. Else malloc-copy-free */
+    if (is_direct(oldp)) {
+      newp = direct_resize(oldp, nb);  /* this may return NULL. */
+    } else if (oldsize >= nb) { /* already big enough */
+      size_t rsize = oldsize - nb;
+      newp = oldp;
+      if (rsize >= MIN_CHUNK_SIZE) {
+	/* Shrink in place and return the tail as a free chunk. */
+	mchunkptr remainder = chunk_plus_offset(newp, nb);
+	set_inuse(m, newp, nb);
+	set_inuse(m, remainder, rsize);
+	lj_alloc_free(m, chunk2mem(remainder));
+      }
+    } else if (next == m->top && oldsize + m->topsize > nb) {
+      /* Expand into top */
+      size_t newsize = oldsize + m->topsize;
+      size_t newtopsize = newsize - nb;
+      mchunkptr newtop = chunk_plus_offset(oldp, nb);
+      set_inuse(m, oldp, nb);
+      newtop->head = newtopsize |PINUSE_BIT;
+      m->top = newtop;
+      m->topsize = newtopsize;
+      newp = oldp;
+    }
+
+    if (newp != 0) {
+      return chunk2mem(newp);
+    } else {
+      void *newmem = lj_alloc_malloc(m, nsize);
+      if (newmem != 0) {
+	/* Copy min(old payload, new size); old chunk freed only on success. */
+	size_t oc = oldsize - overhead_for(oldp);
+	memcpy(newmem, ptr, oc < nsize ? oc : nsize);
+	lj_alloc_free(m, ptr);
+      }
+      return newmem;
+    }
+  }
+}
+
+/* lua_Alloc-compatible entry point: dispatches to free/malloc/realloc
+** based on the (ptr, nsize) combination, per the Lua allocator contract.
+** osize is unused because chunk headers already record the old size.
+*/
+void *lj_alloc_f(void *msp, void *ptr, size_t osize, size_t nsize)
+{
+  (void)osize;
+  if (nsize == 0) {
+    return lj_alloc_free(msp, ptr);
+  } else if (ptr == NULL) {
+    return lj_alloc_malloc(msp, nsize);
+  } else {
+    return lj_alloc_realloc(msp, ptr, nsize);
+  }
+}
+
+#endif

+ 17 - 0
src/lj_alloc.h

@@ -0,0 +1,17 @@
+/*
+** Bundled memory allocator.
+** Donated to the public domain.
+*/
+
+#ifndef _LJ_ALLOC_H
+#define _LJ_ALLOC_H
+
+#include "lj_def.h"
+
+/* Only exposed when the bundled allocator is in use (not system malloc). */
+#ifndef LUAJIT_USE_SYSMALLOC
+/* Create/destroy an allocator instance (opaque mstate pointer). */
+LJ_FUNC void *lj_alloc_create(void);
+LJ_FUNC void lj_alloc_destroy(void *msp);
+/* lua_Alloc-compatible allocation function bound to an mstate. */
+LJ_FUNC void *lj_alloc_f(void *msp, void *ptr, size_t osize, size_t nsize);
+#endif
+
+#endif

+ 1046 - 0
src/lj_api.c

@@ -0,0 +1,1046 @@
+/*
+** Public Lua/C API.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_api_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_func.h"
+#include "lj_udata.h"
+#include "lj_meta.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_trace.h"
+#include "lj_vm.h"
+#include "lj_lex.h"
+#include "lj_parse.h"
+
+/* -- Common helper functions --------------------------------------------- */
+
+/* Assert that at least n values sit on the stack above the current base. */
+#define api_checknelems(L, n)	api_check(L, (n) <= (L->top - L->base))
+/* Assert that an index resolved to a real slot, not the shared nil TValue. */
+#define api_checkvalidindex(L, i)	api_check(L, (i) != niltv(L))
+
+/* Convert a public API index (positive, negative or pseudo-index) to a
+** TValue address. Out-of-range positive indices yield niltv(L).
+** Pseudo-indices for globals/environment are materialized in g->tmptv.
+*/
+static TValue *index2adr(lua_State *L, int idx)
+{
+  if (idx > 0) {
+    TValue *o = L->base + (idx - 1);
+    return o < L->top ? o : niltv(L);
+  } else if (idx > LUA_REGISTRYINDEX) {
+    api_check(L, idx != 0 && -idx <= L->top - L->base);
+    return L->top + idx;
+  } else if (idx == LUA_GLOBALSINDEX) {
+    TValue *o = &G(L)->tmptv;
+    settabV(L, o, tabref(L->env));
+    return o;
+  } else if (idx == LUA_REGISTRYINDEX) {
+    return registry(L);
+  } else {
+    /* Environment or upvalue pseudo-index: only valid for C functions. */
+    GCfunc *fn = curr_func(L);
+    api_check(L, fn->c.gct == ~LJ_TFUNC && !isluafunc(fn));
+    if (idx == LUA_ENVIRONINDEX) {
+      TValue *o = &G(L)->tmptv;
+      settabV(L, o, tabref(fn->c.env));
+      return o;
+    } else {
+      idx = LUA_GLOBALSINDEX - idx;
+      return idx <= fn->c.nupvalues ? &fn->c.upvalue[idx-1] : niltv(L);
+    }
+  }
+}
+
+/* Like index2adr, but only accepts real stack indices (no pseudo-indices). */
+static TValue *stkindex2adr(lua_State *L, int idx)
+{
+  if (idx > 0) {
+    TValue *o = L->base + (idx - 1);
+    return o < L->top ? o : niltv(L);
+  } else {
+    api_check(L, idx != 0 && -idx <= L->top - L->base);
+    return L->top + idx;
+  }
+}
+
+/* Environment of the running C function, or the thread environment. */
+static GCtab *getcurrenv(lua_State *L)
+{
+  GCfunc *fn = curr_func(L);
+  return fn->c.gct == ~LJ_TFUNC ? tabref(fn->c.env) : tabref(L->env);
+}
+
+/* -- Miscellaneous API functions ----------------------------------------- */
+
+/* Return the thread status (0, LUA_YIELD or an error code). */
+LUA_API int lua_status(lua_State *L)
+{
+  return L->status;
+}
+
+/* Ensure room for size extra stack slots. Returns 0 on overflow. */
+LUA_API int lua_checkstack(lua_State *L, int size)
+{
+  if (size > LUAI_MAXCSTACK || (L->top - L->base + size) > LUAI_MAXCSTACK) {
+    return 0;  /* Stack overflow. */
+  } else if (size > 0) {
+    lj_state_checkstack(L, (MSize)size);
+  }
+  return 1;
+}
+
+/* Move the top n values from one thread's stack to another's.
+** Both threads must belong to the same global state.
+*/
+LUA_API void lua_xmove(lua_State *from, lua_State *to, int n)
+{
+  TValue *f, *t;
+  if (from == to) return;
+  api_checknelems(from, n);
+  api_check(from, G(from) == G(to));
+  lj_state_checkstack(to, (MSize)n);
+  f = from->top;
+  t = to->top = to->top + n;
+  while (--n >= 0) copyTV(to, --t, --f);
+  from->top = f;
+}
+
+/* -- Stack manipulation -------------------------------------------------- */
+
+/* Number of values on the stack. */
+LUA_API int lua_gettop(lua_State *L)
+{
+  return cast_int(L->top - L->base);
+}
+
+/* Set the stack top: grow with nils or shrink (negative idx counts back). */
+LUA_API void lua_settop(lua_State *L, int idx)
+{
+  if (idx >= 0) {
+    api_check(L, idx <= L->maxstack - L->base);
+    if (L->base + idx > L->top) {
+      if (L->base + idx >= L->maxstack)
+	lj_state_growstack(L, (MSize)idx - (MSize)(L->top - L->base));
+      do { setnilV(L->top++); } while (L->top < L->base + idx);
+    } else {
+      L->top = L->base + idx;
+    }
+  } else {
+    api_check(L, -(idx+1) <= (L->top - L->base));
+    L->top += idx+1;  /* Shrinks top (idx < 0). */
+  }
+}
+
+/* Remove the value at idx, shifting the values above it down. */
+LUA_API void lua_remove(lua_State *L, int idx)
+{
+  TValue *p = stkindex2adr(L, idx);
+  api_checkvalidindex(L, p);
+  while (++p < L->top) copyTV(L, p-1, p);
+  L->top--;
+}
+
+/* Insert the top value at idx, shifting the values above it up. */
+LUA_API void lua_insert(lua_State *L, int idx)
+{
+  TValue *q, *p = stkindex2adr(L, idx);
+  api_checkvalidindex(L, p);
+  for (q = L->top; q > p; q--) copyTV(L, q, q-1);
+  copyTV(L, p, L->top);
+}
+
+/* Pop the top value and store it at idx (handles pseudo-indices too). */
+LUA_API void lua_replace(lua_State *L, int idx)
+{
+  api_checknelems(L, 1);
+  if (idx == LUA_GLOBALSINDEX) {
+    api_check(L, tvistab(L->top-1));
+    /* NOBARRIER: A thread (i.e. L) is never black. */
+    setgcref(L->env, obj2gco(tabV(L->top-1)));
+  } else if (idx == LUA_ENVIRONINDEX) {
+    GCfunc *fn = curr_func(L);
+    if (fn->c.gct != ~LJ_TFUNC)
+      lj_err_msg(L, LJ_ERR_NOENV);
+    api_check(L, tvistab(L->top-1));
+    setgcref(fn->c.env, obj2gco(tabV(L->top-1)));
+    lj_gc_barrier(L, fn, L->top-1);
+  } else {
+    TValue *o = index2adr(L, idx);
+    api_checkvalidindex(L, o);
+    copyTV(L, o, L->top-1);
+    if (idx < LUA_GLOBALSINDEX)  /* Need a barrier for upvalues. */
+      lj_gc_barrier(L, curr_func(L), L->top-1);
+  }
+  L->top--;
+}
+
+/* Push a copy of the value at idx. */
+LUA_API void lua_pushvalue(lua_State *L, int idx)
+{
+  copyTV(L, L->top, index2adr(L, idx));
+  incr_top(L);
+}
+
+/* -- Stack getters ------------------------------------------------------- */
+
+/* Map a stack slot to its public LUA_T* type constant. */
+LUA_API int lua_type(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  if (tvisnum(o)) {
+    return LUA_TNUMBER;
+#if LJ_64
+  } else if (tvislightud(o)) {
+    return LUA_TLIGHTUSERDATA;
+#endif
+  } else if (o == niltv(L)) {
+    return LUA_TNONE;
+  } else {  /* Magic internal/external tag conversion. ORDER LJ_T */
+    /* Packed lookup table: 4 bits per internal tag select the public type. */
+    int t = ~itype(o);
+    return (int)(((t < 8 ? 0x98a42110 : 0x75b6) >> 4*(t&7)) & 15u);
+  }
+}
+
+/* Name of a public type constant (t may be LUA_TNONE == -1, hence +1). */
+LUA_API const char *lua_typename(lua_State *L, int t)
+{
+  UNUSED(L);
+  return lj_obj_typename[t+1];
+}
+
+/* True if the value at idx is a C (non-Lua) function. */
+LUA_API int lua_iscfunction(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  return !isluafunc(funcV(o));
+}
+
+/* True if the value is a number or a string convertible to one. */
+LUA_API int lua_isnumber(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  TValue tmp;
+  return (tvisnum(o) || (tvisstr(o) && lj_str_numconv(strVdata(o), &tmp)));
+}
+
+/* True if the value is a string or a number (numbers convert to strings). */
+LUA_API int lua_isstring(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  return (tvisstr(o) || tvisnum(o));
+}
+
+/* True for both full and light userdata. */
+LUA_API int lua_isuserdata(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  return (tvisudata(o) || tvislightud(o));
+}
+
+/* Raw (metamethod-free) equality; invalid indices compare unequal. */
+LUA_API int lua_rawequal(lua_State *L, int idx1, int idx2)
+{
+  cTValue *o1 = index2adr(L, idx1);
+  cTValue *o2 = index2adr(L, idx2);
+  return (o1 == niltv(L) || o2 == niltv(L)) ? 0 : lj_obj_equal(o1, o2);
+}
+
+/* Full equality, invoking the __eq metamethod for tables/userdata. */
+LUA_API int lua_equal(lua_State *L, int idx1, int idx2)
+{
+  cTValue *o1 = index2adr(L, idx1);
+  cTValue *o2 = index2adr(L, idx2);
+  if (tvisnum(o1) && tvisnum(o2)) {
+    return numV(o1) == numV(o2);
+  } else if (itype(o1) != itype(o2)) {
+    return 0;
+  } else if (tvispri(o1)) {
+    return o1 != niltv(L) && o2 != niltv(L);
+#if LJ_64
+  } else if (tvislightud(o1)) {
+    return o1->u64 == o2->u64;
+#endif
+  } else if (gcrefeq(o1->gcr, o2->gcr)) {
+    return 1;
+  } else if (!tvistabud(o1)) {
+    return 0;
+  } else {
+    /* lj_meta_equal returns 0/1 directly or a call base for __eq. */
+    TValue *base = lj_meta_equal(L, gcV(o1), gcV(o2), 0);
+    if ((uintptr_t)base <= 1) {
+      return (int)(uintptr_t)base;
+    } else {
+      L->top = base+2;
+      lj_vm_call(L, base, 1+1);
+      L->top -= 2;
+      return tvistruecond(L->top+1);
+    }
+  }
+}
+
+/* Less-than comparison, invoking __lt for non-number operands. */
+LUA_API int lua_lessthan(lua_State *L, int idx1, int idx2)
+{
+  cTValue *o1 = index2adr(L, idx1);
+  cTValue *o2 = index2adr(L, idx2);
+  if (o1 == niltv(L) || o2 == niltv(L)) {
+    return 0;
+  } else if (tvisnum(o1) && tvisnum(o2)) {
+    return numV(o1) < numV(o2);
+  } else {
+    TValue *base = lj_meta_comp(L, o1, o2, 0);
+    if ((uintptr_t)base <= 1) {
+      return (int)(uintptr_t)base;
+    } else {
+      L->top = base+2;
+      lj_vm_call(L, base, 1+1);
+      L->top -= 2;
+      return tvistruecond(L->top+1);
+    }
+  }
+}
+
+/* Value as a number; strings are converted, everything else yields 0. */
+LUA_API lua_Number lua_tonumber(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  TValue tmp;
+  if (LJ_LIKELY(tvisnum(o)))
+    return numV(o);
+  else if (tvisstr(o) && lj_str_numconv(strVdata(o), &tmp))
+    return numV(&tmp);
+  else
+    return 0;
+}
+
+/* Value as an integer, with number/string conversion as above. */
+LUA_API lua_Integer lua_tointeger(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  TValue tmp;
+  lua_Number n;
+  if (LJ_LIKELY(tvisnum(o)))
+    n = numV(o);
+  else if (tvisstr(o) && lj_str_numconv(strVdata(o), &tmp))
+    n = numV(&tmp);
+  else
+    return 0;
+#if LJ_64
+  return (lua_Integer)n;
+#else
+  return lj_num2int(n);
+#endif
+}
+
+/* Truth value of the slot (false only for nil and false). */
+LUA_API int lua_toboolean(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  return tvistruecond(o);
+}
+
+/* Value as a string (converting numbers in place on the stack).
+** Returns NULL and *len = 0 for non-convertible values.
+*/
+LUA_API const char *lua_tolstring(lua_State *L, int idx, size_t *len)
+{
+  TValue *o = index2adr(L, idx);
+  GCstr *s;
+  if (LJ_LIKELY(tvisstr(o))) {
+    s = strV(o);
+  } else if (tvisnum(o)) {
+    lj_gc_check(L);
+    o = index2adr(L, idx);  /* GC may move the stack. */
+    s = lj_str_fromnum(L, &o->n);
+  } else {
+    if (len != NULL) *len = 0;
+    return NULL;
+  }
+  if (len != NULL) *len = s->len;
+  return strdata(s);
+}
+
+/* "Length" of a value: string bytes, table length, userdata payload size,
+** or the length of a number's string form. */
+LUA_API size_t lua_objlen(lua_State *L, int idx)
+{
+  TValue *o = index2adr(L, idx);
+  if (tvisstr(o))
+    return strV(o)->len;
+  else if (tvistab(o))
+    return cast(size_t, lj_tab_len(tabV(o)));
+  else if (tvisudata(o))
+    return udataV(o)->len;
+  else if (tvisnum(o))
+    return lj_str_fromnum(L, &o->n)->len;
+  else
+    return 0;
+}
+
+/* Underlying C function pointer, or NULL for Lua functions/fast functions. */
+LUA_API lua_CFunction lua_tocfunction(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  return funcV(o)->c.gate == lj_gate_c ? funcV(o)->c.f : NULL;
+}
+
+/* Payload pointer of full or light userdata, NULL otherwise. */
+LUA_API void *lua_touserdata(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  if (tvisudata(o))
+    return uddata(udataV(o));
+  else if (tvislightud(o))
+    return lightudV(o);
+  else
+    return NULL;
+}
+
+/* Thread value at idx, or NULL if not a thread. */
+LUA_API lua_State *lua_tothread(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  return (!tvisthread(o)) ? NULL : threadV(o);
+}
+
+/* Generic identity pointer for debug output (not dereferenceable). */
+LUA_API const void *lua_topointer(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  if (tvisudata(o))
+    return uddata(udataV(o));
+  else if (tvislightud(o))
+    return lightudV(o);
+  else if (tvisgcv(o))
+    return gcV(o);
+  else
+    return NULL;
+}
+
+/* -- Stack setters (object creation) ------------------------------------- */
+
+/* Push nil. */
+LUA_API void lua_pushnil(lua_State *L)
+{
+  setnilV(L->top);
+  incr_top(L);
+}
+
+/* Push a number; injected NaNs are canonicalized to the internal NaN. */
+LUA_API void lua_pushnumber(lua_State *L, lua_Number n)
+{
+  setnumV(L->top, n);
+  if (LJ_UNLIKELY(tvisnan(L->top)))
+    setnanV(L->top);  /* Canonicalize injected NaNs. */
+  incr_top(L);
+}
+
+/* Push an integer (stored as a number). */
+LUA_API void lua_pushinteger(lua_State *L, lua_Integer n)
+{
+  setnumV(L->top, cast_num(n));
+  incr_top(L);
+}
+
+/* Push an interned copy of str[0..len-1] (may contain embedded zeros). */
+LUA_API void lua_pushlstring(lua_State *L, const char *str, size_t len)
+{
+  GCstr *s;
+  lj_gc_check(L);
+  s = lj_str_new(L, str, len);
+  setstrV(L, L->top, s);
+  incr_top(L);
+}
+
+/* Push a zero-terminated string, or nil if str == NULL. */
+LUA_API void lua_pushstring(lua_State *L, const char *str)
+{
+  if (str == NULL) {
+    setnilV(L->top);
+  } else {
+    GCstr *s;
+    lj_gc_check(L);
+    s = lj_str_newz(L, str);
+    setstrV(L, L->top, s);
+  }
+  incr_top(L);
+}
+
+/* Push a formatted string built from a va_list. */
+LUA_API const char *lua_pushvfstring(lua_State *L, const char *fmt,
+				     va_list argp)
+{
+  lj_gc_check(L);
+  return lj_str_pushvf(L, fmt, argp);
+}
+
+/* Push a formatted string (printf-style varargs). */
+LUA_API const char *lua_pushfstring(lua_State *L, const char *fmt, ...)
+{
+  const char *ret;
+  va_list argp;
+  lj_gc_check(L);
+  va_start(argp, fmt);
+  ret = lj_str_pushvf(L, fmt, argp);
+  va_end(argp);
+  return ret;
+}
+
+/* Push a C closure over the top n stack values (its upvalues). */
+LUA_API void lua_pushcclosure(lua_State *L, lua_CFunction f, int n)
+{
+  GCfunc *fn;
+  lj_gc_check(L);
+  api_checknelems(L, n);
+  fn = lj_func_newC(L, (MSize)n, getcurrenv(L));
+  fn->c.f = f;
+  L->top -= n;
+  while (n--)
+    copyTV(L, &fn->c.upvalue[n], L->top+n);
+  setfuncV(L, L->top, fn);
+  /* New function is white, so no barrier is needed for the upvalue copies. */
+  lua_assert(iswhite(obj2gco(fn)));
+  incr_top(L);
+}
+
+/* Push a boolean (any nonzero b is true). */
+LUA_API void lua_pushboolean(lua_State *L, int b)
+{
+  setboolV(L->top, (b != 0));
+  incr_top(L);
+}
+
+/* Push a light userdata pointer. */
+LUA_API void lua_pushlightuserdata(lua_State *L, void *p)
+{
+  setlightudV(L->top, checklightudptr(L, p));
+  incr_top(L);
+}
+
+/* Push a new table presized for narray array slots and nrec hash slots. */
+LUA_API void lua_createtable(lua_State *L, int narray, int nrec)
+{
+  GCtab *t;
+  lj_gc_check(L);
+  t = lj_tab_new(L, (uint32_t)(narray > 0 ? narray+1 : 0), hsize2hbits(nrec));
+  settabV(L, L->top, t);
+  incr_top(L);
+}
+
+/* Get or create registry[tname] as a metatable; push it.
+** Returns 1 if it was newly created, 0 if it already existed.
+*/
+LUALIB_API int luaL_newmetatable(lua_State *L, const char *tname)
+{
+  GCtab *regt = tabV(registry(L));
+  TValue *tv = lj_tab_setstr(L, regt, lj_str_newz(L, tname));
+  if (tvisnil(tv)) {
+    GCtab *mt = lj_tab_new(L, 0, 1);
+    settabV(L, tv, mt);
+    settabV(L, L->top++, mt);
+    lj_gc_objbarriert(L, regt, mt);
+    return 1;
+  } else {
+    copyTV(L, L->top++, tv);
+    return 0;
+  }
+}
+
+/* Push the thread itself; returns 1 if it is the main thread. */
+LUA_API int lua_pushthread(lua_State *L)
+{
+  setthreadV(L, L->top, L);
+  incr_top(L);
+  return (mainthread(G(L)) == L);
+}
+
+/* Create a new coroutine thread and push it. */
+LUA_API lua_State *lua_newthread(lua_State *L)
+{
+  lua_State *L1;
+  lj_gc_check(L);
+  L1 = lj_state_new(L);
+  setthreadV(L, L->top, L1);
+  incr_top(L);
+  return L1;
+}
+
+/* Create and push a new full userdata; returns its payload pointer. */
+LUA_API void *lua_newuserdata(lua_State *L, size_t size)
+{
+  GCudata *ud;
+  lj_gc_check(L);
+  if (size > LJ_MAX_UDATA)
+    lj_err_msg(L, LJ_ERR_UDATAOV);
+  ud = lj_udata_new(L, (MSize)size, getcurrenv(L));
+  setudataV(L, L->top, ud);
+  incr_top(L);
+  return uddata(ud);
+}
+
+/* Concatenate the top n values (invoking __concat as needed), leaving
+** one result. n == 0 pushes the empty string; n == 1 is a no-op.
+*/
+LUA_API void lua_concat(lua_State *L, int n)
+{
+  api_checknelems(L, n);
+  if (n >= 2) {
+    n--;
+    do {
+      TValue *top = lj_meta_cat(L, L->top-1, n);
+      if (top == NULL) {
+	L->top -= n;
+	break;
+      }
+      /* Metamethod call: concatenate what remains after it returns. */
+      n -= cast_int(L->top - top);
+      L->top = top+2;
+      lj_vm_call(L, top, 1+1);
+      L->top--;
+      copyTV(L, L->top-1, L->top);
+    } while (--n > 0);
+  } else if (n == 0) {  /* Push empty string. */
+    setstrV(L, L->top, lj_str_new(L, "", 0));
+    incr_top(L);
+  }
+  /* else n == 1: nothing to do. */
+}
+
+/* -- Object getters ------------------------------------------------------ */
+
+/* t[k] with metamethods: key is the top value, replaced by the result. */
+LUA_API void lua_gettable(lua_State *L, int idx)
+{
+  cTValue *v, *t = index2adr(L, idx);
+  api_checkvalidindex(L, t);
+  v = lj_meta_tget(L, t, L->top-1);
+  if (v == NULL) {
+    /* __index metamethod call set up by lj_meta_tget. */
+    L->top += 2;
+    lj_vm_call(L, L->top-2, 1+1);
+    L->top -= 2;
+    v = L->top+1;
+  }
+  copyTV(L, L->top-1, v);
+}
+
+/* Push t[k] for a C string key, with metamethods. */
+LUA_API void lua_getfield(lua_State *L, int idx, const char *k)
+{
+  cTValue *v, *t = index2adr(L, idx);
+  TValue key;
+  api_checkvalidindex(L, t);
+  setstrV(L, &key, lj_str_newz(L, k));
+  v = lj_meta_tget(L, t, &key);
+  if (v == NULL) {
+    L->top += 2;
+    lj_vm_call(L, L->top-2, 1+1);
+    L->top -= 2;
+    v = L->top+1;
+  }
+  copyTV(L, L->top, v);
+  incr_top(L);
+}
+
+/* Raw t[k] (no metamethods): key on top is replaced by the value. */
+LUA_API void lua_rawget(lua_State *L, int idx)
+{
+  cTValue *t = index2adr(L, idx);
+  api_check(L, tvistab(t));
+  copyTV(L, L->top-1, lj_tab_get(L, tabV(t), L->top-1));
+}
+
+/* Push raw t[n] for an integer key (nil if absent). */
+LUA_API void lua_rawgeti(lua_State *L, int idx, int n)
+{
+  cTValue *v, *t = index2adr(L, idx);
+  api_check(L, tvistab(t));
+  v = lj_tab_getint(tabV(t), n);
+  if (v) {
+    copyTV(L, L->top, v);
+  } else {
+    setnilV(L->top);
+  }
+  incr_top(L);
+}
+
+/* Push the metatable of the value at idx; returns 0 if it has none.
+** Non-table/userdata values use the per-type base metatable.
+*/
+LUA_API int lua_getmetatable(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  GCtab *mt = NULL;
+  if (tvistab(o))
+    mt = tabref(tabV(o)->metatable);
+  else if (tvisudata(o))
+    mt = tabref(udataV(o)->metatable);
+  else
+    mt = tabref(G(L)->basemt[itypemap(o)]);
+  if (mt == NULL)
+    return 0;
+  settabV(L, L->top, mt);
+  incr_top(L);
+  return 1;
+}
+
+/* Push metatable[field] of the value at idx; returns 0 if absent/nil. */
+LUALIB_API int luaL_getmetafield(lua_State *L, int idx, const char *field)
+{
+  if (lua_getmetatable(L, idx)) {
+    cTValue *tv = lj_tab_getstr(tabV(L->top-1), lj_str_newz(L, field));
+    if (tv && !tvisnil(tv)) {
+      copyTV(L, L->top-1, tv);
+      return 1;
+    }
+    L->top--;  /* Pop the metatable again if the field is missing. */
+  }
+  return 0;
+}
+
+/* Push the environment table of a function/userdata/thread (else nil). */
+LUA_API void lua_getfenv(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  api_checkvalidindex(L, o);
+  if (tvisfunc(o)) {
+    settabV(L, L->top, tabref(funcV(o)->c.env));
+  } else if (tvisudata(o)) {
+    settabV(L, L->top, tabref(udataV(o)->env));
+  } else if (tvisthread(o)) {
+    settabV(L, L->top, tabref(threadV(o)->env));
+  } else {
+    setnilV(L->top);
+  }
+  incr_top(L);
+}
+
+/* Table traversal step: key on top is replaced by next key/value pair.
+** Returns 0 (and pops the key) at the end of the traversal.
+*/
+LUA_API int lua_next(lua_State *L, int idx)
+{
+  cTValue *t = index2adr(L, idx);
+  int more;
+  api_check(L, tvistab(t));
+  more = lj_tab_next(L, tabV(t), L->top-1);
+  if (more) {
+    incr_top(L);  /* Return new key and value slot. */
+  } else {  /* End of traversal. */
+    L->top--;  /* Remove key slot. */
+  }
+  return more;
+}
+
+/* Resolve upvalue idx of function f: stores its address in *val and
+** returns its name ("" for C functions, NULL if out of range).
+*/
+static const char *aux_upvalue(cTValue *f, uint32_t idx, TValue **val)
+{
+  GCfunc *fn;
+  if (!tvisfunc(f)) return NULL;
+  fn = funcV(f);
+  if (isluafunc(fn)) {
+    GCproto *pt = funcproto(fn);
+    if (idx < pt->sizeuvname) {
+      *val = gcref(fn->l.uvptr[idx])->uv.v;
+      return strdata(pt->uvname[idx]);
+    }
+  } else {
+    if (idx < fn->c.nupvalues) {
+      *val = &fn->c.upvalue[idx];
+      return "";
+    }
+  }
+  return NULL;
+}
+
+/* Push the n-th upvalue of the function at idx; returns its name or NULL. */
+LUA_API const char *lua_getupvalue(lua_State *L, int idx, int n)
+{
+  TValue *val;
+  const char *name = aux_upvalue(index2adr(L, idx), (uint32_t)(n-1), &val);
+  if (name) {
+    copyTV(L, L->top, val);
+    incr_top(L);
+  }
+  return name;
+}
+
+/* Checked userdata access: payload pointer if the metatable matches
+** registry[tname], otherwise raises an argument type error (no return).
+*/
+LUALIB_API void *luaL_checkudata(lua_State *L, int idx, const char *tname)
+{
+  cTValue *o = index2adr(L, idx);
+  if (tvisudata(o)) {
+    GCudata *ud = udataV(o);
+    cTValue *tv = lj_tab_getstr(tabV(registry(L)), lj_str_newz(L, tname));
+    if (tv && tvistab(tv) && tabV(tv) == tabref(ud->metatable))
+      return uddata(ud);
+  }
+  lj_err_argtype(L, idx, tname);
+  return NULL;  /* unreachable */
+}
+
+/* -- Object setters ------------------------------------------------------ */
+
+/* t[k] = v with metamethods; pops key and value from the stack. */
+LUA_API void lua_settable(lua_State *L, int idx)
+{
+  TValue *o;
+  cTValue *t = index2adr(L, idx);
+  api_checknelems(L, 2);
+  api_checkvalidindex(L, t);
+  o = lj_meta_tset(L, t, L->top-2);
+  if (o) {
+    /* NOBARRIER: lj_meta_tset ensures the table is not black. */
+    copyTV(L, o, L->top-1);
+    L->top -= 2;
+  } else {
+    /* __newindex metamethod: copy the value into arg slot and call it. */
+    L->top += 3;
+    copyTV(L, L->top-1, L->top-6);
+    lj_vm_call(L, L->top-3, 0+1);
+    L->top -= 3;
+  }
+}
+
+/* t[k] = v for a C string key, with metamethods; pops the value. */
+LUA_API void lua_setfield(lua_State *L, int idx, const char *k)
+{
+  TValue *o;
+  TValue key;
+  cTValue *t = index2adr(L, idx);
+  api_checknelems(L, 1);
+  api_checkvalidindex(L, t);
+  setstrV(L, &key, lj_str_newz(L, k));
+  o = lj_meta_tset(L, t, &key);
+  if (o) {
+    L->top--;
+    /* NOBARRIER: lj_meta_tset ensures the table is not black. */
+    copyTV(L, o, L->top);
+  } else {
+    L->top += 3;
+    copyTV(L, L->top-1, L->top-6);
+    lj_vm_call(L, L->top-3, 0+1);
+    L->top -= 2;
+  }
+}
+
+/* Raw t[k] = v (no metamethods); pops key and value. */
+LUA_API void lua_rawset(lua_State *L, int idx)
+{
+  GCtab *t = tabV(index2adr(L, idx));
+  TValue *dst, *key;
+  api_checknelems(L, 2);
+  key = L->top-2;
+  dst = lj_tab_set(L, t, key);
+  copyTV(L, dst, key+1);
+  lj_gc_barriert(L, t, dst);
+  L->top = key;
+}
+
+/* Raw t[n] = v for an integer key; pops the value. */
+LUA_API void lua_rawseti(lua_State *L, int idx, int n)
+{
+  GCtab *t = tabV(index2adr(L, idx));
+  TValue *dst, *src;
+  api_checknelems(L, 1);
+  dst = lj_tab_setint(L, t, n);
+  src = L->top-1;
+  copyTV(L, dst, src);
+  lj_gc_barriert(L, t, dst);
+  L->top = src;
+}
+
+/* Pop a table (or nil) and install it as the metatable of the value at
+** idx. Non-table/userdata values update the per-type base metatable,
+** which requires flushing compiled traces.
+*/
+LUA_API int lua_setmetatable(lua_State *L, int idx)
+{
+  global_State *g;
+  GCtab *mt;
+  cTValue *o = index2adr(L, idx);
+  api_checknelems(L, 1);
+  api_checkvalidindex(L, o);
+  if (tvisnil(L->top-1)) {
+    mt = NULL;
+  } else {
+    api_check(L, tvistab(L->top-1));
+    mt = tabV(L->top-1);
+  }
+  g = G(L);
+  if (tvistab(o)) {
+    setgcref(tabV(o)->metatable, obj2gco(mt));
+    if (mt)
+      lj_gc_objbarriert(L, tabV(o), mt);
+  } else if (tvisudata(o)) {
+    setgcref(udataV(o)->metatable, obj2gco(mt));
+    if (mt)
+      lj_gc_objbarrier(L, udataV(o), mt);
+  } else {
+    /* Flush cache, since traces specialize to basemt. But not during __gc. */
+    if (lj_trace_flushall(L))
+      lj_err_caller(L, LJ_ERR_NOGCMM);
+    if (tvisbool(o)) {
+      /* NOBARRIER: g->basemt[] is a GC root. */
+      setgcref(g->basemt[~LJ_TTRUE], obj2gco(mt));
+      setgcref(g->basemt[~LJ_TFALSE], obj2gco(mt));
+    } else {
+      /* NOBARRIER: g->basemt[] is a GC root. */
+      setgcref(g->basemt[itypemap(o)], obj2gco(mt));
+    }
+  }
+  L->top--;
+  return 1;
+}
+
+/* Pop a table and install it as the environment of the function/userdata/
+** thread at idx. Returns 0 (and still pops) for other value types.
+*/
+LUA_API int lua_setfenv(lua_State *L, int idx)
+{
+  cTValue *o = index2adr(L, idx);
+  GCtab *t;
+  api_checknelems(L, 1);
+  api_checkvalidindex(L, o);
+  api_check(L, tvistab(L->top-1));
+  t = tabV(L->top-1);
+  if (tvisfunc(o)) {
+    setgcref(funcV(o)->c.env, obj2gco(t));
+  } else if (tvisudata(o)) {
+    setgcref(udataV(o)->env, obj2gco(t));
+  } else if (tvisthread(o)) {
+    setgcref(threadV(o)->env, obj2gco(t));
+  } else {
+    L->top--;
+    return 0;
+  }
+  lj_gc_objbarrier(L, gcV(o), t);
+  L->top--;
+  return 1;
+}
+
+/* Pop a value into the n-th upvalue of the function at idx; returns the
+** upvalue name or NULL (leaving the stack untouched) if out of range.
+*/
+LUA_API const char *lua_setupvalue(lua_State *L, int idx, int n)
+{
+  cTValue *f = index2adr(L, idx);
+  TValue *val;
+  const char *name;
+  api_checknelems(L, 1);
+  name = aux_upvalue(f, (uint32_t)(n-1), &val);
+  if (name) {
+    L->top--;
+    copyTV(L, val, L->top);
+    lj_gc_barrier(L, funcV(f), L->top);
+  }
+  return name;
+}
+
+/* -- Calls --------------------------------------------------------------- */
+
+/* Unprotected call: function plus nargs arguments on top of the stack. */
+LUA_API void lua_call(lua_State *L, int nargs, int nresults)
+{
+  api_checknelems(L, nargs+1);
+  lj_vm_call(L, L->top - nargs, nresults+1);
+}
+
+/* Protected call with an optional error handler at stack index errfunc.
+** Returns 0 or an error status; hook state is restored on error.
+*/
+LUA_API int lua_pcall(lua_State *L, int nargs, int nresults, int errfunc)
+{
+  global_State *g = G(L);
+  uint8_t oldh = hook_save(g);
+  ptrdiff_t ef;
+  int status;
+  api_checknelems(L, nargs+1);
+  if (errfunc == 0) {
+    ef = 0;
+  } else {
+    cTValue *o = stkindex2adr(L, errfunc);
+    api_checkvalidindex(L, o);
+    ef = savestack(L, o);  /* Save as offset: stack may be reallocated. */
+  }
+  status = lj_vm_pcall(L, L->top - nargs, nresults+1, ef);
+  if (status) hook_restore(g, oldh);
+  return status;
+}
+
+/* lj_vm_cpcall helper: wrap func in a fresh C closure and set up the call
+** frame (func + lightuserdata argument, zero results).
+*/
+static TValue *cpcall(lua_State *L, lua_CFunction func, void *ud)
+{
+  GCfunc *fn;
+  fn = lj_func_newC(L, 0, getcurrenv(L));
+  fn->c.f = func;
+  setfuncV(L, L->top, fn);
+  setlightudV(L->top+1, checklightudptr(L, ud));
+  cframe_nres(L->cframe) = 1+0;  /* Zero results. */
+  L->top += 2;
+  return L->top-1;  /* Now call the newly allocated C function. */
+}
+
+/* Protected call of a raw C function with a userdata argument. */
+LUA_API int lua_cpcall(lua_State *L, lua_CFunction func, void *ud)
+{
+  global_State *g = G(L);
+  uint8_t oldh = hook_save(g);
+  int status = lj_vm_cpcall(L, cpcall, func, ud);
+  if (status) hook_restore(g, oldh);
+  return status;
+}
+
+/* Call metamethod `field` of the value at idx with the value as its only
+** argument; pushes the single result. Returns 0 if the field is absent.
+*/
+LUALIB_API int luaL_callmeta(lua_State *L, int idx, const char *field)
+{
+  if (luaL_getmetafield(L, idx, field)) {
+    TValue *base = L->top--;
+    copyTV(L, base, index2adr(L, idx));
+    L->top = base+1;
+    lj_vm_call(L, base, 1+1);
+    return 1;
+  }
+  return 0;
+}
+
+/* -- Coroutine yield and resume ------------------------------------------ */
+
+/* Yield from a coroutine with nresults values; unwinds the C frame and
+** does not return to the caller. Raises an error if yielding is not
+** possible from the current C frame.
+*/
+LUA_API int lua_yield(lua_State *L, int nresults)
+{
+  void *cf = L->cframe;
+  cTValue *f;
+  if (!cframe_canyield(cf))
+    lj_err_msg(L, LJ_ERR_CYIELD);
+  /* Move the results down to the stack base before unwinding. */
+  f = L->top - nresults;
+  if (f > L->base) {
+    TValue *t = L->base;
+    while (--nresults >= 0) copyTV(L, t++, f++);
+    L->top = t;
+  }
+  L->cframe = NULL;
+  L->status = LUA_YIELD;
+  lj_vm_unwind_c(cf, LUA_YIELD);
+  return -1;  /* unreachable */
+}
+
+/* Resume a coroutine with nargs arguments. A thread that is dead or
+** already running yields LUA_ERRRUN with an error message on its stack.
+*/
+LUA_API int lua_resume(lua_State *L, int nargs)
+{
+  if (L->cframe == NULL && L->status <= LUA_YIELD)
+    return lj_vm_resume(L, L->top - nargs, 0, 0);
+  L->top = L->base;
+  setstrV(L, L->top, lj_err_str(L, LJ_ERR_COSUSP));
+  incr_top(L);
+  return LUA_ERRRUN;
+}
+
+/* -- Load and dump Lua code ---------------------------------------------- */
+
+/* Protected parser body (run via lj_vm_cpcall): parses ud (a LexState)
+** and leaves the resulting function on the stack.
+*/
+static TValue *cpparser(lua_State *L, lua_CFunction dummy, void *ud)
+{
+  LexState *ls = cast(LexState *, ud);
+  GCfunc *fn;
+  UNUSED(dummy);
+  cframe_errfunc(L->cframe) = -1;  /* Inherit error function. */
+  lj_lex_start(L, ls);
+  fn = lj_func_newL(L, lj_parse(ls), tabref(L->env));
+  /* Parser may realloc stack. Don't combine above/below into one statement. */
+  setfuncV(L, L->top++, fn);
+  return NULL;
+}
+
+/* Load a chunk via a reader callback; on success the compiled function
+** is left on the stack. Returns 0 or an error status.
+*/
+LUA_API int lua_load(lua_State *L, lua_Reader reader, void *data,
+		     const char *chunkname)
+{
+  LexState ls;
+  int status;
+  global_State *g;
+  ls.rfunc = reader;
+  ls.rdata = data;
+  ls.chunkarg = chunkname ? chunkname : "?";
+  lj_str_initbuf(L, &ls.sb);
+  status = lj_vm_cpcall(L, cpparser, NULL, &ls);
+  g = G(L);
+  lj_str_freebuf(g, &ls.sb);  /* Free the lexer buffer even on error. */
+  lj_gc_check(L);
+  return status;
+}
+
+/* Bytecode dump is not supported in this release; always reports error. */
+LUA_API int lua_dump(lua_State *L, lua_Writer writer, void *data)
+{
+  api_checknelems(L, 1);
+  UNUSED(L); UNUSED(writer); UNUSED(data);
+  return 1;  /* Error, not supported. */
+}
+
+/* -- GC and memory management -------------------------------------------- */
+
+/* Garbage-collector control. `what` selects the operation (stop, restart,
+** full collect, counters, step, pause/stepmul tuning); returns the
+** operation's result, or -1 for an invalid option.
+*/
+LUA_API int lua_gc(lua_State *L, int what, int data)
+{
+  global_State *g = G(L);
+  int res = 0;
+  switch (what) {
+  case LUA_GCSTOP:
+    /* An unreachable threshold effectively disables automatic GC steps. */
+    g->gc.threshold = LJ_MAX_MEM;
+    break;
+  case LUA_GCRESTART:
+    g->gc.threshold = g->gc.total;
+    break;
+  case LUA_GCCOLLECT:
+    lj_gc_fullgc(L);
+    break;
+  case LUA_GCCOUNT:
+    res = cast_int(g->gc.total >> 10);  /* Total allocated KB. */
+    break;
+  case LUA_GCCOUNTB:
+    res = cast_int(g->gc.total & 0x3ff);  /* Remainder bytes. */
+    break;
+  case LUA_GCSTEP: {
+    MSize a = (MSize)data << 10;  /* Step size given in KB. */
+    g->gc.threshold = (a <= g->gc.total) ? (g->gc.total - a) : 0;
+    while (g->gc.total >= g->gc.threshold)
+      if (lj_gc_step(L)) {
+	res = 1;  /* Signal that a full collection cycle finished. */
+	break;
+      }
+    break;
+  }
+  case LUA_GCSETPAUSE:
+    res = cast_int(g->gc.pause);
+    g->gc.pause = (MSize)data;
+    break;
+  case LUA_GCSETSTEPMUL:
+    res = cast_int(g->gc.stepmul);
+    g->gc.stepmul = (MSize)data;
+    break;
+  default:
+    res = -1;  /* Invalid option. */
+  }
+  return res;
+}
+
+/* Return the current allocator function and (optionally) its userdata. */
+LUA_API lua_Alloc lua_getallocf(lua_State *L, void **ud)
+{
+  global_State *g = G(L);
+  if (ud) *ud = g->allocd;
+  return g->allocf;
+}
+
+/* Replace the allocator function and its userdata. */
+LUA_API void lua_setallocf(lua_State *L, lua_Alloc f, void *ud)
+{
+  global_State *g = G(L);
+  g->allocd = ud;
+  g->allocf = f;
+}
+

+ 88 - 0
src/lj_arch.h

@@ -0,0 +1,88 @@
+/*
+** Target architecture selection.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_ARCH_H
+#define _LJ_ARCH_H
+
+#include "lua.h"
+
+
+/* Target endianness. */
+#define LUAJIT_LE	0
+#define LUAJIT_BE	1
+
+/* Target architectures. */
+#define LUAJIT_ARCH_X86		1
+#define LUAJIT_ARCH_x86		1
+#define LUAJIT_ARCH_X64		2
+#define LUAJIT_ARCH_x64		2
+
+
+/* Select native target if no target defined. */
+#ifndef LUAJIT_TARGET
+
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#define LUAJIT_TARGET	LUAJIT_ARCH_X86
+#elif defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#define LUAJIT_TARGET	LUAJIT_ARCH_X64
+#else
+#error "No support for this architecture (yet)"
+#endif
+
+#endif
+
+/* Set target properties. */
+#if LUAJIT_TARGET == LUAJIT_ARCH_X86
+#define LJ_ARCH_NAME		"x86"
+#define LJ_ARCH_BITS		32
+#define LJ_ARCH_ENDIAN		LUAJIT_LE
+#define LJ_TARGET_X86		1
+#define LJ_TARGET_X86ORX64	1
+#define LJ_PAGESIZE		4096
+#elif LUAJIT_TARGET == LUAJIT_ARCH_X64
+#define LJ_ARCH_NAME		"x64"
+#define LJ_ARCH_BITS		64
+#define LJ_ARCH_ENDIAN		LUAJIT_LE
+#define LJ_TARGET_X64		1
+#define LJ_TARGET_X86ORX64	1
+#define LJ_PAGESIZE		4096
+/* x64 properties are defined above, but the port is unfinished:
+** deliberately reject x64 builds in this release.
+*/
+#error "No support for x64 architecture (yet)"
+#else
+#error "No target architecture defined"
+#endif
+
+/* Disable or enable the JIT compiler. */
+#if defined(LUAJIT_DISABLE_JIT) || defined(LJ_ARCH_NOJIT)
+#define LJ_HASJIT		0
+#else
+#define LJ_HASJIT		1
+#endif
+
+/* Endian-dependent selection/ordering of macro arguments. */
+#if LJ_ARCH_ENDIAN == LUAJIT_BE
+#define LJ_ENDIAN_SELECT(le, be)	be
+#define LJ_ENDIAN_LOHI(lo, hi)		hi lo
+#else
+#define LJ_ENDIAN_SELECT(le, be)	le
+#define LJ_ENDIAN_LOHI(lo, hi)		lo hi
+#endif
+
+#if LJ_ARCH_BITS == 32
+#define LJ_32			1
+#define LJ_64			0
+#elif LJ_ARCH_BITS == 64
+#define LJ_32			0
+#define LJ_64			1
+#else
+#error "Bad LJ_ARCH_BITS setting"
+#endif
+
+/* Whether target CPU masks the shift count by the operand length or not. */
+#if LJ_TARGET_X86ORX64
+#define LJ_TARGET_MASKEDSHIFT	1
+#else
+#define LJ_TARGET_MASKEDSHIFT	0
+#endif
+
+#endif

+ 3324 - 0
src/lj_asm.c

@@ -0,0 +1,3324 @@
+/*
+** IR assembler (SSA IR -> machine code).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_asm_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_mcode.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+#include "lj_snap.h"
+#include "lj_asm.h"
+#include "lj_dispatch.h"
+#include "lj_vm.h"
+#include "lj_target.h"
+
+/* -- Assembler state and common macros ----------------------------------- */
+
+/* Assembler state. */
+typedef struct ASMState {
+  RegCost cost[RID_MAX];  /* Reference and blended allocation cost for regs. */
+
+  MCode *mcp;		/* Current MCode pointer (grows down). */
+  MCode *mclim;		/* Lower limit for MCode memory + red zone. */
+
+  IRIns *ir;		/* Copy of pointer to IR instructions/constants. */
+  jit_State *J;		/* JIT compiler state. */
+
+  x86ModRM mrm;		/* Fused x86 address operand. */
+
+  RegSet freeset;	/* Set of free registers. */
+  RegSet modset;	/* Set of registers modified inside the loop. */
+  RegSet phiset;	/* Set of PHI registers. */
+
+  uint32_t flags;	/* Copy of JIT compiler flags. */
+  int loopinv;		/* Loop branch inversion (0:no, 1:yes, 2:yes+CC_P). */
+
+  int32_t evenspill;	/* Next even spill slot. */
+  int32_t oddspill;	/* Next odd spill slot (or 0). */
+
+  IRRef curins;		/* Reference of current instruction. */
+  IRRef stopins;	/* Stop assembly before hitting this instruction. */
+  IRRef orignins;	/* Original T->nins. */
+
+  IRRef snapref;	/* Current snapshot is active after this reference. */
+  IRRef snaprename;	/* Rename highwater mark for snapshot check. */
+  SnapNo snapno;	/* Current snapshot number. */
+  SnapNo loopsnapno;	/* Loop snapshot number. */
+
+  Trace *T;		/* Trace to assemble. */
+  Trace *parent;	/* Parent trace (or NULL). */
+
+  IRRef fuseref;	/* Fusion limit (loopref, 0 or FUSE_DISABLED). */
+  IRRef sectref;	/* Section base reference (loopref or 0). */
+  IRRef loopref;	/* Reference of LOOP instruction (or 0). */
+
+  BCReg topslot;	/* Number of slots for stack check (unless 0). */
+  MSize gcsteps;	/* Accumulated number of GC steps (per section). */
+
+  MCode *mcbot;		/* Bottom of reserved MCode. */
+  MCode *mctop;		/* Top of generated MCode. */
+  MCode *mcloop;	/* Pointer to loop MCode (or NULL). */
+  MCode *invmcp;	/* Points to invertible loop branch (or NULL). */
+  MCode *testmcp;	/* Pending opportunity to remove test r,r. */
+  MCode *realign;	/* Realign loop if not NULL. */
+
+  IRRef1 phireg[RID_MAX];  /* PHI register references. */
+  uint16_t parentmap[LJ_MAX_JSLOTS];  /* Parent slot to RegSP map. */
+} ASMState;
+
+#define IR(ref)			(&as->ir[(ref)])
+
+/* Check for variant to invariant references. */
+#define iscrossref(as, ref)	((ref) < as->sectref)
+
+/* Inhibit memory op fusion from variant to invariant references. */
+#define FUSE_DISABLED		(~(IRRef)0)
+#define mayfuse(as, ref)	((ref) > as->fuseref)
+#define neverfuse(as)		(as->fuseref == FUSE_DISABLED)
+#define opisfusableload(o) \
+  ((o) == IR_ALOAD || (o) == IR_HLOAD || (o) == IR_ULOAD || \
+   (o) == IR_FLOAD || (o) == IR_SLOAD || (o) == IR_XLOAD)
+
+/* Instruction selection for XMM moves. */
+#define XMM_MOVRR(as)	((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVSD : XO_MOVAPS)
+#define XMM_MOVRM(as)	((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVLPD : XO_MOVSD)
+
+/* Sparse limit checks using a red zone before the actual limit. */
+#define MCLIM_REDZONE	64
+#define checkmclim(as) \
+  if (LJ_UNLIKELY(as->mcp < as->mclim)) asm_mclimit(as)
+
+/* Out of machine-code memory: report required size and abort the trace.
+** Never returns; lj_mcode_limiterr() throws out of the assembler.
+*/
+static LJ_NORET LJ_NOINLINE void asm_mclimit(ASMState *as)
+{
+  lj_mcode_limiterr(as->J, (size_t)(as->mctop - as->mcp + 4*MCLIM_REDZONE));
+}
+
+/* -- Emit x86 instructions ----------------------------------------------- */
+
+#define MODRM(mode, r1, r2)	((MCode)((mode)+(((r1)&7)<<3)+((r2)&7)))
+
+#if LJ_64
+#define REXRB(p, rr, rb) \
+    { MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \
+      if (rex != 0x40) *--(p) = rex; }
+#define FORCE_REX		0x200
+#else
+#define REXRB(p, rr, rb)	((void)0)
+#define FORCE_REX		0
+#endif
+
+#define emit_i8(as, i)		(*--as->mcp = (MCode)(i))
+#define emit_i32(as, i)		(*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4)
+
+#define emit_x87op(as, xo) \
+  (*(uint16_t *)(as->mcp-2) = (uint16_t)(xo), as->mcp -= 2)
+
+/* op */
+/* Emit the opcode bytes of 'xo' in front of *p (code is generated
+** backwards, growing down). The low byte of an x86Op encodes the
+** negative opcode length. On x64 a REX prefix is prepended whenever
+** any of rr/rb/rx refers to an extended register. Returns the new
+** (lower) code pointer.
+*/
+static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
+				 MCode *p, int delta)
+{
+  int n = (int8_t)xo;  /* Negative opcode length. */
+#if defined(__GNUC__)
+  if (__builtin_constant_p(xo) && n == -2)
+    p[delta-2] = (MCode)(xo >> 24);
+  else if (__builtin_constant_p(xo) && n == -3)
+    *(uint16_t *)(p+delta-3) = (uint16_t)(xo >> 16);
+  else
+#endif
+    *(uint32_t *)(p+delta-5) = (uint32_t)xo;  /* Overlapping 4-byte store. */
+  p += n + delta;
+#if LJ_64
+  {
+    uint32_t rex = 0x40 + ((rr>>1)&(4+(FORCE_REX>>1)))+((rx>>2)&2)+((rb>>3)&1);
+    if (rex != 0x40) {
+      /* For 4-byte opcodes the REX byte goes between prefix and opcode. */
+      if (n == -4) { *p = (MCode)rex; rex = (MCode)(xo >> 8); }
+      *--p = (MCode)rex;
+    }
+  }
+#else
+  UNUSED(rr); UNUSED(rb); UNUSED(rx);
+#endif
+  return p;
+}
+
+/* op + modrm */
+#define emit_opm(xo, mode, rr, rb, p, delta) \
+  (p[(delta)-1] = MODRM((mode), (rr), (rb)), \
+   emit_op((xo), (rr), (rb), 0, (p), (delta)))
+
+/* op + modrm + sib */
+#define emit_opmx(xo, mode, scale, rr, rb, rx, p) \
+  (p[-1] = MODRM((scale), (rx), (rb)), \
+   p[-2] = MODRM((mode), (rr), RID_ESP), \
+   emit_op((xo), (rr), (rb), (rx), (p), -1))
+
+/* op r1, r2 */
+/* Emit a register-to-register form: ModRM in register mode. */
+static void emit_rr(ASMState *as, x86Op xo, Reg r1, Reg r2)
+{
+  MCode *p = as->mcp;
+  as->mcp = emit_opm(xo, XM_REG, r1, r2, p, 0);
+}
+
+#if LJ_64 && defined(LUA_USE_ASSERT)
+/* [addr] is sign-extended in x64 and must be in lower 2G (not 4G). */
+static int32_t ptr2addr(void *p)
+{
+  lua_assert((uintptr_t)p < (uintptr_t)0x80000000);
+  return i32ptr(p);
+}
+#else
+#define ptr2addr(p)	(i32ptr((p)))
+#endif
+
+/* op r, [addr] */
+/* Absolute addressing: on x64 this needs a no-base/no-index SIB byte,
+** since plain disp32 means RIP-relative there.
+*/
+static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr)
+{
+  MCode *p = as->mcp;
+  *(int32_t *)(p-4) = ptr2addr(addr);
+#if LJ_64
+  p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
+  as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5);
+#else
+  as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4);
+#endif
+}
+
+/* op r, [base+ofs] */
+/* Picks the shortest displacement encoding (none/8-bit/32-bit).
+** Encoding quirks: (rb&7)==EBP cannot use the no-displacement mode, and
+** (rb&7)==ESP needs an extra SIB byte. With no base register, falls back
+** to absolute addressing (SIB form on x64, plain disp32 on x86).
+*/
+static void emit_rmro(ASMState *as, x86Op xo, Reg rr, Reg rb, int32_t ofs)
+{
+  MCode *p = as->mcp;
+  x86Mode mode;
+  if (ra_hasreg(rb)) {
+    if (ofs == 0 && (rb&7) != RID_EBP) {
+      mode = XM_OFS0;
+    } else if (checki8(ofs)) {
+      *--p = (MCode)ofs;
+      mode = XM_OFS8;
+    } else {
+      p -= 4;
+      *(int32_t *)p = ofs;
+      mode = XM_OFS32;
+    }
+    if ((rb&7) == RID_ESP)
+      *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);  /* SIB byte for ESP base. */
+  } else {
+    *(int32_t *)(p-4) = ofs;
+#if LJ_64
+    p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);  /* Absolute via SIB. */
+    p -= 5;
+    rb = RID_ESP;
+#else
+    p -= 4;
+    rb = RID_EBP;  /* mod=00, rm=EBP means plain disp32 on x86. */
+#endif
+    mode = XM_OFS0;
+  }
+  as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
+}
+
+/* op r, [base+idx*scale+ofs] */
+/* Scaled-index form; always uses a SIB byte via emit_opmx. */
+static void emit_rmrxo(ASMState *as, x86Op xo, Reg rr, Reg rb, Reg rx,
+		       x86Mode scale, int32_t ofs)
+{
+  MCode *p = as->mcp;
+  x86Mode mode;
+  if (ofs == 0 && (rb&7) != RID_EBP) {
+    mode = XM_OFS0;
+  } else if (checki8(ofs)) {
+    mode = XM_OFS8;
+    *--p = (MCode)ofs;
+  } else {
+    mode = XM_OFS32;
+    p -= 4;
+    *(int32_t *)p = ofs;
+  }
+  as->mcp = emit_opmx(xo, mode, scale, rr, rb, rx, p);
+}
+
+/* op r, i */
+/* Arithmetic group op with immediate on a register. An x86Group packs the
+** opcode bytes for both the imm8 (xg>>16) and imm32 (xg>>8) encodings plus
+** the /r group digit in the low byte; the shorter imm8 form is preferred.
+*/
+static void emit_gri(ASMState *as, x86Group xg, Reg rb, int32_t i)
+{
+  MCode *p = as->mcp;
+  if (checki8(i)) {
+    p -= 3;
+    p[2] = (MCode)i;
+    p[0] = (MCode)(xg >> 16);
+  } else {
+    p -= 6;
+    *(int32_t *)(p+2) = i;
+    p[0] = (MCode)(xg >> 8);
+  }
+  p[1] = MODRM(XM_REG, xg, rb);  /* Group digit goes in the reg field. */
+  REXRB(p, 0, rb);
+  as->mcp = p;
+}
+
+/* op [base+ofs], i */
+/* Same group-op selection as emit_gri, but for a memory destination.
+** The synthesized x86Op gets length byte 0xfe (i.e. -2: one opcode byte).
+*/
+static void emit_gmroi(ASMState *as, x86Group xg, Reg rb, int32_t ofs,
+		       int32_t i)
+{
+  x86Op xo;
+  if (checki8(i)) {
+    emit_i8(as, i);
+    xo = (x86Op)(((xg >> 16) << 24)+0xfe);
+  } else {
+    emit_i32(as, i);
+    xo = (x86Op)(((xg >> 8) << 24)+0xfe);
+  }
+  emit_rmro(as, xo, (Reg)xg, rb, ofs);
+}
+
+#define emit_shifti(as, xg, r, i) \
+  (emit_i8(as, (i)), emit_rr(as, XO_SHIFTi, (Reg)(xg), (r)))
+
+/* op r, rm/mrm */
+/* Emit an op whose second operand is either a plain register or the fused
+** memory operand described by as->mrm (when rb == RID_MRM). Handles all
+** combinations of base/index/offset, including the absolute-address and
+** ESP/EBP encoding quirks (cf. emit_rmro).
+*/
+static void emit_mrm(ASMState *as, x86Op xo, Reg rr, Reg rb)
+{
+  MCode *p = as->mcp;
+  x86Mode mode = XM_REG;
+  if (rb == RID_MRM) {
+    rb = as->mrm.base;
+    if (rb == RID_NONE) {
+      /* No base register: absolute disp32 (mod=00, rm=EBP). */
+      rb = RID_EBP;
+      mode = XM_OFS0;
+      p -= 4;
+      *(int32_t *)p = as->mrm.ofs;
+      if (as->mrm.idx != RID_NONE)
+	goto mrmidx;
+#if LJ_64
+      *--p = MODRM(XM_SCALE1, RID_ESP, RID_EBP);  /* x64 needs SIB form. */
+      rb = RID_ESP;
+#endif
+    } else {
+      if (as->mrm.ofs == 0 && (rb&7) != RID_EBP) {
+	mode = XM_OFS0;
+      } else if (checki8(as->mrm.ofs)) {
+	*--p = (MCode)as->mrm.ofs;
+	mode = XM_OFS8;
+      } else {
+	p -= 4;
+	*(int32_t *)p = as->mrm.ofs;
+	mode = XM_OFS32;
+      }
+      if (as->mrm.idx != RID_NONE) {
+      mrmidx:
+	as->mcp = emit_opmx(xo, mode, as->mrm.scale, rr, rb, as->mrm.idx, p);
+	return;
+      }
+      if ((rb&7) == RID_ESP)
+	*--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);  /* SIB for ESP base. */
+    }
+  }
+  as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
+}
+
+/* Add an immediate offset to a register. Uses LEA instead of ADD on
+** CPUs where the address-generation unit makes LEA preferable.
+*/
+static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
+{
+  if (ofs == 0)
+    return;
+  if ((as->flags & JIT_F_LEA_AGU) != 0)
+    emit_rmro(as, XO_LEA, r, r, ofs);
+  else
+    emit_gri(as, XG_ARITHi(XOg_ADD), r, ofs);
+}
+
+/* -- Emit moves ---------------------------------------------------------- */
+
+/* Generic move between two regs. Selects MOV for GPRs, an XMM move
+** (MOVSD or MOVAPS depending on JIT flags) for FP registers.
+*/
+static void emit_movrr(ASMState *as, Reg r1, Reg r2)
+{
+  emit_rr(as, r1 < RID_MAX_GPR ? XO_MOV : XMM_MOVRR(as), r1, r2);
+}
+
+/* Generic move from [base+ofs], GPR or XMM depending on rr. */
+static void emit_movrmro(ASMState *as, Reg rr, Reg rb, int32_t ofs)
+{
+  emit_rmro(as, rr < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), rr, rb, ofs);
+}
+
+/* mov [base+ofs], i */
+static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
+{
+  emit_i32(as, i);  /* Immediate first: code is emitted backwards. */
+  emit_rmro(as, XO_MOVmi, 0, base, ofs);
+}
+
+/* mov [base+ofs], r */
+#define emit_movtomro(as, r, base, ofs) \
+  emit_rmro(as, XO_MOVto, (r), (base), (ofs))
+
+/* Get/set global_State fields. */
+#define emit_opgl(as, xo, r, field) \
+  emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field)
+#define emit_getgl(as, r, field)	emit_opgl(as, XO_MOV, (r), field)
+#define emit_setgl(as, r, field)	emit_opgl(as, XO_MOVto, (r), field)
+#define emit_setgli(as, field, i) \
+  (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, field))
+
+/* mov r, i / xor r, r */
+/* Load an immediate; zero is loaded with the shorter XOR idiom. */
+static void emit_loadi(ASMState *as, Reg r, int32_t i)
+{
+  if (i == 0) {
+    emit_rr(as, XO_ARITH(XOg_XOR), r, r);
+  } else {
+    MCode *p = as->mcp;
+    *(int32_t *)(p-4) = i;
+    p[-5] = (MCode)(XI_MOVri+(r&7));  /* Short-form mov r32, imm32. */
+    p -= 5;
+    REXRB(p, 0, r);
+    as->mcp = p;
+  }
+}
+
+/* mov r, addr */
+#define emit_loada(as, r, addr) \
+  emit_loadi(as, (r), ptr2addr((addr)))
+
+/* movsd r, [&tv->n] / xorps r, r */
+/* Load an FP constant into an XMM register. */
+static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
+{
+  if (tvispzero(tv))  /* Use xor only for +0. */
+    emit_rr(as, XO_XORPS, r, r);
+  else
+    emit_rma(as, XMM_MOVRM(as), r, &tv->n);
+}
+
+/* -- Emit branches ------------------------------------------------------- */
+
+/* Label for short jumps. */
+typedef MCode *MCLabel;
+
+/* jcc short target */
+/* Relative displacement is taken from the end of the 2-byte instruction,
+** which is 'p' since code is emitted backwards.
+*/
+static void emit_sjcc(ASMState *as, int cc, MCLabel target)
+{
+  MCode *p = as->mcp;
+  p[-1] = (MCode)(int8_t)(target-p);
+  p[-2] = (MCode)(XI_JCCs+(cc&15));
+  as->mcp = p - 2;
+}
+
+/* jcc short (pending target) */
+/* Emits a placeholder displacement of 0; patch later with emit_sfixup. */
+static MCLabel emit_sjcc_label(ASMState *as, int cc)
+{
+  MCode *p = as->mcp;
+  p[-1] = 0;
+  p[-2] = (MCode)(XI_JCCs+(cc&15));
+  as->mcp = p - 2;
+  return p;
+}
+
+/* Fixup jcc short target. */
+static void emit_sfixup(ASMState *as, MCLabel source)
+{
+  source[-1] = (MCode)(as->mcp-source);
+}
+
+/* Return label pointing to current PC. */
+#define emit_label(as)		((as)->mcp)
+
+/* jcc target */
+/* Near conditional jump: 0F 8x rel32, 6 bytes total. */
+static void emit_jcc(ASMState *as, int cc, MCode *target)
+{
+  MCode *p = as->mcp;
+  int32_t addr = (int32_t)(target - p);
+  *(int32_t *)(p-4) = addr;
+  p[-5] = (MCode)(XI_JCCn+(cc&15));
+  p[-6] = 0x0f;
+  as->mcp = p - 6;
+}
+
+/* call target */
+/* Near call: E8 rel32, 5 bytes. */
+static void emit_call_(ASMState *as, MCode *target)
+{
+  MCode *p = as->mcp;
+  *(int32_t *)(p-4) = (int32_t)(target - p);
+  p[-5] = XI_CALL;
+  as->mcp = p - 5;
+}
+
+#define emit_call(as, f)	emit_call_(as, (MCode *)(void *)(f))
+
+/* Argument setup for C calls. Up to 3 args need no stack adjustment.
+** Arguments are stored directly into the outgoing stack slots.
+** Note: removed a stray trailing semicolon from emit_setargr so it
+** behaves like an expression, consistent with its sibling macros
+** (a hidden empty statement breaks brace-less if/else at call sites).
+*/
+#define emit_setargr(as, narg, r) \
+  emit_movtomro(as, (r), RID_ESP, ((narg)-1)*4)
+#define emit_setargi(as, narg, imm) \
+  emit_movmroi(as, RID_ESP, ((narg)-1)*4, (imm))
+#define emit_setargp(as, narg, ptr) \
+  emit_setargi(as, (narg), ptr2addr((ptr)))
+
+/* -- Register allocator debugging ---------------------------------------- */
+
+/* #define LUAJIT_DEBUG_RA */
+
+#ifdef LUAJIT_DEBUG_RA
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#define RIDNAME(name)	#name,
+static const char *const ra_regname[] = {
+  GPRDEF(RIDNAME)
+  FPRDEF(RIDNAME)
+  "mrm",
+  NULL
+};
+#undef RIDNAME
+
+/* Static line buffer for register-allocator debug output. */
+static char ra_dbg_buf[65536];
+static char *ra_dbg_p;		/* Current write position. */
+static char *ra_dbg_merge;	/* Merge point of the previous line. */
+static MCode *ra_dbg_mcp;	/* MCode pointer at the previous line. */
+
+/* Reset the debug buffer state. */
+static void ra_dstart(void)
+{
+  ra_dbg_p = ra_dbg_buf;
+  ra_dbg_merge = NULL;
+  ra_dbg_mcp = NULL;
+}
+
+/* Flush accumulated debug output to stdout and reset. */
+static void ra_dflush(void)
+{
+  fwrite(ra_dbg_buf, 1, (size_t)(ra_dbg_p-ra_dbg_buf), stdout);
+  ra_dstart();
+}
+
+/* Format one register-allocator debug line into the static buffer.
+** Custom '$' directives: $r = register name, $f = IRRef, $i = IRIns
+** pointer, $s = spill slot. Merges with the previous line if no code
+** was emitted since (ra_dbg_mcp still equals as->mcp).
+** Fix: passing a uintptr_t for "%08x" is undefined behavior on 64-bit
+** targets (format/argument size mismatch); use %08lx with an explicit
+** unsigned long cast, and cast the unsigned ref arithmetic for %d.
+*/
+static void ra_dprintf(ASMState *as, const char *fmt, ...)
+{
+  char *p;
+  va_list argp;
+  va_start(argp, fmt);
+  p = ra_dbg_mcp == as->mcp ? ra_dbg_merge : ra_dbg_p;
+  ra_dbg_mcp = NULL;
+  p += sprintf(p, "%08lx  \e[36m%04d ", (unsigned long)(uintptr_t)as->mcp,
+	       (int)(as->curins - REF_BIAS));
+  for (;;) {
+    const char *e = strchr(fmt, '$');
+    if (e == NULL) break;
+    memcpy(p, fmt, (size_t)(e-fmt));
+    p += e-fmt;
+    if (e[1] == 'r') {
+      Reg r = va_arg(argp, Reg) & RID_MASK;
+      if (r <= RID_MAX) {
+	const char *q;
+	for (q = ra_regname[r]; *q; q++)
+	  *p++ = *q >= 'A' && *q <= 'Z' ? *q + 0x20 : *q;  /* Lowercase. */
+      } else {
+	*p++ = '?';
+	lua_assert(0);
+      }
+    } else if (e[1] == 'f' || e[1] == 'i') {
+      IRRef ref;
+      if (e[1] == 'f')
+	ref = va_arg(argp, IRRef);
+      else
+	ref = va_arg(argp, IRIns *) - as->ir;
+      if (ref >= REF_BIAS)
+	p += sprintf(p, "%04d", (int)(ref - REF_BIAS));
+      else
+	p += sprintf(p, "K%03d", (int)(REF_BIAS - ref));
+    } else if (e[1] == 's') {
+      uint32_t slot = va_arg(argp, uint32_t);
+      p += sprintf(p, "[esp+0x%x]", sps_scale(slot));
+    } else {
+      lua_assert(0);
+    }
+    fmt = e+2;
+  }
+  va_end(argp);
+  while (*fmt)
+    *p++ = *fmt++;
+  *p++ = '\e'; *p++ = '['; *p++ = 'm'; *p++ = '\n';
+  if (p > ra_dbg_buf+sizeof(ra_dbg_buf)-256) {  /* Flush when nearly full. */
+    fwrite(ra_dbg_buf, 1, (size_t)(p-ra_dbg_buf), stdout);
+    p = ra_dbg_buf;
+  }
+  ra_dbg_p = p;
+}
+
+#define RA_DBG_START()	ra_dstart()
+#define RA_DBG_FLUSH()	ra_dflush()
+#define RA_DBG_REF() \
+  do { char *_p = ra_dbg_p; ra_dprintf(as, ""); \
+       ra_dbg_merge = _p; ra_dbg_mcp = as->mcp; } while (0)
+#define RA_DBGX(x)	ra_dprintf x
+
+#else
+#define RA_DBG_START()	((void)0)
+#define RA_DBG_FLUSH()	((void)0)
+#define RA_DBG_REF()	((void)0)
+#define RA_DBGX(x)	((void)0)
+#endif
+
+/* -- Register allocator -------------------------------------------------- */
+
+#define ra_free(as, r)		rset_set(as->freeset, (r))
+#define ra_modified(as, r)	rset_set(as->modset, (r))
+
+#define ra_used(ir)		(ra_hasreg((ir)->r) || ra_hasspill((ir)->s))
+
+/* Setup register allocator. */
+static void ra_setup(ASMState *as)
+{
+  /* Initially all regs (except the stack pointer) are free for use. */
+  as->freeset = RSET_ALL;
+  as->modset = RSET_EMPTY;
+  as->phiset = RSET_EMPTY;
+  memset(as->phireg, 0, sizeof(as->phireg));
+  memset(as->cost, 0, sizeof(as->cost));
+  as->cost[RID_ESP] = REGCOST(~0u, 0u);  /* Maximum cost: never evict ESP. */
+
+  /* Start slots for spill slot allocation. */
+  as->evenspill = (SPS_FIRST+1)&~1;
+  as->oddspill = (SPS_FIRST&1) ? SPS_FIRST : 0;
+}
+
+/* Rematerialize constants. */
+/* Free the register held by a constant (or BASE) and re-emit the code that
+** reloads the value, instead of spilling it. Returns the freed register.
+*/
+static Reg ra_rematk(ASMState *as, IRIns *ir)
+{
+  Reg r = ir->r;
+  lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
+  ra_free(as, r);
+  ra_modified(as, r);
+  ir->r = RID_INIT;  /* Do not keep any hint. */
+  RA_DBGX((as, "remat     $i $r", ir, r));
+  if (ir->o == IR_KNUM) {
+    emit_loadn(as, r, ir_knum(ir));
+  } else if (ir->o == IR_BASE) {
+    ra_sethint(ir->r, RID_BASE);  /* Restore BASE register hint. */
+    emit_getgl(as, r, jit_base);
+  } else {
+    lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
+	       ir->o == IR_KPTR || ir->o == IR_KNULL);
+    emit_loadi(as, r, ir->i);
+  }
+  return r;
+}
+
+/* Force a spill. Allocate a new spill slot if needed. */
+/* FP numbers need an aligned pair of slots; an odd slot left over from a
+** pair is remembered in as->oddspill and reused for the next int spill.
+** Returns the byte offset of the slot relative to ESP.
+*/
+static int32_t ra_spill(ASMState *as, IRIns *ir)
+{
+  int32_t slot = ir->s;
+  if (!ra_hasspill(slot)) {
+    if (irt_isnum(ir->t)) {
+      slot = as->evenspill;
+      as->evenspill += 2;
+    } else if (as->oddspill) {
+      slot = as->oddspill;
+      as->oddspill = 0;
+    } else {
+      slot = as->evenspill;
+      as->oddspill = slot+1;
+      as->evenspill += 2;
+    }
+    if (as->evenspill > 256)
+      lj_trace_err(as->J, LJ_TRERR_SPILLOV);  /* Spill slot overflow. */
+    ir->s = (uint8_t)slot;
+  }
+  return sps_scale(slot);
+}
+
+/* Restore a register (marked as free). Rematerialize or force a spill. */
+/* Note: emitting the reload here works because code is generated backwards;
+** the actual save to the spill slot is emitted later by ra_save.
+*/
+static Reg ra_restore(ASMState *as, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (irref_isk(ref) || ref == REF_BASE) {
+    return ra_rematk(as, ir);  /* Constants are cheaper to rematerialize. */
+  } else {
+    Reg r = ir->r;
+    lua_assert(ra_hasreg(r));
+    ra_free(as, r);
+    ra_modified(as, r);
+    ra_sethint(ir->r, r);  /* Keep hint. */
+    RA_DBGX((as, "restore   $i $r", ir, r));
+    emit_movrmro(as, r, RID_ESP, ra_spill(as, ir));  /* Force a spill. */
+    return r;
+  }
+}
+
+/* Save a register to a spill slot. */
+static LJ_AINLINE void ra_save(ASMState *as, IRIns *ir, Reg r)
+{
+  RA_DBGX((as, "save      $i $r", ir, r));
+  emit_rmro(as, r < RID_MAX_GPR ? XO_MOVto : XO_MOVSDto,
+	    r, RID_ESP, sps_scale(ir->s));
+}
+
+/* Track the minimum cost over the allowed registers. */
+#define MINCOST(r) \
+  if (LJ_LIKELY(allow&RID2RSET(r)) && as->cost[r] < cost) \
+    cost = as->cost[r]
+
+/* Evict the register with the lowest cost, forcing a restore. */
+/* The GPR vs. FPR half of the cost array is chosen by the allow set. */
+static Reg ra_evict(ASMState *as, RegSet allow)
+{
+  RegCost cost = ~(RegCost)0;
+  if (allow < RID2RSET(RID_MAX_GPR)) {
+    MINCOST(RID_EAX);MINCOST(RID_ECX);MINCOST(RID_EDX);MINCOST(RID_EBX);
+    MINCOST(RID_EBP);MINCOST(RID_ESI);MINCOST(RID_EDI);
+#if LJ_64
+    MINCOST(RID_R8D);MINCOST(RID_R9D);MINCOST(RID_R10D);MINCOST(RID_R11D);
+    MINCOST(RID_R12D);MINCOST(RID_R13D);MINCOST(RID_R14D);MINCOST(RID_R15D);
+#endif
+  } else {
+    MINCOST(RID_XMM0);MINCOST(RID_XMM1);MINCOST(RID_XMM2);MINCOST(RID_XMM3);
+    MINCOST(RID_XMM4);MINCOST(RID_XMM5);MINCOST(RID_XMM6);MINCOST(RID_XMM7);
+#if LJ_64
+    MINCOST(RID_XMM8);MINCOST(RID_XMM9);MINCOST(RID_XMM10);MINCOST(RID_XMM11);
+    MINCOST(RID_XMM12);MINCOST(RID_XMM13);MINCOST(RID_XMM14);MINCOST(RID_XMM15);
+#endif
+  }
+  lua_assert(allow != RSET_EMPTY);
+  lua_assert(regcost_ref(cost) >= as->T->nk && regcost_ref(cost) < as->T->nins);
+  return ra_restore(as, regcost_ref(cost));
+}
+
+/* Pick a free register from 'allow'; evict one on demand if none is free.
+** The returned register is still marked as free.
+*/
+static LJ_AINLINE Reg ra_pick(ASMState *as, RegSet allow)
+{
+  RegSet pick = as->freeset & allow;
+  return pick ? rset_picktop(pick) : ra_evict(as, allow);
+}
+
+/* Get a scratch register (still marked as free, but marked modified). */
+static LJ_AINLINE Reg ra_scratch(ASMState *as, RegSet allow)
+{
+  Reg r = ra_pick(as, allow);
+  ra_modified(as, r);
+  RA_DBGX((as, "scratch        $r", r));
+  return r;
+}
+
+/* Evict all registers from a set (if not free). */
+/* The whole set is marked as modified, even registers that were free. */
+static void ra_evictset(ASMState *as, RegSet drop)
+{
+  as->modset |= drop;
+  drop &= ~as->freeset;  /* Only occupied registers need a restore. */
+  while (drop) {
+    Reg r = rset_picktop(drop);
+    ra_restore(as, regcost_ref(as->cost[r]));
+    rset_clear(drop, r);
+    checkmclim(as);
+  }
+}
+
+/* Allocate a register for ref from the allowed set of registers.
+** Note: this function assumes the ref does NOT have a register yet!
+** Picks an optimal register, sets the cost and marks the register as non-free.
+*/
+static Reg ra_allocref(ASMState *as, IRRef ref, RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  RegSet pick = as->freeset & allow;
+  Reg r;
+  lua_assert(ra_noreg(ir->r));
+  if (pick) {
+    /* First check register hint from propagation or PHI. */
+    if (ra_hashint(ir->r)) {
+      r = ra_gethint(ir->r);
+      if (rset_test(pick, r))  /* Use hint register if possible. */
+	goto found;
+      /* Rematerialization is cheaper than missing a hint. */
+      if (rset_test(allow, r) && irref_isk(regcost_ref(as->cost[r]))) {
+	ra_rematk(as, IR(regcost_ref(as->cost[r])));
+	goto found;
+      }
+      RA_DBGX((as, "hintmiss  $f $r", ref, r));
+    }
+    /* Invariants should preferably get unused registers. */
+    if (ref < as->loopref && !irt_isphi(ir->t))
+      r = rset_pickbot(pick);
+    else
+      r = rset_picktop(pick);
+  } else {
+    r = ra_evict(as, allow);  /* Nothing free: evict the cheapest register. */
+  }
+found:
+  RA_DBGX((as, "alloc     $f $r", ref, r));
+  ir->r = (uint8_t)r;
+  rset_clear(as->freeset, r);
+  as->cost[r] = REGCOST_REF_T(ref, irt_t(ir->t));
+  return r;
+}
+
+/* Allocate a register on-demand. */
+static LJ_INLINE Reg ra_alloc1(ASMState *as, IRRef ref, RegSet allow)
+{
+  Reg r = IR(ref)->r;
+  /* Note: allow is ignored if the register is already allocated. */
+  if (ra_noreg(r)) r = ra_allocref(as, ref, allow);
+  return r;
+}
+
+/* Rename register allocation and emit move. */
+/* Moves the allocation of the value in 'down' to register 'up' and records
+** an IR_RENAME instruction so snapshots know the old location.
+*/
+static void ra_rename(ASMState *as, Reg down, Reg up)
+{
+  IRRef ren, ref = regcost_ref(as->cost[up] = as->cost[down]);
+  IR(ref)->r = (uint8_t)up;
+  as->cost[down] = 0;
+  lua_assert((down < RID_MAX_GPR) == (up < RID_MAX_GPR));
+  lua_assert(!rset_test(as->freeset, down) && rset_test(as->freeset, up));
+  rset_set(as->freeset, down);  /* 'down' is free ... */
+  rset_clear(as->freeset, up);  /* ... and 'up' is now allocated. */
+  RA_DBGX((as, "rename    $f $r $r", regcost_ref(as->cost[up]), down, up));
+  emit_movrr(as, down, up);  /* Backwards code generation needs inverse move. */
+  if (!ra_hasspill(IR(ref)->s)) {  /* Add the rename to the IR. */
+    lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, as->snapno);
+    ren = tref_ref(lj_ir_emit(as->J));
+    as->ir = as->T->ir;  /* The IR may have been reallocated. */
+    IR(ren)->r = (uint8_t)down;
+    IR(ren)->s = SPS_NONE;
+  }
+}
+
+/* Pick a destination register (marked as free).
+** Caveat: allow is ignored if there's already a destination register.
+** Use ra_destreg() to get a specific register.
+*/
+static Reg ra_dest(ASMState *as, IRIns *ir, RegSet allow)
+{
+  Reg dest = ir->r;
+  if (ra_hasreg(dest)) {
+    ra_free(as, dest);
+    ra_modified(as, dest);
+  } else {
+    dest = ra_scratch(as, allow);
+  }
+  if (LJ_UNLIKELY(ra_hasspill(ir->s))) ra_save(as, ir, dest);
+  return dest;
+}
+
+/* Force a specific destination register (marked as free). */
+/* If the instruction already got a different register, reserve the wanted
+** one and emit a move from it (backwards codegen: move runs after def).
+*/
+static void ra_destreg(ASMState *as, IRIns *ir, Reg r)
+{
+  Reg dest = ra_dest(as, ir, RID2RSET(r));
+  if (dest != r) {
+    ra_scratch(as, RID2RSET(r));
+    emit_movrr(as, dest, r);
+  }
+}
+
+/* Propagate dest register to left reference. Emit moves as needed.
+** This is a required fixup step for all 2-operand machine instructions.
+*/
+static void ra_left(ASMState *as, Reg dest, IRRef lref)
+{
+  IRIns *ir = IR(lref);
+  Reg left = ir->r;
+  if (ra_noreg(left)) {
+    if (irref_isk(lref)) {
+      if (ir->o == IR_KNUM) {
+	cTValue *tv = ir_knum(ir);
+	/* FP remat needs a load except for +0. Still better than eviction. */
+	if (tvispzero(tv) || !(as->freeset & RSET_FPR)) {
+	  emit_loadn(as, dest, tv);
+	  return;
+	}
+      } else {
+	lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
+		   ir->o == IR_KPTR || ir->o == IR_KNULL);
+	emit_loadi(as, dest, ir->i);  /* Load the constant directly. */
+	return;
+      }
+    }
+    if (!ra_hashint(left) && !iscrossref(as, lref))
+      ra_sethint(ir->r, dest);  /* Propagate register hint. */
+    left = ra_allocref(as, lref, dest < RID_MAX_GPR ? RSET_GPR : RSET_FPR);
+  }
+  /* Move needed for true 3-operand instruction: y=a+b ==> y=a; y+=b. */
+  if (dest != left) {
+    /* Use register renaming if dest is the PHI reg. */
+    if (irt_isphi(ir->t) && as->phireg[dest] == lref) {
+      ra_modified(as, left);
+      ra_rename(as, left, dest);
+    } else {
+      emit_movrr(as, dest, left);
+    }
+  }
+}
+
+/* -- Exit stubs ---------------------------------------------------------- */
+
+/* Generate an exit stub group at the bottom of the reserved MCode memory. */
+/* Each stub pushes the low byte of its exit number and jumps down the chain;
+** a shared tail pushes the high byte, stores DISPATCH and jumps to the
+** exit handler. Returns a pointer to the first stub of the group.
+*/
+static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
+{
+  ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
+  MCode *mxp = as->mcbot;
+  MCode *mxpstart = mxp;
+  if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
+    asm_mclimit(as);
+  /* Push low byte of exitno for each exit stub. */
+  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
+  for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
+    *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
+    *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
+  }
+  /* Push the high byte of the exitno for each exit stub group. */
+  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
+  /* Store DISPATCH in ExitInfo->dispatch. Account for the two push ops. */
+  *mxp++ = XI_MOVmi;
+  *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
+  *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
+  *mxp++ = 2*sizeof(void *);
+  *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
+  /* Jump to exit handler which fills in the ExitState. */
+  *mxp++ = XI_JMP; mxp += 4;
+  *((int32_t *)(mxp-4)) = (int32_t)((MCode *)lj_vm_exit_handler - mxp);
+  /* Commit the code for this group (even if assembly fails later on). */
+  lj_mcode_commitbot(as->J, mxp);
+  as->mcbot = mxp;
+  as->mclim = as->mcbot + MCLIM_REDZONE;
+  return mxpstart;
+}
+
+/* Setup all needed exit stubs. */
+/* Lazily generates any stub group required for up to 'nexits' exits. */
+static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
+{
+  ExitNo i;
+  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
+    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
+  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
+    if (as->J->exitstubgroup[i] == NULL)
+      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
+}
+
+/* -- Snapshot and guard handling ----------------------------------------- */
+
+/* Can we rematerialize a KNUM instead of forcing a spill? */
+/* True if any FP register currently holds a constant. */
+static int asm_snap_canremat(ASMState *as)
+{
+  Reg r;
+  for (r = RID_MIN_FPR; r < RID_MAX_FPR; r++)
+    if (irref_isk(regcost_ref(as->cost[r])))
+      return 1;
+  return 0;
+}
+
+/* Allocate registers or spill slots for refs escaping to a snapshot. */
+static void asm_snap_alloc(ASMState *as)
+{
+  SnapShot *snap = &as->T->snap[as->snapno];
+  IRRef2 *map = &as->T->snapmap[snap->mapofs];
+  BCReg s, nslots = snap->nslots;
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = snap_ref(map[s]);
+    if (!irref_isk(ref)) {  /* Constants need no storage. */
+      IRIns *ir = IR(ref);
+      if (!ra_used(ir) && ir->o != IR_FRAME) {
+	RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+	/* Not a var-to-invar ref and got a free register (or a remat)? */
+	if ((!iscrossref(as, ref) || irt_isphi(ir->t)) &&
+	    ((as->freeset & allow) ||
+	     (allow == RSET_FPR && asm_snap_canremat(as)))) {
+	  ra_allocref(as, ref, allow);  /* Allocate a register. */
+	  checkmclim(as);
+	  RA_DBGX((as, "snapreg   $f $r", ref, ir->r));
+	} else {
+	  ra_spill(as, ir);  /* Otherwise force a spill slot. */
+	  RA_DBGX((as, "snapspill $f $s", ref, ir->s));
+	}
+      }
+    }
+  }
+}
+
+/* All guards for a snapshot use the same exitno. This is currently the
+** same as the snapshot number. Since the exact origin of the exit cannot
+** be determined, all guards for the same snapshot must exit with the same
+** RegSP mapping.
+** A renamed ref which has been used in a prior guard for the same snapshot
+** would cause an inconsistency. The easy way out is to force a spill slot.
+*/
+static int asm_snap_checkrename(ASMState *as, IRRef ren)
+{
+  SnapShot *snap = &as->T->snap[as->snapno];
+  IRRef2 *map = &as->T->snapmap[snap->mapofs];
+  BCReg s, nslots = snap->nslots;
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = snap_ref(map[s]);
+    if (ref == ren) {
+      IRIns *ir = IR(ref);
+      ra_spill(as, ir);  /* Register renamed, so force a spill slot. */
+      RA_DBGX((as, "snaprensp $f $s", ref, ir->s));
+      return 1;  /* Found. */
+    }
+  }
+  return 0;  /* Not found. */
+}
+
+/* Prepare snapshot for next guard instruction. */
+/* Walks snapshots downwards (codegen is backwards) and allocates storage
+** for the newly active snapshot, or validates renames emitted meanwhile.
+*/
+static void asm_snap_prep(ASMState *as)
+{
+  if (as->curins < as->snapref) {
+    do {
+      lua_assert(as->snapno != 0);
+      as->snapno--;
+      as->snapref = as->T->snap[as->snapno].ref;
+    } while (as->curins < as->snapref);
+    asm_snap_alloc(as);
+    as->snaprename = as->T->nins;
+  } else {
+    /* Process any renames above the highwater mark. */
+    for (; as->snaprename < as->T->nins; as->snaprename++) {
+      IRIns *ir = IR(as->snaprename);
+      if (asm_snap_checkrename(as, ir->op1))
+	ir->op2 = REF_BIAS-1;  /* Kill rename. */
+    }
+  }
+}
+
+/* Emit conditional branch to exit for guard.
+** It's important to emit this *after* all registers have been allocated,
+** because rematerializations may invalidate the flags.
+*/
+static void asm_guardcc(ASMState *as, int cc)
+{
+  MCode *target = exitstub_addr(as->J, as->snapno);
+  MCode *p = as->mcp;
+  if (LJ_UNLIKELY(p == as->invmcp)) {
+    /* Invert the loop branch: guard jumps out, fallthrough continues. */
+    as->loopinv = 1;
+    *(int32_t *)(p+1) = target - (p+5);
+    target = p;
+    cc ^= 1;  /* Inverted condition code. */
+    if (as->realign) {
+      emit_sjcc(as, cc, target);  /* Short jump suffices when realigning. */
+      return;
+    }
+  }
+  emit_jcc(as, cc, target);
+}
+
+/* -- Memory operand fusion ----------------------------------------------- */
+
+/* Arch-specific field offsets. */
+/* Maps each IRFL_* field-load constant to its byte offset, generated
+** from IRFLDEF. The trailing 0 terminates the IRFL__MAX+1 sized array.
+*/
+static const uint8_t field_ofs[IRFL__MAX+1] = {
+#define FLOFS(name, type, field)	(uint8_t)offsetof(type, field),
+IRFLDEF(FLOFS)
+#undef FLOFS
+  0
+};
+
+/* Limit linear search to this distance. Avoids O(n^2) behavior. */
+#define CONFLICT_SEARCH_LIM	15
+
+/* Check that no instruction with opcode 'conflict' occurs strictly
+** between ref and the current instruction. Search is capped at
+** CONFLICT_SEARCH_LIM instructions to avoid O(n^2) behavior.
+*/
+static int noconflict(ASMState *as, IRRef ref, IROp conflict)
+{
+  IRIns *instbuf = as->ir;
+  IRRef idx;
+  if (as->curins > ref + CONFLICT_SEARCH_LIM)
+    return 0;  /* Give up, ref is too far away. */
+  for (idx = as->curins - 1; idx > ref; idx--) {
+    if (instbuf[idx].o == conflict)
+      return 0;  /* Conflict found. */
+  }
+  return 1;  /* Ok, no conflict. */
+}
+
+/* Fuse array reference into memory operand. */
+static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
+{
+  IRIns *irb = IR(ir->op1);
+  IRIns *ira, *irx;
+  lua_assert(ir->o == IR_AREF);
+  lua_assert(irb->o == IR_FLOAD && irb->op2 == IRFL_TAB_ARRAY);
+  ira = IR(irb->op1);
+  if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
+      noconflict(as, irb->op1, IR_NEWREF)) {
+    /* We can avoid the FLOAD of t->array for colocated arrays. */
+    as->mrm.base = (uint8_t)ra_alloc1(as, irb->op1, allow);  /* Table obj. */
+    as->mrm.ofs = -(int32_t)(ira->op1*sizeof(TValue));  /* Ofs to colo array. */
+  } else {
+    as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);  /* Array base. */
+    as->mrm.ofs = 0;
+  }
+  irx = IR(ir->op2);
+  if (irref_isk(ir->op2)) {
+    /* Constant index folds entirely into the displacement (8-byte slots). */
+    as->mrm.ofs += 8*irx->i;
+    as->mrm.idx = RID_NONE;
+  } else {
+    rset_clear(allow, as->mrm.base);
+    as->mrm.scale = XM_SCALE8;  /* Index scaled by the 8-byte slot size. */
+    /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
+    ** Doesn't help much without ABCelim, but reduces register pressure.
+    */
+    if (mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
+	irx->o == IR_ADD && irref_isk(irx->op2)) {
+      as->mrm.ofs += 8*IR(irx->op2)->i;
+      as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
+    } else {
+      as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
+    }
+  }
+}
+
+/* Fuse array/hash/upvalue reference into memory operand.
+** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
+** pass the final allow mask, excluding any GPRs used for other inputs.
+** In particular: 2-operand GPR instructions need to call ra_dest() first!
+*/
+static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  /* Only fuse if the ref has no register yet; otherwise fall through
+  ** and use it as a plain base register.
+  */
+  if (ra_noreg(ir->r)) {
+    switch ((IROp)ir->o) {
+    case IR_AREF:
+      if (mayfuse(as, ref)) {
+	asm_fusearef(as, ir, allow);
+	return;
+      }
+      break;
+    case IR_HREFK:
+      if (mayfuse(as, ref)) {
+	/* Node offset is a compile-time constant from the key slot. */
+	as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
+	as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
+	as->mrm.idx = RID_NONE;
+	return;
+      }
+      break;
+    case IR_UREFC:
+      if (irref_isk(ir->op1)) {
+	/* Closed upvalue of a constant function: absolute address. */
+	GCfunc *fn = ir_kfunc(IR(ir->op1));
+	GCupval *uv = &gcref(fn->l.uvptr[ir->op2])->uv;
+	as->mrm.ofs = ptr2addr(&uv->tv);
+	as->mrm.base = as->mrm.idx = RID_NONE;
+	return;
+      }
+      break;
+    default:
+      lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO);
+      break;
+    }
+  }
+  /* Fallback: materialize the ref in a register, no fusion. */
+  as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
+  as->mrm.ofs = 0;
+  as->mrm.idx = RID_NONE;
+}
+
+/* Fuse FLOAD/FREF reference into memory operand. */
+static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
+{
+  lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
+  as->mrm.ofs = field_ofs[ir->op2];  /* Field offset from the FLOFS table. */
+  as->mrm.idx = RID_NONE;
+  if (irref_isk(ir->op1)) {
+    /* Constant object: fold its address into an absolute displacement. */
+    as->mrm.ofs += IR(ir->op1)->i;
+    as->mrm.base = RID_NONE;
+  } else {
+    as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
+  }
+}
+
+/* Fuse string reference into memory operand. */
+static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
+{
+  IRIns *irr;
+  lua_assert(ir->o == IR_STRREF);
+  as->mrm.idx = as->mrm.base = RID_NONE;
+  as->mrm.scale = XM_SCALE1;
+  as->mrm.ofs = sizeof(GCstr);  /* String data follows the GCstr header. */
+  if (irref_isk(ir->op1)) {
+    as->mrm.ofs += IR(ir->op1)->i;  /* Constant string: fold address. */
+  } else {
+    Reg r = ra_alloc1(as, ir->op1, allow);
+    rset_clear(allow, r);
+    as->mrm.base = (uint8_t)r;
+  }
+  irr = IR(ir->op2);
+  if (irref_isk(ir->op2)) {
+    as->mrm.ofs += irr->i;  /* Constant index folds into displacement. */
+  } else {
+    Reg r;
+    /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
+    if (mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
+      as->mrm.ofs += IR(irr->op2)->i;
+      r = ra_alloc1(as, irr->op1, allow);
+    } else {
+      r = ra_alloc1(as, ir->op2, allow);
+    }
+    /* Use base slot if still free, otherwise the index slot. */
+    if (as->mrm.base == RID_NONE)
+      as->mrm.base = (uint8_t)r;
+    else
+      as->mrm.idx = (uint8_t)r;
+  }
+}
+
+/* Fuse load into memory operand. */
+/* Returns either a plain register or RID_MRM, in which case the operand
+** has been set up in as->mrm. allow == RSET_EMPTY forces a memory operand.
+*/
+static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  if (ra_hasreg(ir->r)) {
+    if (allow != RSET_EMPTY) return ir->r;  /* Fast path. */
+  fusespill:
+    /* Force a spill if only memory operands are allowed (asm_x87load). */
+    as->mrm.base = RID_ESP;
+    as->mrm.ofs = ra_spill(as, ir);
+    as->mrm.idx = RID_NONE;
+    return RID_MRM;
+  }
+  if (ir->o == IR_KNUM) {
+    lua_assert(allow != RSET_EMPTY);
+    /* Use the constant's address directly if no FPR can be spared. */
+    if (!(as->freeset & ~as->modset & RSET_FPR)) {
+      as->mrm.ofs = ptr2addr(ir_knum(ir));
+      as->mrm.base = as->mrm.idx = RID_NONE;
+      return RID_MRM;
+    }
+  } else if (mayfuse(as, ref)) {
+    RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
+    if (ir->o == IR_SLOAD) {
+      if (!irt_isint(ir->t) && !(ir->op2 & IRSLOAD_PARENT)) {
+	/* Stack slot relative to BASE (8 bytes per slot). */
+	as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
+	as->mrm.ofs = 8*((int32_t)ir->op1-1);
+	as->mrm.idx = RID_NONE;
+	return RID_MRM;
+      }
+    } else if (ir->o == IR_FLOAD) {
+      /* Generic fusion is only ok for IRT_INT operand (but see asm_comp). */
+      if (irt_isint(ir->t) && noconflict(as, ref, IR_FSTORE)) {
+	asm_fusefref(as, ir, xallow);
+	return RID_MRM;
+      }
+    } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
+      /* Only fuse if no store of the same kind may intervene. */
+      if (noconflict(as, ref, ir->o + IRDELTA_L2S)) {
+	asm_fuseahuref(as, ir->op1, xallow);
+	return RID_MRM;
+      }
+    } else if (ir->o == IR_XLOAD) {
+      /* Generic fusion is only ok for IRT_INT operand (but see asm_comp).
+      ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
+      */
+      if (irt_isint(ir->t)) {
+	asm_fusestrref(as, IR(ir->op1), xallow);
+	return RID_MRM;
+      }
+    }
+  }
+  /* Spill instead of allocating when no allowed register is free. */
+  if (!(as->freeset & allow) &&
+      (allow == RSET_EMPTY || ra_hasspill(ir->s) || ref < as->loopref))
+    goto fusespill;
+  return ra_allocref(as, ref, allow);
+}
+
+/* -- Type conversions ---------------------------------------------------- */
+
+/* Convert int to number: cvtsi2sd dest, left. */
+static void asm_tonum(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg left = asm_fuseload(as, ir->op1, RSET_GPR);
+  emit_mrm(as, XO_CVTSI2SD, dest, left);
+  if (!(as->flags & JIT_F_SPLIT_XMM))
+    emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
+}
+
+/* Guarded number-to-int conversion: exits unless the number converts
+** exactly (round-trip cvttsd2si/cvtsi2sd must compare equal, no NaN).
+*/
+static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+{
+  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_guardcc(as, CC_P);   /* Exit on unordered compare (NaN). */
+  asm_guardcc(as, CC_NE);  /* Exit if round-trip value differs. */
+  emit_rr(as, XO_UCOMISD, left, tmp);
+  emit_rr(as, XO_CVTSI2SD, tmp, dest);
+  if (!(as->flags & JIT_F_SPLIT_XMM))
+    emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
+  emit_rr(as, XO_CVTTSD2SI, dest, left);
+  /* Can't fuse since left is needed twice. */
+}
+
+/* Unguarded number-to-int conversion: cvtsd2si dest, left. */
+static void asm_toint(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
+  emit_mrm(as, XO_CVTSD2SI, dest, left);
+}
+
+/* Number to bit pattern: add the op2 bias, then move the low 32 bits
+** of the FP result to a GPR (movd).
+*/
+static void asm_tobit(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg tmp = ra_noreg(IR(ir->op1)->r) ?
+	      ra_alloc1(as, ir->op1, RSET_FPR) :
+	      ra_scratch(as, RSET_FPR);
+  Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
+  emit_rr(as, XO_MOVDto, tmp, dest);
+  emit_mrm(as, XO_ADDSD, tmp, right);
+  ra_left(as, tmp, ir->op1);  /* tmp may be clobbered; tie it to op1. */
+}
+
+/* Guarded string-to-number conversion via lj_str_numconv().
+** NOTE: machine code is emitted backwards (labels precede the branches
+** that target them elsewhere in this file), so the argument setup
+** appearing *after* emit_call() below executes *before* the call.
+*/
+static void asm_strto(ASMState *as, IRIns *ir)
+{
+  Reg str;
+  int32_t ofs;
+  RegSet drop = RSET_SCRATCH;
+  /* Force a spill slot for the destination register (if any). */
+  if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
+    rset_set(drop, ir->r);  /* WIN64 doesn't spill all FPRs. */
+  ra_evictset(as, drop);
+  asm_guardcc(as, CC_E);  /* Exit if conversion returned 0 (failure). */
+  emit_rr(as, XO_TEST, RID_RET, RID_RET);
+  /* int lj_str_numconv(const char *s, TValue *n) */
+  emit_call(as, lj_str_numconv);
+  ofs = sps_scale(ir->s);  /* Use spill slot or slots SPS_TEMP1/2. */
+  if (ofs == 0) {
+    emit_setargr(as, 2, RID_ESP);
+  } else {
+    emit_setargr(as, 2, RID_RET);
+    emit_rmro(as, XO_LEA, RID_RET, RID_ESP, ofs);
+  }
+  emit_setargr(as, 1, RID_RET);
+  str = ra_alloc1(as, ir->op1, RSET_GPR);
+  emit_rmro(as, XO_LEA, RID_RET, str, sizeof(GCstr));  /* Skip header. */
+}
+
+/* Convert number or int to string via lj_str_fromnum/lj_str_fromint. */
+static void asm_tostr(ASMState *as, IRIns *ir)
+{
+  IRIns *irl = IR(ir->op1);
+  ra_destreg(as, ir, RID_RET);  /* Result arrives in the return register. */
+  ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
+  as->gcsteps++;  /* The call may allocate a GCstr. */
+  if (irt_isnum(irl->t)) {
+    /* GCstr *lj_str_fromnum(lua_State *L, const lua_Number *np) */
+    emit_call(as, lj_str_fromnum);
+    emit_setargr(as, 1, RID_RET);
+    emit_getgl(as, RID_RET, jit_L);
+    emit_setargr(as, 2, RID_RET);
+    emit_rmro(as, XO_LEA, RID_RET, RID_ESP, ra_spill(as, irl));
+  } else {
+    /* GCstr *lj_str_fromint(lua_State *L, int32_t k) */
+    emit_call(as, lj_str_fromint);
+    emit_setargr(as, 1, RID_RET);
+    emit_getgl(as, RID_RET, jit_L);
+    emit_setargr(as, 2, ra_alloc1(as, ir->op1, RSET_GPR));
+  }
+}
+
+/* -- Memory references --------------------------------------------------- */
+
+/* Compute array slot address: LEA from the fused operand, or a plain
+** MOV when the fused form degenerates to just a base register.
+*/
+static void asm_aref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_fusearef(as, ir, RSET_GPR);
+  if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
+    emit_mrm(as, XO_LEA, dest, RID_MRM);
+  else if (as->mrm.base != dest)
+    emit_rr(as, XO_MOV, dest, as->mrm.base);
+}
+
+/* Must match with hashkey() and hashrot() in lj_tab.c. */
+/* Compute the hash of a constant key at compile time so the runtime
+** lookup can fold it. Do NOT alter the mixing steps: they must stay
+** bit-identical to the interpreter's hash.
+*/
+static uint32_t ir_khash(IRIns *ir)
+{
+  uint32_t lo, hi;
+  if (irt_isstr(ir->t)) {
+    return ir_kstr(ir)->hash;  /* Strings carry a precomputed hash. */
+  } else if (irt_isnum(ir->t)) {
+    lo = ir_knum(ir)->u32.lo;
+    hi = ir_knum(ir)->u32.hi & 0x7fffffff;  /* Clear the sign bit. */
+  } else if (irt_ispri(ir->t)) {
+    lua_assert(!irt_isnil(ir->t));
+    return irt_type(ir->t)-IRT_FALSE;
+  } else {
+    lua_assert(irt_isaddr(ir->t));
+    lo = u32ptr(ir_kgc(ir));
+    hi = lo - 0x04c11db7;
+  }
+  /* hashrot() mixing sequence. */
+  lo ^= hi; hi = lj_rol(hi, 14);
+  lo -= hi; hi = lj_rol(hi, 5);
+  hi ^= lo; hi -= lj_rol(lo, 27);
+  return hi;
+}
+
+/* Merge NE(HREF, niltv) check. */
+/* If the following instruction is NE(thisHREF, ...), the already-emitted
+** "cmp reg, imm32; jz exit" pair is removed by bumping mcp past it
+** (instructions were emitted backwards, so they sit at the front).
+*/
+static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
+{
+  /* Assumes nothing else generates NE of HREF. */
+  if (ir[1].o == IR_NE && ir[1].op1 == as->curins) {
+    if (LJ_64 && *as->mcp != XI_ARITHi)
+      as->mcp += 7+6;  /* REX-prefixed cmp is one byte longer. */
+    else
+      as->mcp += 6+6;  /* Kill cmp reg, imm32 + jz exit. */
+    return as->mcp + *(int32_t *)(as->mcp-4);  /* Return exit address. */
+  }
+  return NULL;
+}
+
+/* Inlined hash lookup. Specialized for key type and for const keys.
+** The equivalent C code is:
+**   Node *n = hashkey(t, key);
+**   do {
+**     if (lj_obj_equal(&n->key, key)) return &n->val;
+**   } while ((n = nextnode(n)));
+**   return niltv(L);
+** NOTE: emitted backwards -- the chain-end fallback comes first below,
+** the main-position hash computation last.
+*/
+static void asm_href(ASMState *as, IRIns *ir)
+{
+  MCode *nilexit = merge_href_niltv(as, ir);  /* Do this before any restores. */
+  RegSet allow = RSET_GPR;
+  Reg dest = ra_dest(as, ir, allow);
+  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
+  Reg key = RID_NONE, tmp = RID_NONE;
+  IRIns *irkey = IR(ir->op2);
+  int isk = irref_isk(ir->op2);
+  IRType1 kt = irkey->t;
+  uint32_t khash;
+  MCLabel l_end, l_loop, l_next;
+
+  if (!isk) {
+    rset_clear(allow, tab);
+    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
+    if (!irt_isstr(kt))
+      tmp = ra_scratch(as, rset_exclude(allow, key));
+  }
+
+  /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
+  l_end = emit_label(as);
+  if (nilexit)
+    emit_jcc(as, CC_E, nilexit);  /* XI_JMP is not found by lj_asm_patchexit. */
+  else
+    emit_loada(as, dest, niltvg(J2G(as->J)));
+
+  /* Follow hash chain until the end. */
+  l_loop = emit_sjcc_label(as, CC_NZ);
+  emit_rr(as, XO_TEST, dest, dest);
+  emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next));
+  l_next = emit_label(as);
+
+  /* Type and value comparison. */
+  emit_sjcc(as, CC_E, l_end);
+  if (irt_isnum(kt)) {
+    if (isk) {
+      /* Assumes -0.0 is already canonicalized to +0.0. */
+      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
+		 (int32_t)ir_knum(irkey)->u32.lo);
+      emit_sjcc(as, CC_NE, l_next);
+      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
+		 (int32_t)ir_knum(irkey)->u32.hi);
+    } else {
+      emit_sjcc(as, CC_P, l_next);
+      emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
+      emit_sjcc(as, CC_A, l_next);
+      /* The type check avoids NaN penalties and complaints from Valgrind. */
+      emit_i8(as, ~IRT_NUM);
+      emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
+    }
+  } else {
+    if (!irt_ispri(kt)) {
+      lua_assert(irt_isaddr(kt));
+      if (isk)
+	emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
+		   ptr2addr(ir_kgc(irkey)));
+      else
+	emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
+      emit_sjcc(as, CC_NE, l_next);
+    }
+    lua_assert(!irt_isnil(kt));
+    emit_i8(as, ~irt_type(kt));
+    emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
+  }
+  emit_sfixup(as, l_loop);
+  checkmclim(as);
+
+  /* Load main position relative to tab->node into dest. */
+  /* khash can only be 0 for constant keys; non-const keys always hash. */
+  khash = isk ? ir_khash(irkey) : 1;
+  if (khash == 0) {
+    emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node));
+  } else {
+    emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node));
+    /* dest *= sizeof(Node): imul or lea+shl depending on CPU preference. */
+    if ((as->flags & JIT_F_PREFER_IMUL)) {
+      emit_i8(as, sizeof(Node));
+      emit_rr(as, XO_IMULi8, dest, dest);
+    } else {
+      emit_shifti(as, XOg_SHL, dest, 3);
+      emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
+    }
+    if (isk) {
+      emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
+      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
+    } else if (irt_isstr(kt)) {
+      emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
+      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
+    } else {  /* Must match with hashrot() in lj_tab.c. */
+      emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
+      emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
+      emit_shifti(as, XOg_ROL, tmp, 27);
+      emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
+      emit_shifti(as, XOg_ROL, dest, 5);
+      emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
+      emit_shifti(as, XOg_ROL, dest, 14);
+      emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
+      if (irt_isnum(kt)) {
+	emit_rmro(as, XO_ARITH(XOg_AND), dest, RID_ESP, ra_spill(as, irkey)+4);
+	emit_loadi(as, dest, 0x7fffffff);
+	emit_rr(as, XO_MOVDto, key, tmp);
+      } else {
+	emit_rr(as, XO_MOV, tmp, key);
+	emit_rmro(as, XO_LEA, dest, key, -0x04c11db7);
+      }
+    }
+  }
+}
+
+/* Hash slot lookup with constant key at a known node offset: compare
+** the node's key against the constant and guard-exit on mismatch.
+*/
+static void asm_hrefk(ASMState *as, IRIns *ir)
+{
+  IRIns *kslot = IR(ir->op2);
+  IRIns *irkey = IR(kslot->op1);
+  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
+  Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
+  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
+  MCLabel l_exit;
+  lua_assert(ofs % sizeof(Node) == 0);
+  if (ra_hasreg(dest)) {
+    if (ofs != 0) {
+      /* ADD-to-self is preferred unless the CPU favors LEA AGU usage. */
+      if (dest == node && !(as->flags & JIT_F_LEA_AGU))
+	emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs);
+      else
+	emit_rmro(as, XO_LEA, dest, node, ofs);
+    } else if (dest != node) {
+      emit_rr(as, XO_MOV, dest, node);
+    }
+  }
+  asm_guardcc(as, CC_NE);
+  l_exit = emit_label(as);
+  if (irt_isnum(irkey->t)) {
+    /* Assumes -0.0 is already canonicalized to +0.0. */
+    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
+	       ofs + (int32_t)offsetof(Node, key.u32.lo),
+	       (int32_t)ir_knum(irkey)->u32.lo);
+    emit_sjcc(as, CC_NE, l_exit);
+    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
+	       ofs + (int32_t)offsetof(Node, key.u32.hi),
+	       (int32_t)ir_knum(irkey)->u32.hi);
+  } else {
+    if (!irt_ispri(irkey->t)) {
+      lua_assert(irt_isgcv(irkey->t));
+      emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
+		 ofs + (int32_t)offsetof(Node, key.gcr),
+		 ptr2addr(ir_kgc(irkey)));
+      emit_sjcc(as, CC_NE, l_exit);
+    }
+    lua_assert(!irt_isnil(irkey->t));
+    emit_i8(as, ~irt_type(irkey->t));
+    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
+	      ofs + (int32_t)offsetof(Node, key.it));
+  }
+}
+
+/* Create a new table key via lj_tab_newkey(). The key is passed as a
+** TValue: either the number constant/spill slot, or g->tmptv filled in.
+*/
+static void asm_newref(ASMState *as, IRIns *ir)
+{
+  IRRef keyref = ir->op2;
+  IRIns *irkey = IR(keyref);
+  RegSet allow = RSET_GPR;
+  Reg tab, tmp;
+  ra_destreg(as, ir, RID_RET);
+  ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
+  tab = ra_alloc1(as, ir->op1, allow);
+  tmp = ra_scratch(as, rset_clear(allow, tab));
+  /* TValue *lj_tab_newkey(lua_State *L, GCtab *t, cTValue *key) */
+  emit_call(as, lj_tab_newkey);
+  emit_setargr(as, 1, tmp);
+  emit_setargr(as, 2, tab);
+  emit_getgl(as, tmp, jit_L);
+  if (irt_isnum(irkey->t)) {
+    /* For numbers use the constant itself or a spill slot as a TValue. */
+    if (irref_isk(keyref)) {
+      emit_setargp(as, 3, ir_knum(irkey));
+    } else {
+      emit_setargr(as, 3, tmp);
+      emit_rmro(as, XO_LEA, tmp, RID_ESP, ra_spill(as, irkey));
+    }
+  } else {
+    /* Otherwise use g->tmptv to hold the TValue. */
+    lua_assert(irt_ispri(irkey->t) || irt_isaddr(irkey->t));
+    emit_setargr(as, 3, tmp);
+    if (!irref_isk(keyref)) {
+      Reg src = ra_alloc1(as, keyref, rset_exclude(allow, tmp));
+      emit_movtomro(as, src, tmp, 0);  /* Store value part. */
+    } else if (!irt_ispri(irkey->t)) {
+      emit_movmroi(as, tmp, 0, irkey->i);
+    }
+    emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));  /* Store type tag. */
+    emit_loada(as, tmp, &J2G(as->J)->tmptv);
+  }
+}
+
+/* Upvalue reference: constant function upvalues resolve to an absolute
+** address; otherwise chase func->uvptr[op2] and, for UREFC, guard that
+** the upvalue is closed.
+*/
+static void asm_uref(ASMState *as, IRIns *ir)
+{
+  /* NYI: Check that UREFO is still open and not aliasing a slot. */
+  if (ra_used(ir)) {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    if (irref_isk(ir->op1)) {
+      GCfunc *fn = ir_kfunc(IR(ir->op1));
+      TValue **v = &gcref(fn->l.uvptr[ir->op2])->uv.v;
+      emit_rma(as, XO_MOV, dest, v);
+    } else {
+      Reg uv = ra_scratch(as, RSET_GPR);
+      Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+      if (ir->o == IR_UREFC) {
+	emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv));
+	asm_guardcc(as, CC_NE);  /* Exit if the upvalue is not closed. */
+	emit_i8(as, 1);
+	emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
+      } else {
+	emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v));
+      }
+      emit_rmro(as, XO_MOV, uv, func,
+		(int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)ir->op2);
+    }
+  }
+}
+
+/* Field reference: LEA of the fused field operand. */
+static void asm_fref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_fusefref(as, ir, RSET_GPR);
+  emit_mrm(as, XO_LEA, dest, RID_MRM);
+}
+
+/* String data reference: loadi/add/lea depending on how much of the
+** fused operand degenerated to constants.
+*/
+static void asm_strref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_fusestrref(as, ir, RSET_GPR);
+  if (as->mrm.base == RID_NONE)
+    emit_loadi(as, dest, as->mrm.ofs);  /* Fully constant address. */
+  else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
+    emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs);
+  else
+    emit_mrm(as, XO_LEA, dest, RID_MRM);
+}
+
+/* -- Loads and stores ---------------------------------------------------- */
+
+/* Load from an object field, with sign/zero extension per IR type. */
+static void asm_fload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  x86Op xo;
+  asm_fusefref(as, ir, RSET_GPR);
+  switch (irt_type(ir->t)) {
+  case IRT_I8: xo = XO_MOVSXb; break;
+  case IRT_U8: xo = XO_MOVZXb; break;
+  case IRT_I16: xo = XO_MOVSXw; break;
+  case IRT_U16: xo = XO_MOVZXw; break;
+  default:
+    lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
+    xo = XO_MOV;
+    break;
+  }
+  emit_mrm(as, xo, dest, RID_MRM);
+}
+
+/* Store to an object field, from a register or an immediate. */
+static void asm_fstore(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_GPR;
+  Reg src = RID_NONE;
+  /* The IRT_I16/IRT_U16 stores should never be simplified for constant
+  ** values since mov word [mem], imm16 has a length-changing prefix.
+  */
+  if (!irref_isk(ir->op2) || irt_isi16(ir->t) || irt_isu16(ir->t)) {
+    /* Byte stores need a byte-addressable register on x86. */
+    RegSet allow8 = (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR;
+    src = ra_alloc1(as, ir->op2, allow8);
+    rset_clear(allow, src);
+  }
+  asm_fusefref(as, IR(ir->op1), allow);
+  if (ra_hasreg(src)) {
+    x86Op xo;
+    switch (irt_type(ir->t)) {
+    case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break;
+    case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
+    default:
+      lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
+      xo = XO_MOVto;
+      break;
+    }
+    emit_mrm(as, xo, src, RID_MRM);
+  } else {
+    /* Constant source: store an immediate directly. */
+    if (irt_isi8(ir->t) || irt_isu8(ir->t)) {
+      emit_i8(as, IR(ir->op2)->i);
+      emit_mrm(as, XO_MOVmib, 0, RID_MRM);
+    } else {
+      lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
+      emit_i32(as, IR(ir->op2)->i);
+      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
+    }
+  }
+}
+
+/* Load from array/hash/upvalue slot, with a type-tag guard. */
+static void asm_ahuload(ASMState *as, IRIns *ir)
+{
+  RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t));
+  if (ra_used(ir)) {
+    Reg dest = ra_dest(as, ir, allow);
+    asm_fuseahuref(as, ir->op1, RSET_GPR);
+    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM);
+  } else {
+    asm_fuseahuref(as, ir->op1, RSET_GPR);
+  }
+  /* Always do the type check, even if the load result is unused. */
+  asm_guardcc(as, irt_isnum(ir->t) ? CC_A : CC_NE);
+  emit_i8(as, ~irt_type(ir->t));
+  as->mrm.ofs += 4;  /* Type tag sits 4 bytes into the slot. */
+  emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
+}
+
+/* Store to array/hash/upvalue slot: one movsd for numbers, otherwise
+** value (register or immediate) plus the type tag at offset +4.
+*/
+static void asm_ahustore(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
+    asm_fuseahuref(as, ir->op1, RSET_GPR);
+    emit_mrm(as, XO_MOVSDto, src, RID_MRM);
+  } else {
+    IRIns *irr = IR(ir->op2);
+    RegSet allow = RSET_GPR;
+    Reg src = RID_NONE;
+    if (!irref_isk(ir->op2)) {
+      src = ra_alloc1(as, ir->op2, allow);
+      rset_clear(allow, src);
+    }
+    asm_fuseahuref(as, ir->op1, allow);
+    if (ra_hasreg(src)) {
+      emit_mrm(as, XO_MOVto, src, RID_MRM);
+    } else if (!irt_ispri(irr->t)) {
+      lua_assert(irt_isaddr(ir->t));
+      emit_i32(as, irr->i);
+      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
+    }
+    /* Primitive values store only the type tag. */
+    as->mrm.ofs += 4;
+    emit_i32(as, (int32_t)~irt_type(ir->t));
+    emit_mrm(as, XO_MOVmi, 0, RID_MRM);
+  }
+}
+
+/* Load from a Lua stack slot (8 bytes per slot, relative to BASE),
+** with an optional type guard and int conversion.
+*/
+static void asm_sload(ASMState *as, IRIns *ir)
+{
+  int32_t ofs = 8*((int32_t)ir->op1-1);
+  IRType1 t = ir->t;
+  Reg base;
+  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
+  if (irt_isint(t)) {
+    /* Load as number, then convert with an exactness guard. */
+    Reg left = ra_scratch(as, RSET_FPR);
+    asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
+    base = ra_alloc1(as, REF_BASE, RSET_GPR);
+    emit_rmro(as, XMM_MOVRM(as), left, base, ofs);
+    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
+  } else if (ra_used(ir)) {
+    RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+    Reg dest = ra_dest(as, ir, allow);
+    lua_assert(irt_isnum(ir->t) || irt_isaddr(ir->t));
+    base = ra_alloc1(as, REF_BASE, RSET_GPR);
+    emit_movrmro(as, dest, base, ofs);
+  } else {
+    if (!irt_isguard(ir->t))
+      return;  /* No type check: avoid base alloc. */
+    base = ra_alloc1(as, REF_BASE, RSET_GPR);
+  }
+  if (irt_isguard(ir->t)) {
+    /* Need type check, even if the load result is unused. */
+    asm_guardcc(as, irt_isnum(t) ? CC_A : CC_NE);
+    emit_i8(as, ~irt_type(t));
+    emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
+  }
+}
+
+/* Load from a raw memory reference (currently only STRREF bases). */
+static void asm_xload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  x86Op xo;
+  asm_fusestrref(as, IR(ir->op1), RSET_GPR);  /* For now only support STRREF. */
+  /* ir->op2 is ignored -- unaligned loads are ok on x86. */
+  switch (irt_type(ir->t)) {
+  case IRT_I8: xo = XO_MOVSXb; break;
+  case IRT_U8: xo = XO_MOVZXb; break;
+  case IRT_I16: xo = XO_MOVSXw; break;
+  case IRT_U16: xo = XO_MOVZXw; break;
+  default: lua_assert(irt_isint(ir->t)); xo = XO_MOV; break;
+  }
+  emit_mrm(as, xo, dest, RID_MRM);
+}
+
+/* -- String ops ---------------------------------------------------------- */
+
+/* Intern a new string via lj_str_new(). Constant STRREF operands are
+** passed as immediates; otherwise registers are allocated for them.
+*/
+static void asm_snew(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_GPR;
+  Reg left, right;
+  IRIns *irl;
+  ra_destreg(as, ir, RID_RET);
+  ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
+  irl = IR(ir->op1);
+  left = irl->r;
+  right = IR(ir->op2)->r;
+  if (ra_noreg(left)) {
+    lua_assert(irl->o == IR_STRREF);
+    /* Get register only for non-const STRREF. */
+    if (!(irref_isk(irl->op1) && irref_isk(irl->op2))) {
+      if (ra_hasreg(right)) rset_clear(allow, right);
+      left = ra_allocref(as, ir->op1, allow);
+    }
+  }
+  if (ra_noreg(right) && !irref_isk(ir->op2)) {
+    if (ra_hasreg(left)) rset_clear(allow, left);
+    right = ra_allocref(as, ir->op2, allow);
+  }
+  /* GCstr *lj_str_new(lua_State *L, const char *str, size_t len) */
+  emit_call(as, lj_str_new);
+  emit_setargr(as, 1, RID_RET);
+  emit_getgl(as, RID_RET, jit_L);
+  if (ra_noreg(left))  /* Use immediate for const STRREF. */
+    emit_setargi(as, 2, IR(irl->op1)->i + IR(irl->op2)->i +
+			(int32_t)sizeof(GCstr));
+  else
+    emit_setargr(as, 2, left);
+  if (ra_noreg(right))
+    emit_setargi(as, 3, IR(ir->op2)->i);
+  else
+    emit_setargr(as, 3, right);
+  as->gcsteps++;  /* The call allocates. */
+}
+
+/* -- Table ops ----------------------------------------------------------- */
+
+/* Allocate a new table via lj_tab_new(asize=op1, hbits=op2). */
+static void asm_tnew(ASMState *as, IRIns *ir)
+{
+  ra_destreg(as, ir, RID_RET);
+  ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
+  /* GCtab *lj_tab_new(lua_State *L, int32_t asize, uint32_t hbits) */
+  emit_call(as, lj_tab_new);
+  emit_setargr(as, 1, RID_RET);
+  emit_setargi(as, 2, ir->op1);
+  emit_setargi(as, 3, ir->op2);
+  emit_getgl(as, RID_RET, jit_L);
+  as->gcsteps++;  /* The call allocates. */
+}
+
+/* Duplicate a template table (constant op1) via lj_tab_dup(). */
+static void asm_tdup(ASMState *as, IRIns *ir)
+{
+  ra_destreg(as, ir, RID_RET);
+  ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
+  /* GCtab *lj_tab_dup(lua_State *L, const GCtab *kt) */
+  emit_call(as, lj_tab_dup);
+  emit_setargr(as, 1, RID_RET);
+  emit_setargp(as, 2, ir_kgc(IR(ir->op1)));
+  emit_getgl(as, RID_RET, jit_L);
+  as->gcsteps++;  /* The call allocates. */
+}
+
+/* Table length operator, delegated to lj_tab_len(). */
+static void asm_tlen(ASMState *as, IRIns *ir)
+{
+  ra_destreg(as, ir, RID_RET);
+  ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
+  emit_call(as, lj_tab_len);  /* MSize lj_tab_len(GCtab *t) */
+  emit_setargr(as, 1, ra_alloc1(as, ir->op1, RSET_GPR));
+}
+
+/* Table write barrier: if the table is black, clear its black bit and
+** link it into the GC grayagain list; otherwise skip to l_end.
+*/
+static void asm_tbar(ASMState *as, IRIns *ir)
+{
+  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
+  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
+  MCLabel l_end = emit_label(as);
+  emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist));
+  emit_setgl(as, tab, gc.grayagain);
+  emit_getgl(as, tmp, gc.grayagain);
+  emit_i8(as, ~LJ_GC_BLACK);  /* Clear the black mark bit. */
+  emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
+  emit_sjcc(as, CC_Z, l_end);  /* Not black: nothing to do. */
+  emit_i8(as, LJ_GC_BLACK);
+  emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
+}
+
+/* Object write barrier for upvalue stores: call lj_gc_barrieruv() only
+** when the object is black and the stored value is white; skip to l_end
+** otherwise.
+*/
+static void asm_obar(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_GPR;
+  Reg obj, val;
+  GCobj *valp;
+  MCLabel l_end;
+  int32_t ofs;
+  ra_evictset(as, RSET_SCRATCH);
+  if (irref_isk(ir->op2)) {
+    valp = ir_kgc(IR(ir->op2));  /* Constant value: use its address. */
+    val = RID_NONE;
+  } else {
+    valp = NULL;
+    val = ra_alloc1(as, ir->op2, allow);
+    rset_clear(allow, val);
+  }
+  obj = ra_alloc1(as, ir->op1, allow);
+  l_end = emit_label(as);
+  /* No need for other object barriers (yet). */
+  lua_assert(IR(ir->op1)->o == IR_UREFC);
+  ofs = -(int32_t)offsetof(GCupval, tv);  /* Back from tv to the GCupval. */
+  /* void lj_gc_barrieruv(global_State *g, GCobj *o, GCobj *v) */
+  emit_call(as, lj_gc_barrieruv);
+  if (ofs == 0) {
+    emit_setargr(as, 2, obj);
+  } else if (rset_test(RSET_SCRATCH, obj) && !(as->flags & JIT_F_LEA_AGU)) {
+    emit_setargr(as, 2, obj);
+    emit_gri(as, XG_ARITHi(XOg_ADD), obj, ofs);  /* obj is scratch: adjust in place. */
+  } else {
+    emit_setargr(as, 2, RID_RET);
+    emit_rmro(as, XO_LEA, RID_RET, obj, ofs);
+  }
+  emit_setargp(as, 1, J2G(as->J));
+  if (valp)
+    emit_setargp(as, 3, valp);
+  else
+    emit_setargr(as, 3, val);
+  emit_sjcc(as, CC_Z, l_end);  /* Value not white: skip barrier. */
+  emit_i8(as, LJ_GC_WHITES);
+  if (valp)
+    emit_rma(as, XO_GROUP3b, XOg_TEST, &valp->gch.marked);
+  else
+    emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
+  emit_sjcc(as, CC_Z, l_end);  /* Object not black: skip barrier. */
+  emit_i8(as, LJ_GC_BLACK);
+  emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
+	    ofs + (int32_t)offsetof(GChead, marked));
+}
+
+/* -- FP/int arithmetic and logic operations ------------------------------ */
+
+/* Load reference onto x87 stack. Force a spill to memory if needed. */
+static void asm_x87load(ASMState *as, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (ir->o == IR_KNUM) {
+    cTValue *tv = ir_knum(ir);
+    if (tvispzero(tv))  /* Use fldz only for +0. */
+      emit_x87op(as, XI_FLDZ);
+    else if (tvispone(tv))
+      emit_x87op(as, XI_FLD1);
+    else
+      emit_rma(as, XO_FLDq, XOg_FLDq, tv);  /* fld qword from constant. */
+  } else if (ir->o == IR_TONUM && !ra_used(ir) &&
+	     !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
+    /* Load the unconverted int directly with fild instead. */
+    IRIns *iri = IR(ir->op1);
+    emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
+  } else {
+    /* RSET_EMPTY forces asm_fuseload to produce a memory operand. */
+    emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
+  }
+}
+
+/* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
+/* Pattern matched: this EXP2's operand is the immediately preceding
+** unused MUL, whose left operand is the immediately preceding unused
+** FPMATH LOG2 -- i.e. exp2(log2(x)*y) == pow(x, y).
+*/
+static int fpmjoin_pow(ASMState *as, IRIns *ir)
+{
+  IRIns *irp = IR(ir->op1);
+  if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
+    IRIns *irpp = IR(irp->op1);
+    if (irpp == ir-2 && irpp->o == IR_FPMATH &&
+	irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
+      emit_call(as, lj_vm_pow);  /* st0 = lj_vm_pow(st1, st0) */
+      asm_x87load(as, irp->op2);
+      asm_x87load(as, irpp->op1);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* FP math ops: SSE sqrt, SSE4.1 roundsd for floor/ceil/trunc, otherwise
+** fall back to x87 sequences or helper calls through a stack slot.
+*/
+static void asm_fpmath(ASMState *as, IRIns *ir)
+{
+  IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER;
+  if (fpm == IRFPM_SQRT) {
+    Reg dest = ra_dest(as, ir, RSET_FPR);
+    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
+    emit_mrm(as, XO_SQRTSD, dest, left);
+  } else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) {
+    Reg dest = ra_dest(as, ir, RSET_FPR);
+    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
+    /* Round down/up/trunc == 1001/1010/1011. */
+    emit_i8(as, 0x09 + fpm);
+    /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */
+    emit_mrm(as, XO_ROUNDSD, dest, left);
+    /* Let's pretend it's a 3-byte opcode, and compensate afterwards. */
+    /* This is atrocious, but the alternatives are much worse. */
+    if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
+      as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
+    }
+    *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
+  } else {
+    /* x87 fallback: result travels through a stack slot. */
+    int32_t ofs = sps_scale(ir->s);  /* Use spill slot or slots SPS_TEMP1/2. */
+    Reg dest = ir->r;
+    if (ra_hasreg(dest)) {
+      ra_free(as, dest);
+      ra_modified(as, dest);
+      emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
+    }
+    emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
+    switch (fpm) {  /* st0 = lj_vm_*(st0) */
+    case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break;
+    case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break;
+    case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break;
+    case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
+    case IRFPM_EXP2:
+      if (fpmjoin_pow(as, ir)) return;  /* Rejoined into pow: done. */
+      emit_call(as, lj_vm_exp2);  /* st0 = lj_vm_exp2(st0) */
+      break;
+    case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
+    case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
+    case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
+    case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
+      /* Note: the use of fyl2xp1 would be pointless here. When computing
+      ** log(1.0+eps) the precision is already lost after 1.0 is added.
+      ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
+      */
+      emit_x87op(as, XI_FYL2X); break;
+    case IRFPM_OTHER:
+      switch (ir->o) {
+      case IR_ATAN2:
+	emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
+      case IR_LDEXP:
+	emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
+      case IR_POWI:
+	emit_call(as, lj_vm_powi);  /* st0 = lj_vm_powi(st0, [esp]) */
+	emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0);
+	break;
+      default: lua_assert(0); break;
+      }
+      break;
+    default: lua_assert(0); break;
+    }
+    asm_x87load(as, ir->op1);
+    /* Pre-load the scale constant for the log variants (fyl2x needs it). */
+    switch (fpm) {
+    case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
+    case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
+    case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
+    case IRFPM_OTHER:
+      if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
+      break;
+    default: break;
+    }
+  }
+}
+
+/* Check whether swapping the two operands of an instruction could help. */
+static int swapops(ASMState *as, IRIns *ir)
+{
+  IRIns *insl = IR(ir->op1);
+  IRIns *insr = IR(ir->op2);
+  lua_assert(ra_noreg(insr->r));
+  if (!irm_iscomm(lj_ir_mode[ir->o]))
+    return 0;  /* Only commutative operations may swap their operands. */
+  if (irref_isk(ir->op2))
+    return 0;  /* Keep constants on the right-hand side. */
+  if (ra_hasreg(insl->r))
+    return 1;  /* The left operand already lives in a register. */
+  if (ra_samehint(ir->r, insr->r))
+    return 1;  /* Destination and right operand carry the same hint. */
+  if (ir->op1 < as->loopref && !irt_isphi(insl->t) &&
+      !(ir->op2 < as->loopref && !irt_isphi(insr->t)))
+    return 1;  /* Move the loop-invariant operand to the right. */
+  if (opisfusableload(insl->o))
+    return 1;  /* A fusable load is more useful on the right. */
+  return 0;  /* No reason to swap. */
+}
+
+/* FP arithmetic: dest = lref <xo> rref via a two-operand SSE instruction.
+** The left operand is tied to dest (ra_left); the right operand may be
+** fused into a memory operand (asm_fuseload).
+*/
+static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
+{
+  IRRef lref = ir->op1;
+  IRRef rref = ir->op2;
+  RegSet allow = RSET_FPR;
+  Reg dest;
+  Reg right = IR(rref)->r;
+  if (ra_hasreg(right))
+    rset_clear(allow, right);  /* Don't allocate dest on top of right. */
+  dest = ra_dest(as, ir, allow);
+  if (lref == rref) {
+    right = dest;  /* x op x: both operands end up in the same register. */
+  } else if (ra_noreg(right)) {
+    if (swapops(as, ir)) {  /* Commutative op may be better swapped. */
+      IRRef tmp = lref; lref = rref; rref = tmp;
+    }
+    right = asm_fuseload(as, rref, rset_clear(allow, dest));
+  }
+  emit_mrm(as, xo, dest, right);
+  ra_left(as, dest, lref);  /* Tie the left operand to the dest register. */
+}
+
+/* Integer arithmetic: dest = lref <xa> rref, with an immediate or a fused
+** memory operand on the right. For overflow-checked ops (IR_ADDOV etc.)
+** a guard on the overflow flag is emitted.
+*/
+static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
+{
+  IRRef lref = ir->op1;
+  IRRef rref = ir->op2;
+  RegSet allow = RSET_GPR;
+  Reg dest, right;
+  if (as->testmcp == as->mcp) {  /* Drop test r,r instruction. */
+    /* The arith op itself sets the flags, so a test r,r emitted just
+    ** before (flagged by asm_comp_) is redundant and removed here.
+    */
+    as->testmcp = NULL;
+    as->mcp += (LJ_64 && *as->mcp != XI_TEST) ? 3 : 2;  /* Skip REX, too. */
+  }
+  right = IR(rref)->r;
+  if (ra_hasreg(right))
+    rset_clear(allow, right);  /* Don't allocate dest on top of right. */
+  dest = ra_dest(as, ir, allow);
+  if (lref == rref) {
+    right = dest;  /* x op x: both operands in the same register. */
+  } else if (ra_noreg(right) && !irref_isk(rref)) {
+    if (swapops(as, ir)) {
+      IRRef tmp = lref; lref = rref; rref = tmp;
+    }
+    right = asm_fuseload(as, rref, rset_clear(allow, dest));
+    /* Note: fuses only with IR_FLOAD for now. */
+  }
+  if (irt_isguard(ir->t))  /* For IR_ADDOV etc. */
+    asm_guardcc(as, CC_O);
+  if (ra_hasreg(right))
+    emit_mrm(as, XO_ARITH(xa), dest, right);
+  else  /* Constant right operand is emitted as an immediate. */
+    emit_gri(as, XG_ARITHi(xa), dest, IR(ir->op2)->i);
+  ra_left(as, dest, lref);  /* Tie the left operand to the dest register. */
+}
+
+/* LEA is really a 4-operand ADD with an independent destination register,
+** up to two source registers and an immediate. One register can be scaled
+** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
+** instructions.
+**
+** Currently only a few common cases are supported:
+** - 3-operand ADD:    y = a+b; y = a+k   with a and b already allocated
+** - Left ADD fusion:  y = (a+b)+k; y = (a+k)+b
+** - Right ADD fusion: y = a+(b+k)
+** The ommited variants have already been reduced by FOLD.
+**
+** There are more fusion opportunities, like gathering shifts or joining
+** common references. But these are probably not worth the trouble, since
+** array indexing is not decomposed and already makes use of all fields
+** of the ModRM operand.
+**
+** Returns 1 on success, 0 if the caller should fall back to asm_intarith.
+*/
+static int asm_lea(ASMState *as, IRIns *ir)
+{
+  IRIns *irl = IR(ir->op1);
+  IRIns *irr = IR(ir->op2);
+  RegSet allow = RSET_GPR;
+  Reg dest;
+  /* Start with an empty ModRM: no base, no index, scale 1, offset 0. */
+  as->mrm.base = as->mrm.idx = RID_NONE;
+  as->mrm.scale = XM_SCALE1;
+  as->mrm.ofs = 0;
+  if (ra_hasreg(irl->r)) {  /* 3-operand ADD: left already has a register. */
+    rset_clear(allow, irl->r);
+    as->mrm.base = irl->r;
+    if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
+      /* The PHI renaming logic does a better job in some cases. */
+      if (ra_hasreg(ir->r) &&
+	  ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
+	   (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
+	return 0;
+      if (irref_isk(ir->op2)) {
+	as->mrm.ofs = irr->i;
+      } else {
+	rset_clear(allow, irr->r);
+	as->mrm.idx = irr->r;
+      }
+    } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
+	       irref_isk(irr->op2)) {
+      /* Right ADD fusion: y = a+(b+k). */
+      Reg idx = ra_alloc1(as, irr->op1, allow);
+      rset_clear(allow, idx);
+      as->mrm.idx = (uint8_t)idx;
+      as->mrm.ofs = IR(irr->op2)->i;
+    } else {
+      return 0;
+    }
+  } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
+	     (irref_isk(ir->op2) || irref_isk(irl->op2))) {
+    /* Left ADD fusion: y = (a+b)+k or y = (a+k)+b. */
+    Reg idx, base = ra_alloc1(as, irl->op1, allow);
+    rset_clear(allow, base);
+    as->mrm.base = (uint8_t)base;
+    if (irref_isk(ir->op2)) {
+      as->mrm.ofs = irr->i;
+      idx = ra_alloc1(as, irl->op2, allow);
+    } else {
+      as->mrm.ofs = IR(irl->op2)->i;
+      idx = ra_alloc1(as, ir->op2, allow);
+    }
+    rset_clear(allow, idx);
+    as->mrm.idx = (uint8_t)idx;
+  } else {
+    return 0;
+  }
+  dest = ra_dest(as, ir, allow);
+  emit_mrm(as, XO_LEA, dest, RID_MRM);
+  return 1;  /* Success. */
+}
+
+/* Add: FP adds use ADDSD; integer adds try LEA first (unless the CPU
+** prefers the ALU or a preceding test r,r is pending), then fall back
+** to a regular integer add.
+*/
+static void asm_add(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, XO_ADDSD);
+  } else if ((as->flags & JIT_F_LEA_AGU) || as->testmcp == as->mcp ||
+	     !asm_lea(as, ir)) {
+    asm_intarith(as, ir, XOg_ADD);
+  }
+}
+
+/* Bitwise NOT: one-operand group-3 instruction on the dest register. */
+static void asm_bitnot(ASMState *as, IRIns *ir)
+{
+  Reg r = ra_dest(as, ir, RSET_GPR);
+  emit_rr(as, XO_GROUP3, XOg_NOT, r);
+  ra_left(as, r, ir->op1);
+}
+
+/* Byte-swap (BSWAP). The opcode bytes are written directly into the
+** machine code buffer, plus an optional REX prefix (REXRB) on x64.
+*/
+static void asm_bitswap(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  MCode *p = as->mcp;
+  p[-1] = (MCode)(XI_BSWAP+(dest&7));  /* Register encoded in the opcode. */
+  p[-2] = 0x0f;
+  p -= 2;
+  REXRB(p, 0, dest);
+  as->mcp = p;
+  ra_left(as, dest, ir->op1);
+}
+
+/* Shifts and rotates: dest = op1 <xs> (op2 & 31).
+** Constant shift counts are encoded as immediates (with the shorter
+** shift-by-1 form where possible). Variable shift counts must reside in
+** CL, so ecx is allocated for the count or the count is moved into ecx.
+*/
+static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
+{
+  IRRef rref = ir->op2;
+  IRIns *irr = IR(rref);
+  Reg dest;
+  if (irref_isk(rref)) {  /* Constant shifts. */
+    int shift;
+    dest = ra_dest(as, ir, RSET_GPR);
+    shift = irr->i & 31;  /* Handle shifts of 0..31 bits. */
+    switch (shift) {
+    /* Shift by 0 is a no-op, but must still fall through to ra_left
+    ** below to tie dest to op1 (an early return would leave the
+    ** allocated dest register with an undefined value).
+    */
+    case 0: break;
+    case 1: emit_rr(as, XO_SHIFT1, (Reg)xs, dest); break;
+    default: emit_shifti(as, xs, dest, shift); break;
+    }
+  } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
+    RegSet allow = rset_exclude(RSET_GPR, RID_ECX);
+    Reg right = irr->r;
+    if (ra_noreg(right)) {
+      right = ra_allocref(as, rref, RID2RSET(RID_ECX));
+    } else if (right != RID_ECX) {
+      rset_clear(allow, right);
+      ra_scratch(as, RID2RSET(RID_ECX));
+    }
+    dest = ra_dest(as, ir, allow);
+    emit_rr(as, XO_SHIFTcl, (Reg)xs, dest);
+    if (right != RID_ECX)
+      emit_rr(as, XO_MOV, RID_ECX, right);  /* Move count into ecx first. */
+  }
+  ra_left(as, dest, ir->op1);
+  /*
+  ** Note: avoid using the flags resulting from a shift or rotate!
+  ** All of them cause a partial flag stall, except for r,1 shifts
+  ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
+  */
+}
+
+/* -- Comparisons --------------------------------------------------------- */
+
+/* Virtual flags for unordered FP comparisons.
+** They are packed into the cc argument of asm_comp_ above the two x86
+** condition codes (bits 0-3: integer cc, bits 4-7: FP cc -- see the
+** asm_comp macro below).
+*/
+#define VCC_U	0x100		/* Unordered. */
+#define VCC_P	0x200		/* Needs extra CC_P branch. */
+#define VCC_S	0x400		/* Swap avoids CC_P branch. */
+#define VCC_PS	(VCC_P|VCC_S)
+
+/* Emit a comparison with a guarded branch to the exit.
+** cc packs the integer condition code (bits 0-3), the FP condition code
+** (bits 4-7) and the VCC_* virtual flags (bits 8-10), as built by the
+** asm_comp macro below. Handles FP compares (UCOMISD plus unordered
+** fixups), integer/address compares (with test/cmp fusion) and ordered
+** string compares via a call to lj_str_cmp.
+*/
+static void asm_comp_(ASMState *as, IRIns *ir, int cc)
+{
+  if (irt_isnum(ir->t)) {
+    IRRef lref = ir->op1;
+    IRRef rref = ir->op2;
+    Reg left, right;
+    MCLabel l_around;
+    /*
+    ** An extra CC_P branch is required to preserve ordered/unordered
+    ** semantics for FP comparisons. This can be avoided by swapping
+    ** the operands and inverting the condition (except for EQ and UNE).
+    ** So always try to swap if possible.
+    **
+    ** Another option would be to swap operands to achieve better memory
+    ** operand fusion. But it's unlikely that this outweighs the cost
+    ** of the extra branches.
+    */
+    if (cc & VCC_S) {  /* Swap? */
+      IRRef tmp = lref; lref = rref; rref = tmp;
+      cc ^= (VCC_PS|(5<<4));  /* A <-> B, AE <-> BE, PS <-> none */
+    }
+    left = ra_alloc1(as, lref, RSET_FPR);
+    right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
+    l_around = emit_label(as);
+    asm_guardcc(as, cc >> 4);  /* Use the FP condition code. */
+    if (cc & VCC_P) {  /* Extra CC_P branch required? */
+      if (!(cc & VCC_U)) {
+	asm_guardcc(as, CC_P);  /* Branch to exit for ordered comparisons. */
+      } else if (l_around != as->invmcp) {
+	emit_sjcc(as, CC_P, l_around);  /* Branch around for unordered. */
+      } else {
+	/* Patched to mcloop by asm_loop_fixup. */
+	as->loopinv = 2;
+	if (as->realign)
+	  emit_sjcc(as, CC_P, as->mcp);
+	else
+	  emit_jcc(as, CC_P, as->mcp);
+      }
+    }
+    emit_mrm(as, XO_UCOMISD, left, right);
+  } else if (!(irt_isstr(ir->t) && (cc & 0xe) != CC_E)) {
+    /* Integer/address compares; string ==/~= lands here, too (compared
+    ** by address -- NOTE(review): presumably relies on string interning;
+    ** confirm in lj_str).
+    */
+    IRRef lref = ir->op1, rref = ir->op2;
+    IROp leftop = (IROp)(IR(lref)->o);
+    lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
+    /* Swap constants (only for ABC) and fusable loads to the right. */
+    if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
+      if ((cc & 0xc) == 0xc) cc ^= 3;  /* L <-> G, LE <-> GE */
+      else if ((cc & 0xa) == 0x2) cc ^= 5;  /* A <-> B, AE <-> BE */
+      lref = ir->op2; rref = ir->op1;
+    }
+    if (irref_isk(rref)) {
+      IRIns *irl = IR(lref);
+      int32_t imm = IR(rref)->i;
+      /* Check whether we can use test ins. Not for unsigned, since CF=0. */
+      int usetest = (imm == 0 && (cc & 0xa) != 0x2);
+      if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
+	/* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
+	Reg right, left = RID_NONE;
+	RegSet allow = RSET_GPR;
+	if (!irref_isk(irl->op2)) {
+	  left = ra_alloc1(as, irl->op2, allow);
+	  rset_clear(allow, left);
+	}
+	right = asm_fuseload(as, irl->op1, allow);
+	asm_guardcc(as, cc);
+	if (irref_isk(irl->op2)) {
+	  emit_i32(as, IR(irl->op2)->i);
+	  emit_mrm(as, XO_GROUP3, XOg_TEST, right);
+	} else {
+	  emit_mrm(as, XO_TEST, left, right);
+	}
+      } else {
+	Reg left;
+	if (opisfusableload((IROp)irl->o) &&
+	    ((irt_isi8(irl->t) && checki8(imm)) ||
+	     (irt_isu8(irl->t) && checku8(imm)))) {
+	  /* Only the IRT_INT case is fused by asm_fuseload. The IRT_I8/IRT_U8
+	  ** loads are handled here. The IRT_I16/IRT_U16 loads should never be
+	  ** fused, since cmp word [mem], imm16 has a length-changing prefix.
+	  */
+	  IRType1 origt = irl->t;  /* Temporarily flip types. */
+	  irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
+	  left = asm_fuseload(as, lref, RSET_GPR);
+	  irl->t = origt;
+	  if (left == RID_MRM) {  /* Fusion succeeded? */
+	    asm_guardcc(as, cc);
+	    emit_i8(as, imm);
+	    emit_mrm(as, XO_ARITHib, XOg_CMP, RID_MRM);
+	    return;
+	  }  /* Otherwise handle register case as usual. */
+	} else {
+	  left = asm_fuseload(as, lref, RSET_GPR);
+	}
+	asm_guardcc(as, cc);
+	if (usetest && left != RID_MRM) {
+	  /* Use test r,r instead of cmp r,0. */
+	  if (irl+1 == ir)  /* Referencing previous ins? */
+	    as->testmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
+	  emit_rr(as, XO_TEST, left, left);
+	} else {
+	  x86Op xo;
+	  if (checki8(imm)) {  /* Prefer the short imm8 encoding. */
+	    emit_i8(as, imm);
+	    xo = XO_ARITHi8;
+	  } else {
+	    emit_i32(as, imm);
+	    xo = XO_ARITHi;
+	  }
+	  emit_mrm(as, xo, XOg_CMP, left);
+	}
+      }
+    } else {
+      Reg left = ra_alloc1(as, lref, RSET_GPR);
+      Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left));
+      asm_guardcc(as, cc);
+      emit_mrm(as, XO_CMP, left, right);
+    }
+  } else {  /* Handle ordered string compares. */
+    RegSet allow = RSET_GPR;
+    /* This assumes lj_str_cmp never uses any SSE registers. */
+    ra_evictset(as, (RSET_SCRATCH & RSET_GPR));
+    asm_guardcc(as, cc);
+    emit_rr(as, XO_TEST, RID_RET, RID_RET);
+    emit_call(as, lj_str_cmp);  /* int32_t lj_str_cmp(GCstr *a, GCstr *b) */
+    if (irref_isk(ir->op1)) {
+      emit_setargi(as, 1, IR(ir->op1)->i);
+    } else {
+      Reg left = ra_alloc1(as, ir->op1, allow);
+      rset_clear(allow, left);
+      emit_setargr(as, 1, left);
+    }
+    if (irref_isk(ir->op2)) {
+      emit_setargi(as, 2, IR(ir->op2)->i);
+    } else {
+      Reg right = ra_alloc1(as, ir->op2, allow);
+      emit_setargr(as, 2, right);
+    }
+  }
+}
+
+/* Pack the integer cc (ci), the FP cc (cf, shifted to bits 4-7) and the
+** VCC_* unordered-handling flags (cu) into the cc argument of asm_comp_.
+*/
+#define asm_comp(as, ir, ci, cf, cu) \
+  asm_comp_(as, ir, (ci)+((cf)<<4)+(cu))
+
+/* -- GC handling --------------------------------------------------------- */
+
+/* Sync all live GC values to Lua stack slots.
+** For every non-constant, GC-colored snapshot slot, store the value
+** (at base + 8*(s-1)) and its type tag (at +4), so they are visible
+** during the GC step emitted by asm_gc_check.
+*/
+static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base, RegSet allow)
+{
+  IRRef2 *map = &as->T->snapmap[snap->mapofs];
+  BCReg s, nslots = snap->nslots;
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = snap_ref(map[s]);
+    if (!irref_isk(ref)) {
+      IRIns *ir = IR(ref);
+      if (ir->o == IR_FRAME) {
+	/* NYI: sync the frame, bump base, set topslot, clear new slots. */
+	lj_trace_err(as->J, LJ_TRERR_NYIGCF);
+      } else if (irt_isgcv(ir->t) &&
+	       !(ir->o == IR_SLOAD && ir->op1 < nslots && map[ir->op1] == 0)) {
+	/* NOTE(review): the SLOAD test skips slots whose stack contents
+	** are unmodified (map entry 0) -- confirm against snapshot layout.
+	*/
+	Reg src = ra_alloc1(as, ref, allow);
+	int32_t ofs = 8*(int32_t)(s-1);
+	emit_movtomro(as, src, base, ofs);
+	emit_movmroi(as, base, ofs+4, irt_toitype(ir->t));
+	checkmclim(as);
+      }
+    }
+  }
+}
+
+/* Check GC threshold and do one or more GC steps.
+** Emitted backwards; runtime order: compare gc.total against
+** gc.threshold, skip everything up to l_end if below, otherwise sync
+** GC values to the stack and call lj_gc_step_jit(L, pc, steps).
+*/
+static void asm_gc_check(ASMState *as, SnapShot *snap)
+{
+  MCLabel l_end;
+  const BCIns *pc;
+  Reg tmp, base;
+  RegSet drop = RSET_SCRATCH;
+  /* Must evict BASE because the stack may be reallocated by the GC. */
+  if (ra_hasreg(IR(REF_BASE)->r))
+    drop |= RID2RSET(IR(REF_BASE)->r);
+  ra_evictset(as, drop);
+  base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_GPR, RID_RET));
+  l_end = emit_label(as);
+  /* void lj_gc_step_jit(lua_State *L, const BCIns *pc, MSize steps) */
+  emit_call(as, lj_gc_step_jit);
+  /* Reload the (possibly moved) stack base after the GC step. */
+  emit_movtomro(as, base, RID_RET, offsetof(lua_State, base));
+  emit_setargr(as, 1, RID_RET);
+  emit_setargi(as, 3, (int32_t)as->gcsteps);
+  emit_getgl(as, RID_RET, jit_L);
+  pc = (const BCIns *)(uintptr_t)as->T->snapmap[snap->mapofs+snap->nslots];
+  emit_setargp(as, 2, pc);
+  asm_gc_sync(as, snap, base, rset_exclude(RSET_SCRATCH & RSET_GPR, base));
+  if (as->curins == as->loopref)  /* BASE gets restored by LOOP anyway. */
+    ra_restore(as, REF_BASE);  /* Better do it inside the slow path. */
+  /* Jump around GC step if GC total < GC threshold. */
+  tmp = ra_scratch(as, RSET_SCRATCH & RSET_GPR);
+  emit_sjcc(as, CC_B, l_end);
+  emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold);
+  emit_getgl(as, tmp, gc.total);
+  as->gcsteps = 0;
+  checkmclim(as);
+}
+
+/* -- PHI and loop handling ----------------------------------------------- */
+
+/* Break a PHI cycle by renaming to a free register (evict if needed).
+** blocked:   PHI registers that could not be renamed yet.
+** blockedby: registers of the left PHIs blocking them.
+** allow:     the register file to operate on (RSET_GPR or RSET_FPR).
+*/
+static void asm_phi_break(ASMState *as, RegSet blocked, RegSet blockedby,
+			  RegSet allow)
+{
+  RegSet candidates = blocked & allow;
+  if (candidates) {  /* If this register file has candidates. */
+    /* Note: the set for ra_pick cannot be empty, since each register file
+    ** has some registers never allocated to PHIs.
+    */
+    Reg down, up = ra_pick(as, ~blocked & allow);  /* Get a free register. */
+    if (candidates & ~blockedby)  /* Optimize shifts, else it's a cycle. */
+      candidates = candidates & ~blockedby;
+    down = rset_picktop(candidates);  /* Pick candidate PHI register. */
+    ra_rename(as, down, up);  /* And rename it to the free register. */
+  }
+}
+
+/* PHI register shuffling.
+**
+** The allocator tries hard to preserve PHI register assignments across
+** the loop body. Most of the time this loop does nothing, since there
+** are no register mismatches.
+**
+** If a register mismatch is detected and ...
+** - the register is currently free: rename it.
+** - the register is blocked by an invariant: restore/remat and rename it.
+** - Otherwise the register is used by another PHI, so mark it as blocked.
+**
+** The renames are order-sensitive, so just retry the loop if a register
+** is marked as blocked, but has been freed in the meantime. A cycle is
+** detected if all of the blocked registers are allocated. To break the
+** cycle rename one of them to a free register and retry.
+**
+** Note that PHI spill slots are kept in sync and don't need to be shuffled.
+*/
+static void asm_phi_shuffle(ASMState *as)
+{
+  RegSet work;
+
+  /* Find and resolve PHI register mismatches. */
+  for (;;) {
+    RegSet blocked = RSET_EMPTY;
+    RegSet blockedby = RSET_EMPTY;
+    RegSet phiset = as->phiset;
+    while (phiset) {  /* Check all left PHI operand registers. */
+      Reg r = rset_picktop(phiset);
+      IRIns *irl = IR(as->phireg[r]);
+      Reg left = irl->r;
+      if (r != left) {  /* Mismatch? */
+	if (!rset_test(as->freeset, r)) {  /* PHI register blocked? */
+	  IRRef ref = regcost_ref(as->cost[r]);
+	  if (irt_ismarked(IR(ref)->t)) {  /* Blocked by other PHI (w/reg)? */
+	    rset_set(blocked, r);
+	    if (ra_hasreg(left))
+	      rset_set(blockedby, left);
+	    left = RID_NONE;  /* Defer the rename to a later retry. */
+	  } else {  /* Otherwise grab register from invariant. */
+	    ra_restore(as, ref);
+	    checkmclim(as);
+	  }
+	}
+	if (ra_hasreg(left)) {
+	  ra_rename(as, left, r);
+	  checkmclim(as);
+	}
+      }
+      rset_clear(phiset, r);
+    }
+    if (!blocked) break;  /* Finished. */
+    if (!(as->freeset & blocked)) {  /* Break cycles if none are free. */
+      asm_phi_break(as, blocked, blockedby, RSET_GPR);
+      asm_phi_break(as, blocked, blockedby, RSET_FPR);
+      checkmclim(as);
+    }  /* Else retry some more renames. */
+  }
+
+  /* Restore/remat invariants whose registers are modified inside the loop. */
+  work = as->modset & ~(as->freeset | as->phiset);
+  while (work) {
+    Reg r = rset_picktop(work);
+    ra_restore(as, regcost_ref(as->cost[r]));
+    rset_clear(work, r);
+    checkmclim(as);
+  }
+
+  /* Allocate and save all unsaved PHI regs and clear marks. */
+  work = as->phiset;
+  while (work) {
+    Reg r = rset_picktop(work);
+    IRRef lref = as->phireg[r];
+    IRIns *ir = IR(lref);
+    if (ra_hasspill(ir->s)) {  /* Left PHI gained a spill slot? */
+      irt_clearmark(ir->t);  /* Handled here, so clear marker now. */
+      ra_alloc1(as, lref, RID2RSET(r));
+      ra_save(as, ir, r);  /* Save to spill slot inside the loop. */
+      checkmclim(as);
+    }
+    rset_clear(work, r);
+  }
+}
+
+/* Emit renames for left PHIs which are only spilled outside the loop. */
+static void asm_phi_fixup(ASMState *as)
+{
+  RegSet work = as->phiset;
+  while (work) {
+    Reg r = rset_picktop(work);
+    IRRef lref = as->phireg[r];
+    IRIns *ir = IR(lref);
+    /* Left PHI gained a spill slot before the loop? */
+    if (irt_ismarked(ir->t) && ra_hasspill(ir->s)) {
+      IRRef ren;
+      /* Record an IR_RENAME(lref, loopsnapno): lref lives in register r
+      ** only from the loop snapshot on. NOTE(review): presumably consumed
+      ** by exit/snapshot handling -- confirm in lj_snap.
+      */
+      lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), lref, as->loopsnapno);
+      ren = tref_ref(lj_ir_emit(as->J));
+      as->ir = as->T->ir;  /* The IR may have been reallocated. */
+      IR(ren)->r = (uint8_t)r;
+      IR(ren)->s = SPS_NONE;
+    }
+    irt_clearmark(ir->t);  /* Always clear marker. */
+    rset_clear(work, r);
+  }
+}
+
+/* Setup right PHI reference.
+** Gives the right PHI a register (copying a duplicate if needed), records
+** the left operand in as->phireg[] for later shuffling, or -- if registers
+** are scarce -- assigns a shared spill slot to both sides.
+*/
+static void asm_phi(ASMState *as, IRIns *ir)
+{
+  RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+  RegSet afree = (as->freeset & allow);
+  IRIns *irl = IR(ir->op1);
+  IRIns *irr = IR(ir->op2);
+  /* Spill slot shuffling is not implemented yet (but rarely needed). */
+  if (ra_hasspill(irl->s) || ra_hasspill(irr->s))
+    lj_trace_err(as->J, LJ_TRERR_NYIPHI);
+  /* Leave at least one register free for non-PHIs (and PHI cycle breaking). */
+  if ((afree & (afree-1))) {  /* Two or more free registers? */
+    Reg r;
+    if (ra_noreg(irr->r)) {  /* Get a register for the right PHI. */
+      r = ra_allocref(as, ir->op2, allow);
+    } else {  /* Duplicate right PHI, need a copy (rare). */
+      r = ra_scratch(as, allow);
+      emit_movrr(as, r, irr->r);
+    }
+    ir->r = (uint8_t)r;
+    rset_set(as->phiset, r);
+    as->phireg[r] = (IRRef1)ir->op1;  /* Remember left ref for shuffling. */
+    irt_setmark(irl->t);  /* Marks left PHIs _with_ register. */
+    if (ra_noreg(irl->r))
+      ra_sethint(irl->r, r); /* Set register hint for left PHI. */
+  } else {  /* Otherwise allocate a spill slot. */
+    /* This is overly restrictive, but it triggers only on synthetic code. */
+    if (ra_hasreg(irl->r) || ra_hasreg(irr->r))
+      lj_trace_err(as->J, LJ_TRERR_NYIPHI);
+    ra_spill(as, ir);
+    irl->s = irr->s = ir->s;  /* Sync left/right PHI spill slots. */
+  }
+}
+
+/* Fixup the loop branch.
+** Patches the backward branch at the end of the machine code (as->mctop)
+** to target the loop start (as->mcp). Realigned loops use short jumps;
+** otherwise the 32-bit displacements are patched in place, and small
+** loops trigger a retry with realignment (as->realign) so they can use
+** short branches.
+*/
+static void asm_loop_fixup(ASMState *as)
+{
+  MCode *p = as->mctop;
+  MCode *target = as->mcp;
+  if (as->realign) {  /* Realigned loops use short jumps. */
+    as->realign = NULL;  /* Stop another retry. */
+    lua_assert(((intptr_t)target & 15) == 0);
+    if (as->loopinv) {  /* Inverted loop branch? */
+      p -= 5;
+      p[0] = XI_JMP;
+      lua_assert(target - p >= -128);
+      p[-1] = (MCode)(target - p);  /* Patch sjcc. */
+      if (as->loopinv == 2)
+	p[-3] = (MCode)(target - p + 2);  /* Patch opt. short jp. */
+    } else {
+      lua_assert(target - p >= -128);
+      p[-1] = (MCode)(int8_t)(target - p);  /* Patch short jmp. */
+      p[-2] = XI_JMPs;
+    }
+  } else {
+    MCode *newloop;
+    p[-5] = XI_JMP;
+    if (as->loopinv) {  /* Inverted loop branch? */
+      /* asm_guardcc already inverted the jcc and patched the jmp. */
+      p -= 5;
+      newloop = target+4;
+      *(int32_t *)(p-4) = (int32_t)(target - p);  /* Patch jcc. */
+      if (as->loopinv == 2) {
+	*(int32_t *)(p-10) = (int32_t)(target - p + 6);  /* Patch opt. jp. */
+	newloop = target+8;
+      }
+    } else {  /* Otherwise just patch jmp. */
+      *(int32_t *)(p-4) = (int32_t)(target - p);
+      newloop = target+3;
+    }
+    /* Realign small loops and shorten the loop branch. */
+    if (newloop >= p - 128) {
+      as->realign = newloop;  /* Force a retry and remember alignment. */
+      as->curins = as->stopins;  /* Abort asm_trace now. */
+      as->T->nins = as->orignins;  /* Remove any added renames. */
+    }
+  }
+}
+
+/* Middle part of a loop: handle the transition from the variant to the
+** invariant part, shuffle PHI registers and fix up the loop branch.
+*/
+static void asm_loop(ASMState *as)
+{
+  /* LOOP is a guard, so the snapno is up to date. */
+  as->loopsnapno = as->snapno;
+  if (as->gcsteps)
+    asm_gc_check(as, &as->T->snap[as->loopsnapno]);
+  /* LOOP marks the transition from the variant to the invariant part. */
+  as->testmcp = as->invmcp = NULL;
+  as->sectref = 0;
+  if (!neverfuse(as)) as->fuseref = 0;
+  asm_phi_shuffle(as);
+  asm_loop_fixup(as);
+  as->mcloop = as->mcp;  /* Remember the loop entry point. */
+  RA_DBGX((as, "===== LOOP ====="));
+  if (!as->realign) RA_DBG_FLUSH();
+}
+
+/* -- Head of trace ------------------------------------------------------- */
+
+/* Rematerialize all remaining constants (and REF_BASE) in registers. */
+static void asm_const_remat(ASMState *as)
+{
+  RegSet remain = ~as->freeset & RSET_ALL;
+  while (remain) {
+    Reg r = rset_pickbot(remain);
+    IRRef cref = regcost_ref(as->cost[r]);
+    rset_clear(remain, r);
+    if (!(irref_isk(cref) || cref == REF_BASE))
+      continue;  /* Not a constant: leave it alone. */
+    ra_rematk(as, IR(cref));
+    checkmclim(as);
+  }
+}
+
+/* Head of a root trace. Emitted backwards; at runtime the stack pointer
+** is adjusted first, then the current trace number is stored to the
+** vmstate global. The adjustment is recorded in T->spadjust so side
+** traces can grow their frames relative to it (see asm_head_side).
+*/
+static void asm_head_root(ASMState *as)
+{
+  int32_t spadj;
+  emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
+  spadj = sps_adjust(as);
+  as->T->spadjust = (uint16_t)spadj;
+  emit_addptr(as, RID_ESP, -spadj);
+}
+
+/* Handle BASE coalescing for a root trace.
+** REF_BASE must be in a register, without a spill slot. If it was
+** allocated to a register other than RID_BASE, reserve RID_BASE and
+** emit a move from the incoming RID_BASE into that register.
+*/
+static void asm_head_base(ASMState *as)
+{
+  IRIns *ir = IR(REF_BASE);
+  Reg r = ir->r;
+  lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
+  ra_free(as, r);
+  if (r != RID_BASE) {
+    ra_scratch(as, RID2RSET(RID_BASE));
+    emit_rr(as, XO_MOV, r, RID_BASE);
+  }
+}
+
+/* Check Lua stack size for overflow at the start of a side trace.
+** Stack overflow is rare, so let the regular exit handling fix this up.
+** This is done in the context of the *parent* trace and parent exitno!
+*/
+static void asm_checkstack(ASMState *as, RegSet allow)
+{
+  /* Try to get an unused temp. register, otherwise spill/restore eax. */
+  Reg r = allow ? rset_pickbot(allow) : RID_EAX;
+  /* Emitted backwards; runtime order: (spill r), r = G->jit_L,
+  ** r = L->maxstack, r -= G->jit_base, compare r with 8*topslot,
+  ** (restore r), branch to the parent exit stub if below.
+  */
+  emit_jcc(as, CC_B, exitstub_addr(as->J, as->J->exitno));
+  if (allow == RSET_EMPTY)  /* Restore temp. register. */
+    emit_rmro(as, XO_MOV, r, RID_ESP, sps_scale(SPS_TEMP1));
+  emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*as->topslot));
+  emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, ptr2addr(&J2G(as->J)->jit_base));
+  emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack));
+  emit_getgl(as, r, jit_L);
+  if (allow == RSET_EMPTY)  /* Spill temp. register. */
+    emit_rmro(as, XO_MOVto, r, RID_ESP, sps_scale(SPS_TEMP1));
+}
+
+/* Head of a side trace.
+**
+** The current simplistic algorithm requires that all slots inherited
+** from the parent are live in a register between pass 2 and pass 3. This
+** avoids the complexity of stack slot shuffling. But of course this may
+** overflow the register set in some cases and cause the dreaded error:
+** "NYI: register coalescing too complex". A refined algorithm is needed.
+*/
+static void asm_head_side(ASMState *as)
+{
+  IRRef1 sloadins[RID_MAX];
+  RegSet allow = RSET_ALL;  /* Inverse of all coalesced registers. */
+  RegSet live = RSET_EMPTY;  /* Live parent registers. */
+  int32_t spadj, spdelta;
+  int pass2 = 0;  /* Any SLOADs marked for reload into a spill slot? */
+  int pass3 = 0;  /* Any registers to restore from parent spill slots? */
+  IRRef i;
+
+  /* Scan all parent SLOADs and collect register dependencies. */
+  for (i = as->curins; i > REF_BASE; i--) {
+    IRIns *ir = IR(i);
+    lua_assert((ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT)) ||
+	       ir->o == IR_FRAME);
+    if (ir->o == IR_SLOAD) {
+      RegSP rs = as->parentmap[ir->op1];  /* Parent reg/spill for this slot. */
+      if (ra_hasreg(ir->r)) {
+	rset_clear(allow, ir->r);
+	if (ra_hasspill(ir->s))
+	  ra_save(as, ir, ir->r);
+      } else if (ra_hasspill(ir->s)) {
+	irt_setmark(ir->t);  /* Reloaded in pass 2 below. */
+	pass2 = 1;
+      }
+      if (ir->r == rs) {  /* Coalesce matching registers right now. */
+	ra_free(as, ir->r);
+      } else if (ra_hasspill(regsp_spill(rs))) {
+	if (ra_hasreg(ir->r))
+	  pass3 = 1;
+      } else if (ra_used(ir)) {
+	sloadins[rs] = (IRRef1)i;
+	rset_set(live, rs);  /* Block live parent register. */
+      }
+    }
+  }
+
+  /* Calculate stack frame adjustment. */
+  spadj = sps_adjust(as);
+  spdelta = spadj - (int32_t)as->parent->spadjust;
+  if (spdelta < 0) {  /* Don't shrink the stack frame. */
+    spadj = (int32_t)as->parent->spadjust;
+    spdelta = 0;
+  }
+  as->T->spadjust = (uint16_t)spadj;
+
+  /* Reload spilled target registers. */
+  if (pass2) {
+    for (i = as->curins; i > REF_BASE; i--) {
+      IRIns *ir = IR(i);
+      if (irt_ismarked(ir->t)) {
+	RegSet mask;
+	Reg r;
+	RegSP rs;
+	irt_clearmark(ir->t);
+	rs = as->parentmap[ir->op1];
+	if (!ra_hasspill(regsp_spill(rs)))
+	  ra_sethint(ir->r, rs);  /* Hint may be gone, set it again. */
+	else if (sps_scale(regsp_spill(rs))+spdelta == sps_scale(ir->s))
+	  continue;  /* Same spill slot, do nothing. */
+	mask = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & allow;
+	if (mask == RSET_EMPTY)
+	  lj_trace_err(as->J, LJ_TRERR_NYICOAL);
+	r = ra_allocref(as, i, mask);
+	ra_save(as, ir, r);
+	rset_clear(allow, r);
+	if (r == rs) {  /* Coalesce matching registers right now. */
+	  ra_free(as, r);
+	  rset_clear(live, r);
+	} else if (ra_hasspill(regsp_spill(rs))) {
+	  pass3 = 1;
+	}
+	checkmclim(as);
+      }
+    }
+  }
+
+  /* Store trace number and adjust stack frame relative to the parent. */
+  emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
+  emit_addptr(as, RID_ESP, -spdelta);
+
+  /* Restore target registers from parent spill slots. */
+  if (pass3) {
+    RegSet work = ~as->freeset & RSET_ALL;
+    while (work) {
+      Reg r = rset_pickbot(work);
+      IRIns *ir = IR(regcost_ref(as->cost[r]));
+      RegSP rs = as->parentmap[ir->op1];
+      rset_clear(work, r);
+      if (ra_hasspill(regsp_spill(rs))) {
+	int32_t ofs = sps_scale(regsp_spill(rs));
+	ra_free(as, r);
+	emit_movrmro(as, r, RID_ESP, ofs);
+	checkmclim(as);
+      }
+    }
+  }
+
+  /* Shuffle registers to match up target regs with parent regs. */
+  for (;;) {
+    RegSet work;
+
+    /* Repeatedly coalesce free live registers by moving to their target. */
+    while ((work = as->freeset & live) != RSET_EMPTY) {
+      Reg rp = rset_pickbot(work);
+      IRIns *ir = IR(sloadins[rp]);
+      rset_clear(live, rp);
+      rset_clear(allow, rp);
+      ra_free(as, ir->r);
+      emit_movrr(as, ir->r, rp);
+      checkmclim(as);
+    }
+
+    /* We're done if no live registers remain. */
+    if (live == RSET_EMPTY)
+      break;
+
+    /* Break cycles by renaming one target to a temp. register. */
+    if (live & RSET_GPR) {
+      RegSet tmpset = as->freeset & ~live & allow & RSET_GPR;
+      if (tmpset == RSET_EMPTY)
+	lj_trace_err(as->J, LJ_TRERR_NYICOAL);
+      ra_rename(as, rset_pickbot(live & RSET_GPR), rset_pickbot(tmpset));
+    }
+    if (live & RSET_FPR) {
+      RegSet tmpset = as->freeset & ~live & allow & RSET_FPR;
+      if (tmpset == RSET_EMPTY)
+	lj_trace_err(as->J, LJ_TRERR_NYICOAL);
+      ra_rename(as, rset_pickbot(live & RSET_FPR), rset_pickbot(tmpset));
+    }
+    checkmclim(as);
+    /* Continue with coalescing to fix up the broken cycle(s). */
+  }
+
+  /* Check Lua stack size if frames have been added. */
+  if (as->topslot)
+    asm_checkstack(as, allow & RSET_GPR);
+}
+
+/* -- Tail of trace ------------------------------------------------------- */
+
+/* Sync Lua stack slots to match the last snapshot.
+** Note: code generation is backwards, so this is best read bottom-up.
+*/
+static void asm_tail_sync(ASMState *as)
+{
+  SnapShot *snap = &as->T->snap[as->T->nsnap-1];  /* Last snapshot. */
+  BCReg s, nslots = snap->nslots;
+  IRRef2 *map = &as->T->snapmap[snap->mapofs];
+  IRRef2 *flinks = map + nslots + snap->nframelinks;  /* Consumed backwards. */
+  BCReg newbase = 0;
+  BCReg secondbase = ~(BCReg)0;
+  BCReg topslot = 0;
+
+  checkmclim(as);
+  ra_allocref(as, REF_BASE, RID2RSET(RID_BASE));
+
+  /* Must check all frames to find topslot (outer can be larger than inner). */
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = snap_ref(map[s]);
+    if (!irref_isk(ref)) {
+      IRIns *ir = IR(ref);
+      if (ir->o == IR_FRAME && irt_isfunc(ir->t)) {
+	GCfunc *fn = ir_kfunc(IR(ir->op2));
+	if (isluafunc(fn)) {
+	  BCReg fs = s + funcproto(fn)->framesize;
+	  newbase = s;
+	  if (secondbase == ~(BCReg)0) secondbase = s;
+	  if (fs > topslot) topslot = fs;
+	}
+      }
+    }
+  }
+  as->topslot = topslot;  /* Used in asm_head_side(). */
+
+  if (as->T->link == TRACE_INTERP) {
+    /* Setup fixed registers for exit to interpreter. */
+    emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch);
+    emit_loadi(as, RID_PC, (int32_t)map[nslots]);
+  } else if (newbase) {
+    /* Save modified BASE for linking to trace with higher start frame. */
+    emit_setgl(as, RID_BASE, jit_base);
+  }
+
+  emit_addptr(as, RID_BASE, 8*(int32_t)newbase);
+
+  /* Clear stack slots of newly added frames. */
+  if (nslots <= topslot) {
+    if (nslots < topslot) {  /* Several slots: set them all from RID_EAX. */
+      for (s = nslots; s <= topslot; s++) {
+	emit_movtomro(as, RID_EAX, RID_BASE, 8*(int32_t)s-4);
+	checkmclim(as);
+      }
+      emit_loadi(as, RID_EAX, LJ_TNIL);
+    } else {  /* A single slot: store the nil tag directly. */
+      emit_movmroi(as, RID_BASE, 8*(int32_t)nslots-4, LJ_TNIL);
+    }
+  }
+
+  /* Store the value of all modified slots to the Lua stack. */
+  for (s = 0; s < nslots; s++) {
+    int32_t ofs = 8*((int32_t)s-1);  /* Slot s lives at BASE + 8*(s-1). */
+    IRRef ref = snap_ref(map[s]);
+    if (ref) {
+      IRIns *ir = IR(ref);
+      /* No need to restore readonly slots and unmodified non-parent slots. */
+      if (ir->o == IR_SLOAD && ir->op1 == s &&
+	  (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT)
+	continue;
+      if (irt_isnum(ir->t)) {
+	Reg src = ra_alloc1(as, ref, RSET_FPR);
+	emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
+      } else if (ir->o == IR_FRAME) {
+	emit_movmroi(as, RID_BASE, ofs, ptr2addr(ir_kgc(IR(ir->op2))));
+	if (s != 0)  /* Do not overwrite link to previous frame. */
+	  emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*--flinks));
+      } else {
+	lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t));
+	if (!irref_isk(ref)) {
+	  Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
+	  emit_movtomro(as, src, RID_BASE, ofs);
+	} else if (!irt_ispri(ir->t)) {
+	  emit_movmroi(as, RID_BASE, ofs, ir->i);
+	}
+	emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));  /* Type tag. */
+      }
+    } else if (s > secondbase) {
+      emit_movmroi(as, RID_BASE, ofs+4, LJ_TNIL);
+    }
+    checkmclim(as);
+  }
+  lua_assert(map + nslots == flinks-1);
+}
+
+/* Fixup the tail code.
+** Patches the reserved bytes at the end of the trace with the final
+** stack adjustment (add/lea) and the exit branch to the link target.
+*/
+static void asm_tail_fixup(ASMState *as, TraceNo lnk)
+{
+  /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
+  MCode *p = as->mctop;
+  MCode *target, *q;
+  int32_t spadj = as->T->spadjust;
+  if (spadj == 0) {
+    /* No adjustment: drop the reserved bytes entirely
+    ** (7 for lea esp,[esp+imm32], 6 for add esp,imm32 -- see lj_asm_trace).
+    */
+    p -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6;
+  } else {
+    MCode *p1;
+    /* Patch stack adjustment. */
+    if (checki8(spadj)) {
+      p -= 3;  /* imm8 form is 3 bytes shorter than the imm32 form. */
+      p1 = p-6;
+      *p1 = (MCode)spadj;
+    } else {
+      p1 = p-9;
+      *(int32_t *)p1 = spadj;
+    }
+    if ((as->flags & JIT_F_LEA_AGU)) {  /* lea esp, [esp+spadj] */
+      p1[-3] = (MCode)XI_LEA;
+      p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
+      p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
+    } else {  /* add esp, spadj */
+      p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
+      p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
+    }
+  }
+  /* Patch exit branch (5-byte near jmp, displacement relative to its end). */
+  target = lnk == TRACE_INTERP ? (MCode *)lj_vm_exit_interp :
+				 as->J->trace[lnk]->mcode;
+  *(int32_t *)(p-4) = (int32_t)(target - p);
+  p[-5] = XI_JMP;
+  /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
+  for (q = as->mctop-1; q >= p; q--)
+    *q = XI_NOP;
+  as->mctop = p;
+}
+
+/* -- Instruction dispatch ------------------------------------------------ */
+
+/* Assemble a single instruction.
+** Dispatches on the IR opcode and emits the corresponding machine code.
+** Opcodes without a handler abort the trace with LJ_TRERR_NYIIR.
+*/
+static void asm_ir(ASMState *as, IRIns *ir)
+{
+  switch ((IROp)ir->o) {
+  /* Miscellaneous ops. */
+  case IR_LOOP: asm_loop(as); break;
+  case IR_NOP: break;
+  case IR_PHI: asm_phi(as, ir); break;
+
+  /* Guarded assertions. */
+  case IR_LT:  asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break;
+  case IR_GE:  asm_comp(as, ir, CC_L,  CC_B,  0); break;
+  case IR_LE:  asm_comp(as, ir, CC_G,  CC_A,  VCC_PS); break;
+  case IR_GT:  asm_comp(as, ir, CC_LE, CC_BE, 0); break;
+  case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break;
+  case IR_UGE: asm_comp(as, ir, CC_B,  CC_B,  VCC_U|VCC_PS); break;
+  case IR_ULE: asm_comp(as, ir, CC_A,  CC_A,  VCC_U); break;
+  case IR_ABC:
+  case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
+
+  case IR_FRAME:
+    if (ir->op1 == ir->op2) break;  /* No check needed for placeholder. */
+    /* fallthrough */
+  case IR_EQ:  asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
+  case IR_NE:  asm_comp(as, ir, CC_E,  CC_E,  VCC_U|VCC_P); break;
+
+  /* Bit ops. */
+  case IR_BNOT: asm_bitnot(as, ir); break;
+  case IR_BSWAP: asm_bitswap(as, ir); break;
+
+  case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
+  case IR_BOR:  asm_intarith(as, ir, XOg_OR); break;
+  case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;
+
+  case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
+  case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
+  case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
+  case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
+  case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;
+
+  /* Arithmetic ops. */
+  case IR_ADD: asm_add(as, ir); break;
+  case IR_SUB:
+    if (irt_isnum(ir->t))
+      asm_fparith(as, ir, XO_SUBSD);
+    else  /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
+      asm_intarith(as, ir, XOg_SUB);
+    break;
+  case IR_MUL: asm_fparith(as, ir, XO_MULSD); break;
+  case IR_DIV: asm_fparith(as, ir, XO_DIVSD); break;
+
+  case IR_NEG: asm_fparith(as, ir, XO_XORPS); break;
+  case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;
+
+  case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
+  case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
+
+  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI:
+    asm_fpmath(as, ir);
+    break;
+
+  /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
+  case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
+  case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;
+
+  /* Memory references. */
+  case IR_AREF: asm_aref(as, ir); break;
+  case IR_HREF: asm_href(as, ir); break;
+  case IR_HREFK: asm_hrefk(as, ir); break;
+  case IR_NEWREF: asm_newref(as, ir); break;
+  case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
+  case IR_FREF: asm_fref(as, ir); break;
+  case IR_STRREF: asm_strref(as, ir); break;
+
+  /* Loads and stores. */
+  case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: asm_ahuload(as, ir); break;
+  case IR_FLOAD: asm_fload(as, ir); break;
+  case IR_SLOAD: asm_sload(as, ir); break;
+  case IR_XLOAD: asm_xload(as, ir); break;
+
+  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
+  case IR_FSTORE: asm_fstore(as, ir); break;
+
+  /* String ops. */
+  case IR_SNEW: asm_snew(as, ir); break;
+
+  /* Table ops. */
+  case IR_TNEW: asm_tnew(as, ir); break;
+  case IR_TDUP: asm_tdup(as, ir); break;
+  case IR_TLEN: asm_tlen(as, ir); break;
+  case IR_TBAR: asm_tbar(as, ir); break;
+  case IR_OBAR: asm_obar(as, ir); break;
+
+  /* Type conversions. */
+  case IR_TONUM: asm_tonum(as, ir); break;
+  case IR_TOINT:
+    if (irt_isguard(ir->t))
+      asm_tointg(as, ir, ra_alloc1(as, ir->op1, RSET_FPR));
+    else
+      asm_toint(as, ir);
+    break;
+  case IR_TOBIT: asm_tobit(as, ir); break;
+  case IR_TOSTR: asm_tostr(as, ir); break;
+  case IR_STRTO: asm_strto(as, ir); break;
+
+  default:
+    setintV(&as->J->errinfo, ir->o);
+    lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
+    break;
+  }
+}
+
+/* Assemble a trace in linear backwards order. */
+static void asm_trace(ASMState *as)
+{
+  while (--as->curins > as->stopins) {
+    IRIns *ir = IR(as->curins);
+    if (irt_isguard(ir->t)) {
+      asm_snap_prep(as);  /* Prepare the snapshot for guarded instructions. */
+    } else if ((as->flags & JIT_F_OPT_DCE) &&
+	       !ra_used(ir) && !irm_sideeff(lj_ir_mode[ir->o])) {
+      continue;  /* Dead-code elimination can be soooo easy. */
+    }
+    RA_DBG_REF();
+    checkmclim(as);
+    asm_ir(as, ir);
+  }
+}
+
+/* -- Trace setup --------------------------------------------------------- */
+
+/* Clear reg/sp for all instructions and add register hints.
+** Runs one forward pass over the IR before the backwards assembly pass.
+** Also determines as->stopins, the last instruction assembly can skip to.
+*/
+static void asm_setup_regsp(ASMState *as, Trace *T)
+{
+  IRRef i, nins;
+  int inloop;
+
+  /* Clear reg/sp for constants. */
+  for (i = T->nk; i < REF_BIAS; i++)
+    IR(i)->prev = REGSP_INIT;
+
+  /* REF_BASE is used for implicit references to the BASE register. */
+  IR(REF_BASE)->prev = REGSP_HINT(RID_BASE);
+
+  nins = T->nins;
+  if (IR(nins-1)->o == IR_RENAME) {
+    do { nins--; } while (IR(nins-1)->o == IR_RENAME);
+    T->nins = nins;  /* Remove any renames left over from ASM restart. */
+  }
+  as->snaprename = nins;
+  as->snapref = nins;
+  as->snapno = T->nsnap;
+
+  as->stopins = REF_BASE;
+  as->orignins = nins;
+  as->curins = nins;
+
+  inloop = 0;
+  for (i = REF_FIRST; i < nins; i++) {
+    IRIns *ir = IR(i);
+    switch (ir->o) {
+    case IR_LOOP:
+      inloop = 1;
+      break;
+    /* Set hints for slot loads from a parent trace. */
+    case IR_SLOAD:
+      if ((ir->op2 & IRSLOAD_PARENT)) {
+	RegSP rs = as->parentmap[ir->op1];
+	lua_assert(regsp_used(rs));
+	as->stopins = i;
+	if (!ra_hasspill(regsp_spill(rs)) && ra_hasreg(regsp_reg(rs))) {
+	  /* Hint at the register the parent trace left the slot in. */
+	  ir->prev = (uint16_t)REGSP_HINT(regsp_reg(rs));
+	  continue;
+	}
+      }
+      break;
+    case IR_FRAME:
+      /* op1 == op2 marks a placeholder frame (see asm_ir); skippable. */
+      if (i == as->stopins+1 && ir->op1 == ir->op2)
+	as->stopins++;
+      break;
+    /* C calls evict all scratch regs and return results in RID_RET. */
+    case IR_SNEW: case IR_TNEW: case IR_TDUP: case IR_TLEN: case IR_TOSTR:
+    case IR_NEWREF:
+      ir->prev = REGSP_HINT(RID_RET);
+      if (inloop)
+	as->modset = RSET_SCRATCH;
+      continue;
+    case IR_STRTO: case IR_OBAR:
+      if (inloop)
+	as->modset = RSET_SCRATCH;
+      break;
+    /* Ordered string compares evict all integer scratch registers. */
+    case IR_LT: case IR_GE: case IR_LE: case IR_GT:
+      if (irt_isstr(ir->t) && inloop)
+	as->modset |= (RSET_SCRATCH & RSET_GPR);
+      break;
+    /* Non-constant shift counts need to be in RID_ECX. */
+    case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
+      if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r))
+	IR(ir->op2)->r = REGSP_HINT(RID_ECX);
+      break;
+    /* Do not propagate hints across type conversions. */
+    case IR_TONUM: case IR_TOINT: case IR_TOBIT:
+      break;
+    default:
+      /* Propagate hints across likely 'op reg, imm' or 'op reg'. */
+      if (irref_isk(ir->op2) && !irref_isk(ir->op1)) {
+	ir->prev = IR(ir->op1)->prev;
+	continue;
+      }
+      break;
+    }
+    ir->prev = REGSP_INIT;
+  }
+}
+
+/* -- Assembler core ------------------------------------------------------ */
+
+/* Define this if you want to run LuaJIT with Valgrind. */
+#ifdef LUAJIT_USE_VALGRIND
+#include <valgrind/valgrind.h>
+/* Tell Valgrind to discard its cached translations of (re)written mcode. */
+#define VG_INVALIDATE(p, sz)	VALGRIND_DISCARD_TRANSLATIONS(p, sz)
+#else
+#define VG_INVALIDATE(p, sz)	((void)0)
+#endif
+
+/* Assemble a trace.
+** Drives the whole backwards assembly: setup, tail sync, instruction
+** dispatch, head generation and final tail fixup.
+*/
+void lj_asm_trace(jit_State *J, Trace *T)
+{
+  ASMState as_;
+  ASMState *as = &as_;
+
+  /* Setup initial state. Copy some fields to reduce indirections. */
+  as->J = J;
+  as->T = T;
+  as->ir = T->ir;
+  as->flags = J->flags;
+  as->loopref = J->loopref;
+  as->realign = NULL;
+  as->loopinv = 0;
+  if (J->parent) {
+    as->parent = J->trace[J->parent];
+    /* Fill parentmap with per-slot reg/spill state at the parent exit. */
+    lj_snap_regspmap(as->parentmap, as->parent, J->exitno);
+  } else {
+    as->parent = NULL;
+  }
+  as->mctop = lj_mcode_reserve(J, &as->mcbot);  /* Reserve MCode memory. */
+  as->mcp = as->mctop;
+  as->mclim = as->mcbot + MCLIM_REDZONE;
+  asm_exitstub_setup(as, T->nsnap);
+
+  do {
+    as->mcp = as->mctop;
+    as->curins = T->nins;
+    RA_DBG_START();
+    RA_DBGX((as, "===== STOP ====="));
+    /* Realign and leave room for backwards loop branch or exit branch. */
+    if (as->realign) {
+      int i = ((int)(intptr_t)as->realign) & 15;
+      MCode *p = as->mctop;
+      /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
+      while (i-- > 0)
+	*--p = XI_NOP;
+      as->mctop = p;
+      as->mcp = p - (as->loopinv ? 5 : 2);  /* Space for short/near jmp. */
+    } else {
+      as->mcp = as->mctop - 5;  /* Space for exit branch (near jmp). */
+    }
+    as->invmcp = as->mcp;
+    as->mcloop = NULL;
+    as->testmcp = NULL;
+    as->topslot = 0;
+    as->gcsteps = 0;
+    as->sectref = as->loopref;
+    as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;
+
+    /* Setup register allocation. */
+    ra_setup(as);
+    asm_setup_regsp(as, T);
+
+    if (!as->loopref) {
+      /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
+      as->mcp -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6;
+      as->invmcp = NULL;
+      asm_tail_sync(as);
+    }
+    asm_trace(as);
+  } while (as->realign);  /* Retry in case the MCode needs to be realigned. */
+
+  /* Emit head of trace (backwards, so this code runs first at runtime). */
+  RA_DBG_REF();
+  checkmclim(as);
+  if (as->gcsteps)
+    asm_gc_check(as, &as->T->snap[0]);
+  if (!J->parent)
+    asm_head_base(as);
+  asm_const_remat(as);
+  if (J->parent)
+    asm_head_side(as);
+  else
+    asm_head_root(as);
+  asm_phi_fixup(as);
+
+  RA_DBGX((as, "===== START ===="));
+  RA_DBG_FLUSH();
+  if (as->freeset != RSET_ALL)
+    lj_trace_err(as->J, LJ_TRERR_BADRA);  /* Ouch! Should never happen. */
+
+  /* Set trace entry point before fixing up tail to allow link to self. */
+  T->mcode = as->mcp;
+  T->mcloop = as->mcloop ? (MSize)(as->mcloop - as->mcp) : 0;
+  if (!as->loopref)
+    asm_tail_fixup(as, T->link);  /* Note: this may change as->mctop! */
+  T->szmcode = (MSize)(as->mctop - as->mcp);
+  VG_INVALIDATE(T->mcode, T->szmcode);
+}
+
+/* Patch exit jumps of existing machine code to a new target.
+** Scans the trace's mcode for branches to the exit stub of 'exitno'
+** and redirects them to 'target'.
+*/
+void lj_asm_patchexit(jit_State *J, Trace *T, ExitNo exitno, MCode *target)
+{
+  MCode *p = T->mcode;
+  MCode *mcarea = lj_mcode_patch(J, p, 0);
+  MSize len = T->szmcode;
+  /* Branch displacements are relative to the end of the 6-byte Jcc. */
+  MCode *px = exitstub_addr(J, exitno) - 6;
+  MCode *pe = p+len-6;
+  /* Patch the trailing near jmp (5 bytes), if it goes to this exit stub. */
+  if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
+    *(int32_t *)(p+len-4) = (int32_t)(target - (p+len));
+  for (; p < pe; p++) {
+    /* 0x800f matches the little-endian bytes 0F 8x: a near Jcc (rel32). */
+    if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) {
+      *(int32_t *)(p+2) = (int32_t)(target - (p+6));
+      p += 5;
+    }
+  }
+  lj_mcode_patch(J, mcarea, 1);
+  VG_INVALIDATE(T->mcode, T->szmcode);
+}
+
+#undef IR
+
+#endif

+ 17 - 0
src/lj_asm.h

@@ -0,0 +1,17 @@
+/*
+** IR assembler (SSA IR -> machine code).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_ASM_H
+#define _LJ_ASM_H
+
+#include "lj_jit.h"
+
+#if LJ_HASJIT
+/* Assemble a trace into machine code. */
+LJ_FUNC void lj_asm_trace(jit_State *J, Trace *T);
+/* Redirect all exit jumps for 'exitno' in a trace to a new target. */
+LJ_FUNC void lj_asm_patchexit(jit_State *J, Trace *T, ExitNo exitno,
+			      MCode *target);
+#endif
+
+#endif

+ 17 - 0
src/lj_bc.c

@@ -0,0 +1,17 @@
+/*
+** Bytecode instruction modes.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_bc_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_bc.h"
+
+/* Bytecode instruction modes. Packed per-opcode via BCMODE (see lj_bc.h). */
+LJ_DATADEF const uint16_t lj_bc_mode[BC__MAX+1] = {
+BCDEF(BCMODE)
+  0  /* Trailing sentinel fills the BC__MAX slot. */
+};
+

+ 235 - 0
src/lj_bc.h

@@ -0,0 +1,235 @@
+/*
+** Bytecode instruction format.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_BC_H
+#define _LJ_BC_H
+
+#include "lj_def.h"
+#include "lj_arch.h"
+
+/* Bytecode instruction format, 32 bit wide, fields of 8 or 16 bit:
+**
+** +----+----+----+----+
+** | B  | C  | A  | OP | Format ABC
+** +----+----+----+----+
+** |    D    | A  | OP | Format AD
+** +--------------------
+** MSB               LSB
+**
+** In-memory instructions are always stored in host byte order.
+*/
+
+/* Operand ranges and related constants. */
+#define BCMAX_A		0xff
+#define BCMAX_B		0xff
+#define BCMAX_C		0xff
+#define BCMAX_D		0xffff
+#define BCBIAS_J	0x8000	/* Jump offsets are stored biased by 0x8000. */
+#define NO_REG		BCMAX_A
+#define NO_JMP		(~(BCPos)0)
+
+/* Macros to get instruction fields. */
+#define bc_op(i)	(cast(BCOp, (i)&0xff))
+#define bc_a(i)		(cast(BCReg, ((i)>>8)&0xff))
+#define bc_b(i)		(cast(BCReg, (i)>>24))
+#define bc_c(i)		(cast(BCReg, ((i)>>16)&0xff))
+#define bc_d(i)		(cast(BCReg, (i)>>16))
+#define bc_j(i)		((ptrdiff_t)bc_d(i)-BCBIAS_J)
+
+/* Macros to set instruction fields. */
+#define setbc_byte(p, x, ofs) \
+  ((uint8_t *)(p))[LJ_ENDIAN_SELECT(ofs, 3-ofs)] = cast_byte(x)
+#define setbc_op(p, x)	setbc_byte(p, (x), 0)
+#define setbc_a(p, x)	setbc_byte(p, (x), 1)
+#define setbc_b(p, x)	setbc_byte(p, (x), 3)
+#define setbc_c(p, x)	setbc_byte(p, (x), 2)
+#define setbc_d(p, x) \
+  ((uint16_t *)(p))[LJ_ENDIAN_SELECT(1, 0)] = cast(uint16_t, (x))
+#define setbc_j(p, x)	setbc_d(p, (BCPos)((int32_t)(x)+BCBIAS_J))
+
+/* Macros to compose instructions. */
+#define BCINS_ABC(o, a, b, c) \
+  (cast(BCIns, o)|(cast(BCIns, a)<<8)|\
+  (cast(BCIns, b)<<24)|(cast(BCIns, c)<<16))
+#define BCINS_AD(o, a, d) \
+  (cast(BCIns, o)|(cast(BCIns, a)<<8)|(cast(BCIns, d)<<16))
+#define BCINS_AJ(o, a, j)	BCINS_AD(o, a, (BCPos)((int32_t)(j)+BCBIAS_J))
+
+/* Bytecode instruction definition. Order matters, see below.
+**
+** (name, filler, Amode, Bmode, Cmode or Dmode, metamethod)
+**
+** The opcode name suffixes specify the type for RB/RC or RD:
+** V = variable slot
+** S = string const
+** N = number const
+** P = primitive type (~itype)
+** B = unsigned byte literal
+** M = multiple args/results
+*/
+#define BCDEF(_) \
+  /* Comparison ops. ORDER OPR. */ \
+  _(ISLT,	var,	___,	var,	lt) \
+  _(ISGE,	var,	___,	var,	lt) \
+  _(ISLE,	var,	___,	var,	le) \
+  _(ISGT,	var,	___,	var,	le) \
+  \
+  _(ISEQV,	var,	___,	var,	eq) \
+  _(ISNEV,	var,	___,	var,	eq) \
+  _(ISEQS,	var,	___,	str,	eq) \
+  _(ISNES,	var,	___,	str,	eq) \
+  _(ISEQN,	var,	___,	num,	eq) \
+  _(ISNEN,	var,	___,	num,	eq) \
+  _(ISEQP,	var,	___,	pri,	eq) \
+  _(ISNEP,	var,	___,	pri,	eq) \
+  \
+  /* Unary test and copy ops. */ \
+  _(ISTC,	dst,	___,	var,	___) \
+  _(ISFC,	dst,	___,	var,	___) \
+  _(IST,	___,	___,	var,	___) \
+  _(ISF,	___,	___,	var,	___) \
+  \
+  /* Unary ops. */ \
+  _(MOV,	dst,	___,	var,	___) \
+  _(NOT,	dst,	___,	var,	___) \
+  _(UNM,	dst,	___,	var,	unm) \
+  _(LEN,	dst,	___,	var,	len) \
+  \
+  /* Binary ops. ORDER OPR. VV last, POW must be next. */ \
+  _(ADDVN,	dst,	var,	num,	add) \
+  _(SUBVN,	dst,	var,	num,	sub) \
+  _(MULVN,	dst,	var,	num,	mul) \
+  _(DIVVN,	dst,	var,	num,	div) \
+  _(MODVN,	dst,	var,	num,	mod) \
+  \
+  _(ADDNV,	dst,	var,	num,	add) \
+  _(SUBNV,	dst,	var,	num,	sub) \
+  _(MULNV,	dst,	var,	num,	mul) \
+  _(DIVNV,	dst,	var,	num,	div) \
+  _(MODNV,	dst,	var,	num,	mod) \
+  \
+  _(ADDVV,	dst,	var,	var,	add) \
+  _(SUBVV,	dst,	var,	var,	sub) \
+  _(MULVV,	dst,	var,	var,	mul) \
+  _(DIVVV,	dst,	var,	var,	div) \
+  _(MODVV,	dst,	var,	var,	mod) \
+  \
+  _(POW,	dst,	var,	var,	pow) \
+  _(CAT,	dst,	rbase,	rbase,	concat) \
+  \
+  /* Constant ops. */ \
+  _(KSTR,	dst,	___,	str,	___) \
+  _(KSHORT,	dst,	___,	lits,	___) \
+  _(KNUM,	dst,	___,	num,	___) \
+  _(KPRI,	dst,	___,	pri,	___) \
+  _(KNIL,	base,	___,	base,	___) \
+  \
+  /* Upvalue and function ops. */ \
+  _(UGET,	dst,	___,	uv,	___) \
+  _(USETV,	uv,	___,	var,	___) \
+  _(USETS,	uv,	___,	str,	___) \
+  _(USETN,	uv,	___,	num,	___) \
+  _(USETP,	uv,	___,	pri,	___) \
+  _(UCLO,	rbase,	___,	jump,	___) \
+  _(FNEW,	dst,	___,	func,	gc) \
+  \
+  /* Table ops. */ \
+  _(TNEW,	dst,	___,	lit,	gc) \
+  _(TDUP,	dst,	___,	tab,	gc) \
+  _(GGET,	dst,	___,	str,	index) \
+  _(GSET,	var,	___,	str,	newindex) \
+  _(TGETV,	dst,	var,	var,	index) \
+  _(TGETS,	dst,	var,	str,	index) \
+  _(TGETB,	dst,	var,	lit,	index) \
+  _(TSETV,	var,	var,	var,	newindex) \
+  _(TSETS,	var,	var,	str,	newindex) \
+  _(TSETB,	var,	var,	lit,	newindex) \
+  _(TSETM,	base,	___,	num,	newindex) \
+  \
+  /* Calls and vararg handling. T = tail call. */ \
+  _(CALLM,	base,	lit,	lit,	call) \
+  _(CALL,	base,	lit,	lit,	call) \
+  _(CALLMT,	base,	___,	lit,	call) \
+  _(CALLT,	base,	___,	lit,	call) \
+  _(ITERC,	base,	lit,	lit,	call) \
+  _(VARG,	base,	lit,	lit,	___) \
+  \
+  /* Returns. */ \
+  _(RETM,	base,	___,	lit,	___) \
+  _(RET,	rbase,	___,	lit,	___) \
+  _(RET0,	rbase,	___,	lit,	___) \
+  _(RET1,	rbase,	___,	lit,	___) \
+  \
+  /* Loops and branches. I/J = interp/JIT, I/C/L = init/call/loop. */ \
+  _(FORI,	base,	___,	jump,	___) \
+  _(JFORI,	base,	___,	jump,	___) \
+  \
+  _(FORL,	base,	___,	jump,	___) \
+  _(IFORL,	base,	___,	jump,	___) \
+  _(JFORL,	base,	___,	lit,	___) \
+  \
+  _(ITERL,	base,	___,	jump,	___) \
+  _(IITERL,	base,	___,	jump,	___) \
+  _(JITERL,	base,	___,	lit,	___) \
+  \
+  _(LOOP,	rbase,	___,	jump,	___) \
+  _(ILOOP,	rbase,	___,	jump,	___) \
+  _(JLOOP,	rbase,	___,	lit,	___) \
+  \
+  _(JMP,	rbase,	___,	jump,	___)
+
+/* Bytecode opcode numbers. */
+typedef enum {
+#define BCENUM(name, ma, mb, mc, mt)	BC_##name,
+BCDEF(BCENUM)
+#undef BCENUM
+  BC__MAX
+} BCOp;
+
+/* The assembler VM relies on these opcode relationships (see above ORDER
+** comments): negated compares differ only in bit 0, and the I/J loop
+** variants directly follow their hot-counting base opcode.
+*/
+LJ_STATIC_ASSERT((int)BC_ISEQV+1 == (int)BC_ISNEV);
+LJ_STATIC_ASSERT(((int)BC_ISEQV^1) == (int)BC_ISNEV);
+LJ_STATIC_ASSERT(((int)BC_ISEQS^1) == (int)BC_ISNES);
+LJ_STATIC_ASSERT(((int)BC_ISEQN^1) == (int)BC_ISNEN);
+LJ_STATIC_ASSERT(((int)BC_ISEQP^1) == (int)BC_ISNEP);
+LJ_STATIC_ASSERT(((int)BC_ISLT^1) == (int)BC_ISGE);
+LJ_STATIC_ASSERT(((int)BC_ISLE^1) == (int)BC_ISGT);
+LJ_STATIC_ASSERT(((int)BC_ISLT^3) == (int)BC_ISGT);
+LJ_STATIC_ASSERT((int)BC_IST-(int)BC_ISTC == (int)BC_ISF-(int)BC_ISFC);
+LJ_STATIC_ASSERT((int)BC_CALLT-(int)BC_CALL == (int)BC_CALLMT-(int)BC_CALLM);
+LJ_STATIC_ASSERT((int)BC_CALLMT + 1 == (int)BC_CALLT);
+LJ_STATIC_ASSERT((int)BC_RETM + 1 == (int)BC_RET);
+LJ_STATIC_ASSERT((int)BC_FORL + 1 == (int)BC_IFORL);
+LJ_STATIC_ASSERT((int)BC_FORL + 2 == (int)BC_JFORL);
+LJ_STATIC_ASSERT((int)BC_ITERL + 1 == (int)BC_IITERL);
+LJ_STATIC_ASSERT((int)BC_ITERL + 2 == (int)BC_JITERL);
+LJ_STATIC_ASSERT((int)BC_LOOP + 1 == (int)BC_ILOOP);
+LJ_STATIC_ASSERT((int)BC_LOOP + 2 == (int)BC_JLOOP);
+
+/* Stack slots used by FORI/FORL, relative to operand A. */
+enum {
+  FORL_IDX, FORL_STOP, FORL_STEP, FORL_EXT
+};
+
+/* Bytecode operand modes. ORDER BCMode */
+typedef enum {
+  BCMnone, BCMdst, BCMbase, BCMvar, BCMrbase, BCMuv,  /* Mode A must be <= 7 */
+  BCMlit, BCMlits, BCMpri, BCMnum, BCMstr, BCMtab, BCMfunc, BCMjump,
+  BCM_max
+} BCMode;
+#define BCM___		BCMnone
+
+/* Mode word layout: A in bits 0-2, B in bits 3-6, C/D in 7-10, MM in 11+. */
+#define bcmode_a(op)	(cast(BCMode, lj_bc_mode[op] & 7))
+#define bcmode_b(op)	(cast(BCMode, (lj_bc_mode[op]>>3) & 15))
+#define bcmode_c(op)	(cast(BCMode, (lj_bc_mode[op]>>7) & 15))
+#define bcmode_d(op)	bcmode_c(op)
+#define bcmode_hasd(op)	((lj_bc_mode[op] & (15<<3)) == (BCMnone<<3))
+#define bcmode_mm(op)	(cast(MMS, lj_bc_mode[op]>>11))
+
+#define BCMODE(name, ma, mb, mc, mm) \
+  (BCM##ma|(BCM##mb<<3)|(BCM##mc<<7)|(MM_##mm<<11)),
+
+LJ_DATA const uint16_t lj_bc_mode[BC__MAX+1];
+
+#endif

+ 44 - 0
src/lj_ctype.c

@@ -0,0 +1,44 @@
+/*
+** Internal CTYPE replacement.
+** Donated to the public domain.
+**
+** This is intended to replace the problematic libc single-byte NLS functions.
+** These just don't make sense anymore with UTF-8 locales becoming the norm
+** on POSIX systems. It never worked too well on Windows systems since hardly
+** anyone bothered to call setlocale().
+**
+** Instead this table is hardcoded for ASCII, except for identifiers. These
+** include the characters 128-255, too. This allows for the use of all
+** non-ASCII chars as identifiers in the lexer. This is a broad definition,
+** but works well in practice for both UTF-8 locales and most single-byte
+** locales (such as ISO-8859-*).
+**
+** If you really need proper ctypes for UTF-8 strings, please use an add-on
+** library such as slnunicode: http://luaforge.net/projects/sln/
+*/
+
+#define lj_ctype_c
+#define LUA_CORE
+
+#include "lj_ctype.h"
+
+/* Entry values are OR-ed LJ_CTYPE_* bits (see lj_ctype.h). The table is
+** indexed with c+1, so entry 0 handles c == -1 (no bits set).
+*/
+LJ_DATADEF const uint8_t lj_ctype_bits[257] = {
+    0,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  3,  3,  3,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+  152,152,152,152,152,152,152,152,152,152,  4,  4,  4,  4,  4,  4,
+    4,176,176,176,176,176,176,160,160,160,160,160,160,160,160,160,
+  160,160,160,160,160,160,160,160,160,160,160,  4,  4,  4,  4,132,
+    4,208,208,208,208,208,208,192,192,192,192,192,192,192,192,192,
+  192,192,192,192,192,192,192,192,192,192,192,  4,  4,  4,  4,  1,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,
+  128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+};
+

+ 40 - 0
src/lj_ctype.h

@@ -0,0 +1,40 @@
+/*
+** Internal CTYPE replacement.
+** Donated to the public domain.
+*/
+
+#ifndef _LJ_CTYPE_H
+#define _LJ_CTYPE_H
+
+#include "lj_def.h"
+
+/* Character class bits for lj_ctype_bits[] entries. */
+#define LJ_CTYPE_CNTRL	0x01
+#define LJ_CTYPE_SPACE	0x02
+#define LJ_CTYPE_PUNCT	0x04
+#define LJ_CTYPE_DIGIT	0x08
+#define LJ_CTYPE_XDIGIT	0x10
+#define LJ_CTYPE_UPPER	0x20
+#define LJ_CTYPE_LOWER	0x40
+#define LJ_CTYPE_IDENT	0x80
+#define LJ_CTYPE_ALPHA	(LJ_CTYPE_LOWER|LJ_CTYPE_UPPER)
+#define LJ_CTYPE_ALNUM	(LJ_CTYPE_ALPHA|LJ_CTYPE_DIGIT)
+
+/* Only pass -1 or 0..255 to these macros. Never pass a signed char!
+** The table is biased by one so that -1 indexes the all-zero entry 0.
+*/
+#define lj_ctype_isa(c, t)	(lj_ctype_bits[(c)+1] & t)
+#define lj_ctype_iscntrl(c)	lj_ctype_isa((c), LJ_CTYPE_CNTRL)
+#define lj_ctype_isspace(c)	lj_ctype_isa((c), LJ_CTYPE_SPACE)
+#define lj_ctype_ispunct(c)	lj_ctype_isa((c), LJ_CTYPE_PUNCT)
+#define lj_ctype_isdigit(c)	lj_ctype_isa((c), LJ_CTYPE_DIGIT)
+#define lj_ctype_isxdigit(c)	lj_ctype_isa((c), LJ_CTYPE_XDIGIT)
+#define lj_ctype_isupper(c)	lj_ctype_isa((c), LJ_CTYPE_UPPER)
+#define lj_ctype_islower(c)	lj_ctype_isa((c), LJ_CTYPE_LOWER)
+#define lj_ctype_isident(c)	lj_ctype_isa((c), LJ_CTYPE_IDENT)
+#define lj_ctype_isalpha(c)	lj_ctype_isa((c), LJ_CTYPE_ALPHA)
+#define lj_ctype_isalnum(c)	lj_ctype_isa((c), LJ_CTYPE_ALNUM)
+
+/* Case conversion exploits the bit values: islower() yields 0x40, so >>1
+** subtracts 0x20 ('a'-'A'); isupper() yields 0x20, which is added directly.
+*/
+#define lj_ctype_toupper(c)	((c) - (lj_ctype_islower(c) >> 1))
+#define lj_ctype_tolower(c)	((c) + lj_ctype_isupper(c))
+
+LJ_DATA const uint8_t lj_ctype_bits[257];
+
+#endif

+ 226 - 0
src/lj_def.h

@@ -0,0 +1,226 @@
+/*
+** LuaJIT common internal definitions.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_DEF_H
+#define _LJ_DEF_H
+
+#include "lua.h"
+
+#ifdef _MSC_VER
+/* MSVC is stuck in the last century and doesn't have C99's stdint.h. */
+typedef __int8 int8_t;
+typedef __int16 int16_t;
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int8 uint8_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#ifdef _WIN64
+typedef __int64 intptr_t;
+typedef unsigned __int64 uintptr_t;
+#else
+typedef __int32 intptr_t;
+typedef unsigned __int32 uintptr_t;
+#endif
+#else
+#include <stdint.h>
+#endif
+
+/* Needed everywhere. */
+#include <string.h>
+#include <stdlib.h>
+
+/* Various VM limits. */
+#define LJ_MAX_MEM	0x7fffff00	/* Max. total memory allocation. */
+#define LJ_MAX_ALLOC	LJ_MAX_MEM	/* Max. individual allocation length. */
+#define LJ_MAX_STR	LJ_MAX_MEM	/* Max. string length. */
+#define LJ_MAX_UDATA	LJ_MAX_MEM	/* Max. userdata length. */
+
+#define LJ_MAX_STRTAB	(1<<26)		/* Max. string table size. */
+#define LJ_MAX_HBITS	26		/* Max. hash bits. */
+#define LJ_MAX_ABITS	28		/* Max. bits of array key. */
+#define LJ_MAX_ASIZE	((1<<(LJ_MAX_ABITS-1))+1)  /* Max. array part size. */
+#define LJ_MAX_COLOSIZE	16		/* Max. elems for colocated array. */
+
+#define LJ_MAX_LINE	LJ_MAX_MEM	/* Max. source code line number. */
+#define LJ_MAX_XLEVEL	200		/* Max. syntactic nesting level. */
+#define LJ_MAX_BCINS	(1<<26)		/* Max. # of bytecode instructions. */
+#define LJ_MAX_SLOTS	250		/* Max. # of slots in a Lua func. */
+#define LJ_MAX_LOCVAR	200		/* Max. # of local variables. */
+#define LJ_MAX_UPVAL	60		/* Max. # of upvalues. */
+
+#define LJ_MAX_IDXCHAIN	100		/* __index/__newindex chain limit. */
+#define LJ_STACK_EXTRA	5		/* Extra stack space (metamethods). */
+
+/* Minimum table/buffer sizes. */
+#define LJ_MIN_GLOBAL	6		/* Min. global table size (hbits). */
+#define LJ_MIN_REGISTRY	2		/* Min. registry size (hbits). */
+#define LJ_MIN_STRTAB	256		/* Min. string table size (pow2). */
+#define LJ_MIN_SBUF	32		/* Min. string buffer length. */
+#define LJ_MIN_VECSZ	8		/* Min. size for growable vectors. */
+#define LJ_MIN_IRSZ	32		/* Min. size for growable IR. */
+#define LJ_MIN_KNUMSZ	16		/* Min. size for chained KNUM array. */
+
+/* JIT compiler limits. */
+#define LJ_MAX_JSLOTS	250		/* Max. # of stack slots for a trace. */
+#define LJ_MAX_PHI	32		/* Max. # of PHIs for a loop. */
+#define LJ_MAX_EXITSTUBGR	8	/* Max. # of exit stub groups. */
+
+/* Various macros. */
+#ifndef UNUSED
+#define UNUSED(x)	((void)(x))	/* to avoid warnings */
+#endif
+
+#ifndef cast
+#define cast(t, exp)	((t)(exp))
+#endif
+
+/* Compose a 64 bit constant from two 32 bit hex halves. */
+#define U64x(hi, lo)	(((uint64_t)0x##hi << 32) + (uint64_t)0x##lo)
+#define cast_byte(i)	cast(uint8_t, (i))
+#define cast_num(i)	cast(lua_Number, (i))
+#define cast_int(i)	cast(int, (i))
+#define i32ptr(p)	((int32_t)(intptr_t)(void *)(p))
+#define u32ptr(p)	((uint32_t)(intptr_t)(void *)(p))
+
+/* Check whether a 32 bit value fits into a narrower immediate. */
+#define checki8(x)	((x) == (int32_t)(int8_t)(x))
+#define checku8(x)	((x) == (int32_t)(uint8_t)(x))
+#define checki16(x)	((x) == (int32_t)(int16_t)(x))
+
+/* Every half-decent C compiler transforms this into a rotate instruction. */
+#define lj_rol(x, n)	(((x)<<(n)) | ((x)>>(32-(n))))
+#define lj_ror(x, n)	(((x)<<(32-(n))) | ((x)>>(n)))
+
+/* A really naive Bloom filter. But sufficient for our needs. */
+typedef uintptr_t BloomFilter;
+#define BLOOM_MASK	(8*sizeof(BloomFilter) - 1)
+#define bloombit(x)	((uintptr_t)1 << ((x) & BLOOM_MASK))
+#define bloomset(b, x)	((b) |= bloombit((x)))
+#define bloomtest(b, x)	((b) & bloombit((x)))
+
+#if defined(__GNUC__)
+
+#if (__GNUC__ < 3) || ((__GNUC__ == 3) && __GNUC_MINOR__ < 4)
+#error "sorry, need GCC 3.4 or newer"
+#endif
+
+#define LJ_NORET	__attribute__((noreturn))
+#define LJ_ALIGN(n)	__attribute__((aligned(n)))
+#define LJ_INLINE	inline
+#define LJ_AINLINE	inline __attribute__((always_inline))
+#define LJ_NOINLINE	__attribute__((noinline))
+
+#if defined(__ELF__) || defined(__MACH__)
+#define LJ_NOAPI	extern __attribute__((visibility("hidden")))
+#endif
+
+/* Note: it's only beneficial to use fastcall on x86 and then only for up to
+** two non-FP args. The amalgamated compile covers all LJ_FUNC cases. Only
+** indirect calls and related tail-called C functions are marked as fastcall.
+*/
+#if defined(__i386__)
+#define LJ_FASTCALL	__attribute__((fastcall))
+#endif
+
+#define LJ_LIKELY(x)	__builtin_expect(!!(x), 1)
+#define LJ_UNLIKELY(x)	__builtin_expect(!!(x), 0)
+
+#define lj_ffs(x)	((uint32_t)__builtin_ctz(x))
+/* Don't ask ... */
+#if defined(__INTEL_COMPILER) && (defined(__i386__) || defined(__x86_64__))
+static LJ_AINLINE uint32_t lj_fls(uint32_t x)
+{
+  uint32_t r; __asm__("bsrl %1, %0" : "=r" (r) : "rm" (x) : "cc"); return r;
+}
+#else
+/* clz(x)^31 == 31-clz(x) == index of the highest set bit. */
+#define lj_fls(x)	((uint32_t)(__builtin_clz(x)^31))
+#endif
+
+#if defined(__i386__) || defined(__x86_64__)
+static LJ_AINLINE uint32_t lj_bswap(uint32_t x)
+{
+  uint32_t r; __asm__("bswap %0" : "=r" (r) : "0" (x)); return r;
+}
+#else
+#error "missing define for lj_bswap()"
+#endif
+
+#elif defined(_MSC_VER)
+
+#define LJ_NORET	__declspec(noreturn)
+#define LJ_ALIGN(n)	__declspec(align(n))
+#define LJ_INLINE	__inline
+#define LJ_AINLINE	__forceinline
+#define LJ_NOINLINE	__declspec(noinline)
+#if defined(_M_IX86)
+#define LJ_FASTCALL	__fastcall
+#endif
+
+static LJ_AINLINE uint32_t lj_ffs(uint32_t x)
+{
+  uint32_t r; _BitScanForward(&r, x); return r;
+}
+
+static LJ_AINLINE uint32_t lj_fls(uint32_t x)
+{
+  uint32_t r; _BitScanReverse(&r, x); return r;
+}
+
+#define lj_bswap(x)	(_byteswap_ulong((x)))
+
+#else
+#error "missing defines for your compiler"
+#endif
+
+/* Optional defines. */
+#ifndef LJ_FASTCALL
+#define LJ_FASTCALL
+#endif
+#ifndef LJ_NORET
+#define LJ_NORET
+#endif
+#ifndef LJ_NOAPI
+#define LJ_NOAPI	extern
+#endif
+#ifndef LJ_LIKELY
+#define LJ_LIKELY(x)	(x)
+#define LJ_UNLIKELY(x)	(x)
+#endif
+
+/* Attributes for internal functions. */
+#if defined(ljamalg_c)
+#define LJ_DATA		static
+#define LJ_DATADEF	static
+#define LJ_FUNC		static
+#define LJ_ASMF		LJ_NOAPI
+#define LJ_FUNCA	LJ_NOAPI
+#else
+#define LJ_DATA		LJ_NOAPI
+#define LJ_DATADEF
+#define LJ_FUNC		LJ_NOAPI
+#define LJ_ASMF		LJ_NOAPI
+#define LJ_FUNCA	LJ_NOAPI
+#endif
+#define LJ_FUNC_NORET	LJ_FUNC LJ_NORET
+#define LJ_FUNCA_NORET	LJ_FUNCA LJ_NORET
+#define LJ_ASMF_NORET	LJ_ASMF LJ_NORET
+
+/* Runtime assertions. */
+#ifdef lua_assert
+#define check_exp(c, e)		(lua_assert(c), (e))
+#define api_check(l, e)		lua_assert(e)
+#else
+#define lua_assert(c)		((void)0)
+#define check_exp(c, e)		(e)
+#define api_check		luai_apicheck
+#endif
+
+/* Static assertions: a negative array size fails compilation if !cond. */
+#define LJ_ASSERT_NAME2(name, line)	name ## line
+#define LJ_ASSERT_NAME(line)		LJ_ASSERT_NAME2(lj_assert_, line)
+#define LJ_STATIC_ASSERT(cond) \
+  extern void LJ_ASSERT_NAME(__LINE__)(int STATIC_ASSERTION_FAILED[(cond)?1:-1])
+
+#endif

+ 284 - 0
src/lj_dispatch.c

@@ -0,0 +1,284 @@
+/*
+** Instruction dispatch handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_dispatch_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_bc.h"
+#if LJ_HASJIT
+#include "lj_jit.h"
+#endif
+#include "lj_trace.h"
+#include "lj_dispatch.h"
+#include "lj_vm.h"
+#include "luajit.h"
+
+/* -- Dispatch table management ------------------------------------------- */
+
+/* Initialize instruction dispatch table and hot counters. */
+/* Fills both halves of GG->dispatch: the dynamic table at [0..BC__MAX) and
+** the static reference copy at [GG_DISP_STATIC..). All entries point at the
+** assembler VM code for the corresponding bytecode (via lj_vm_op_ofs).
+*/
+void lj_dispatch_init(GG_State *GG)
+{
+  uint32_t i;
+  ASMFunction *disp = GG->dispatch;
+  for (i = 0; i < BC__MAX; i++)
+    disp[GG_DISP_STATIC+i] = disp[i] = makeasmfunc(lj_vm_op_ofs[i]);
+  /* The JIT engine is off by default. luaopen_jit() turns it on. */
+  /* Start with the non-hotcounting I-variants of the loop instructions. */
+  disp[BC_FORL] = disp[BC_IFORL];
+  disp[BC_ITERL] = disp[BC_IITERL];
+  disp[BC_LOOP] = disp[BC_ILOOP];
+}
+
+/* Update dispatch table depending on various flags. */
+/* Mode bits: 1 = JIT enabled, 2 = hooks enabled, 6 = trace recording
+** (recording implies the hook-checking dispatch). No-op if unchanged.
+*/
+void lj_dispatch_update(global_State *g)
+{
+  uint8_t oldmode = g->dispatchmode;
+  uint8_t mode = 0;
+#if LJ_HASJIT
+  mode |= (G2J(g)->flags & JIT_F_ON) ? 1 : 0;
+  mode |= G2J(g)->state != LJ_TRACE_IDLE ? 6 : 0;
+#endif
+  mode |= (g->hookmask & HOOK_EVENTMASK) ? 2 : 0;
+  if (oldmode != mode) {  /* Mode changed? */
+    ASMFunction *disp = G2GG(g)->dispatch;
+    ASMFunction f_forl, f_iterl, f_loop;
+    g->dispatchmode = mode;
+    if ((mode & 5) == 1) {  /* Hotcount if JIT is on, but not when recording. */
+      f_forl = makeasmfunc(lj_vm_op_ofs[BC_FORL]);
+      f_iterl = makeasmfunc(lj_vm_op_ofs[BC_ITERL]);
+      f_loop = makeasmfunc(lj_vm_op_ofs[BC_LOOP]);
+    } else {  /* Otherwise use the non-hotcounting instructions. */
+      f_forl = disp[GG_DISP_STATIC+BC_IFORL];
+      f_iterl = disp[GG_DISP_STATIC+BC_IITERL];
+      f_loop = disp[GG_DISP_STATIC+BC_ILOOP];
+    }
+    /* Set static loop ins first (may be copied below). */
+    disp[GG_DISP_STATIC+BC_FORL] = f_forl;
+    disp[GG_DISP_STATIC+BC_ITERL] = f_iterl;
+    disp[GG_DISP_STATIC+BC_LOOP] = f_loop;
+    if ((oldmode & 6) != (mode & 6)) {  /* Need to change whole table? */
+      if ((mode & 6) == 0) {  /* No hooks and no recording? */
+	/* Copy static dispatch table to dynamic dispatch table. */
+	memcpy(&disp[0], &disp[GG_DISP_STATIC], sizeof(ASMFunction)*BC__MAX);
+      } else {
+	/* The recording dispatch also checks for hooks. */
+	ASMFunction f = (mode & 6) == 6 ? lj_vm_record : lj_vm_hook;
+	uint32_t i;
+	for (i = 0; i < BC__MAX; i++)
+	  disp[i] = f;
+      }
+    } else if ((mode & 6) == 0) {  /* Fix dynamic loop ins unless overriden. */
+      disp[BC_FORL] = f_forl;
+      disp[BC_ITERL] = f_iterl;
+      disp[BC_LOOP] = f_loop;
+    }
+  }
+}
+
+/* -- JIT mode setting ---------------------------------------------------- */
+
+#if LJ_HASJIT
+/* Set JIT mode for a single prototype. */
+static void setptmode(global_State *g, GCproto *pt, int mode)
+{
+  if ((mode & LUAJIT_MODE_ON)) {  /* (Re-)enable JIT compilation. */
+    pt->flags &= ~PROTO_NO_JIT;
+    lj_trace_reenableproto(pt);  /* Unpatch all ILOOP etc. bytecodes. */
+  } else {  /* Flush and/or disable JIT compilation. */
+    if (!(mode & LUAJIT_MODE_FLUSH))
+      pt->flags |= PROTO_NO_JIT;
+    lj_trace_flushproto(g, pt);  /* Flush all traces of prototype. */
+  }
+}
+
+/* Recursively set the JIT mode for all children of a prototype. */
+/* Child prototypes are stored as GC constants at negative indices of the
+** prototype's constant array (k.gc), hence the backwards iteration.
+*/
+static void setptmode_all(global_State *g, GCproto *pt, int mode)
+{
+  ptrdiff_t i;
+  for (i = -(ptrdiff_t)pt->sizekgc; i < 0; i++) {
+    GCobj *o = gcref(pt->k.gc[i]);
+    if (o->gch.gct == ~LJ_TPROTO) {
+      setptmode(g, gco2pt(o), mode);
+      setptmode_all(g, gco2pt(o), mode);
+    }
+  }
+}
+#endif
+
+/* Public API function: control the JIT engine. */
+/* Returns 1 on success, 0 on failure. idx selects the target for the
+** per-function modes: 0 = calling function, >0 = stack index, <0 = index
+** from the top of the stack. For LUAJIT_MODE_TRACE, idx is the trace number.
+*/
+int luaJIT_setmode(lua_State *L, int idx, int mode)
+{
+  global_State *g = G(L);
+  int mm = mode & LUAJIT_MODE_MASK;
+  lj_trace_abort(g);  /* Abort recording on any state change. */
+  /* Avoid pulling the rug from under our own feet. */
+  if ((g->hookmask & HOOK_GC))
+    lj_err_caller(L, LJ_ERR_NOGCMM);
+  switch (mm) {
+#if LJ_HASJIT
+  case LUAJIT_MODE_ENGINE:
+    if ((mode & LUAJIT_MODE_FLUSH)) {
+      lj_trace_flushall(L);
+    } else {
+      if ((mode & LUAJIT_MODE_ON))
+	G2J(g)->flags |= (uint32_t)JIT_F_ON;
+      else
+	G2J(g)->flags &= ~(uint32_t)JIT_F_ON;
+      lj_dispatch_update(g);
+    }
+    break;
+  case LUAJIT_MODE_FUNC:
+  case LUAJIT_MODE_ALLFUNC:
+  case LUAJIT_MODE_ALLSUBFUNC: {
+    cTValue *tv = idx == 0 ? frame_prev(L->base-1) :
+		  idx > 0 ? L->base + (idx-1) : L->top + idx;
+    GCproto *pt;
+    if ((idx == 0 || tvisfunc(tv)) && isluafunc(&gcval(tv)->fn))
+      pt = funcproto(&gcval(tv)->fn);  /* Cannot use funcV() for frame slot. */
+    else if (tvisproto(tv))
+      pt = protoV(tv);
+    else
+      return 0;  /* Failed. */
+    if (mm != LUAJIT_MODE_ALLSUBFUNC)
+      setptmode(g, pt, mode);
+    if (mm != LUAJIT_MODE_FUNC)
+      setptmode_all(g, pt, mode);
+    break;
+    }
+  case LUAJIT_MODE_TRACE:
+    if (!(mode & LUAJIT_MODE_FLUSH))
+      return 0;  /* Failed. */
+    lj_trace_flush(G2J(g), idx);
+    break;
+#else
+  /* Without the JIT only "turn it off" requests can succeed. */
+  case LUAJIT_MODE_ENGINE:
+  case LUAJIT_MODE_FUNC:
+  case LUAJIT_MODE_ALLFUNC:
+  case LUAJIT_MODE_ALLSUBFUNC:
+    UNUSED(idx);
+    if ((mode & LUAJIT_MODE_ON))
+      return 0;  /* Failed. */
+    break;
+#endif
+  default:
+    return 0;  /* Failed. */
+  }
+  return 1;  /* OK. */
+}
+
+/* Enforce (dynamic) linker error for version mismatches. See luajit.c. */
+/* Intentionally empty: only the versioned symbol name matters. */
+LUA_API void LUAJIT_VERSION_SYM(void)
+{
+}
+
+/* -- Hooks --------------------------------------------------------------- */
+
+/* This function can be called asynchronously (e.g. during a signal). */
+/* Install or clear the debug hook. mask is filtered to the supported
+** events; a NULL func or empty mask clears both for consistency.
+*/
+LUA_API int lua_sethook(lua_State *L, lua_Hook func, int mask, int count)
+{
+  global_State *g = G(L);
+  mask &= HOOK_EVENTMASK;
+  if (func == NULL || mask == 0) { mask = 0; func = NULL; }  /* Consistency. */
+  g->hookf = func;
+  g->hookcount = g->hookcstart = (int32_t)count;
+  g->hookmask = (uint8_t)((g->hookmask & ~HOOK_EVENTMASK) | mask);
+  lj_trace_abort(g);  /* Abort recording on any hook change. */
+  lj_dispatch_update(g);
+  return 1;
+}
+
+LUA_API lua_Hook lua_gethook(lua_State *L)
+{
+  return G(L)->hookf;
+}
+
+LUA_API int lua_gethookmask(lua_State *L)
+{
+  return G(L)->hookmask & HOOK_EVENTMASK;
+}
+
+/* Returns the count passed to lua_sethook, not the current countdown. */
+LUA_API int lua_gethookcount(lua_State *L)
+{
+  return (int)G(L)->hookcstart;
+}
+
+/* Call a hook. */
+/* Skipped while another hook is active (hook_active) to prevent
+** re-entrant hook invocation.
+*/
+static void callhook(lua_State *L, int event, BCLine line)
+{
+  global_State *g = G(L);
+  lua_Hook hookf = g->hookf;
+  if (hookf && !hook_active(g)) {
+    lua_Debug ar;
+    lj_trace_abort(g);  /* Abort recording on any hook call. */
+    ar.event = event;
+    ar.currentline = line;
+    ar.i_ci = cast_int((L->base-1) - L->stack); /* Top frame, nextframe=NULL. */
+    lj_state_checkstack(L, 1+LUA_MINSTACK);
+    hook_enter(g);
+    hookf(L, &ar);
+    lua_assert(hook_active(g));
+    hook_leave(g);
+  }
+}
+
+/* -- Instruction dispatch callbacks -------------------------------------- */
+
+/* Calculate number of used stack slots in the current frame. */
+/* Inspects the instruction *preceding* pc: multi-result calls/returns
+** (CALLM/RETM/TSETM) extend the frame by nres extra results; otherwise
+** the prototype's fixed framesize applies. BC_UCLO is followed through
+** its jump target to find the real preceding instruction.
+*/
+static BCReg cur_topslot(GCproto *pt, const BCIns *pc, uint32_t nres)
+{
+  BCIns ins = pc[-1];
+  for (;;) {
+    switch (bc_op(ins)) {
+    case BC_UCLO: ins = pc[bc_j(ins)]; break;
+    case BC_CALLM:
+    case BC_CALLMT: return bc_a(ins) + bc_c(ins) + nres-1+1;
+    case BC_RETM: return bc_a(ins) + bc_d(ins) + nres-1;
+    case BC_TSETM: return bc_a(ins) + nres-1;
+    default: return pt->framesize;
+    }
+  }
+}
+
+/* Instruction dispatch callback for instr/line hooks or when recording. */
+/* Called from the VM before each instruction while the slow dispatch is
+** active. Feeds the recorder (if recording), then fires count and line
+** hooks as configured.
+*/
+void lj_dispatch_ins(lua_State *L, const BCIns *pc, uint32_t nres)
+{
+  GCfunc *fn = curr_func(L);
+  GCproto *pt = funcproto(fn);
+  BCReg slots = cur_topslot(pt, pc, nres);
+  global_State *g = G(L);
+  const BCIns *oldpc = cframe_Lpc(L);
+  cframe_Lpc(L) = pc;
+  L->top = L->base + slots;  /* Fix top. */
+#if LJ_HASJIT
+  {
+    jit_State *J = G2J(g);
+    if (J->state != LJ_TRACE_IDLE) {
+      J->L = L;
+      J->pc = pc-1;
+      J->fn = fn;
+      J->pt = pt;
+      lj_trace_ins(J);
+    }
+  }
+#endif
+  if ((g->hookmask & LUA_MASKCOUNT) && g->hookcount == 0) {
+    g->hookcount = g->hookcstart;
+    callhook(L, LUA_HOOKCOUNT, -1);
+  }
+  if ((g->hookmask & LUA_MASKLINE) && pt->lineinfo) {
+    BCPos npc = (BCPos)(pc - pt->bc)-1;
+    BCPos opc = (BCPos)(oldpc - pt->bc)-1;
+    BCLine line = pt->lineinfo[npc];
+    /* Fire the line hook on function entry, backward jumps, or a line
+    ** change. NOTE(review): oldpc may belong to a different prototype;
+    ** the opc >= pt->sizebc test (unsigned wraparound) guards against
+    ** indexing lineinfo with it.
+    */
+    if (npc == 0 || pc <= oldpc ||
+	opc >= pt->sizebc || line != pt->lineinfo[opc]) {
+      L->top = L->base + slots;  /* Fix top again after instruction hook. */
+      callhook(L, LUA_HOOKLINE, line);
+    }
+  }
+}
+

+ 64 - 0
src/lj_dispatch.h

@@ -0,0 +1,64 @@
+/*
+** Instruction dispatch handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_DISPATCH_H
+#define _LJ_DISPATCH_H
+
+#include "lj_obj.h"
+#include "lj_bc.h"
+#if LJ_HASJIT
+#include "lj_jit.h"
+#endif
+
+/* Type of hot counter. Must match the code in the assembler VM. */
+/* 16 bits are sufficient. Only 0.0015% overhead with maximum slot penalty. */
+typedef uint16_t HotCount;
+
+/* Number of hot counter hash table entries (must be a power of two). */
+#define HOTCOUNT_SIZE		64
+#define HOTCOUNT_PCMASK		((HOTCOUNT_SIZE-1)*sizeof(HotCount))
+#define HOTCOUNT_MIN_PENALTY	103
+#define HOTCOUNT_MAX_PENALTY	60000
+
+/* Global state, main thread and extra fields are allocated together. */
+/* The dispatch array holds two tables back-to-back: the dynamic table at
+** [0..BC__MAX) and the static reference copy at [BC__MAX..2*BC__MAX).
+*/
+typedef struct GG_State {
+  lua_State L;				/* Main thread. */
+  global_State g;			/* Global state. */
+#if LJ_HASJIT
+  jit_State J;				/* JIT state. */
+  HotCount hotcount[HOTCOUNT_SIZE];	/* Hot counters. */
+#endif
+  ASMFunction dispatch[2*BC__MAX];	/* Instruction dispatch tables. */
+} GG_State;
+
+#define GG_DISP_STATIC	BC__MAX
+
+/* Navigate between the combined GG_State and its embedded sub-states by
+** fixed field offsets (classic offsetof-from-null idiom). The GG_DISP2*
+** constants are relative offsets used by the assembler VM.
+*/
+#define GG_OFS(field)	((int)offsetof(GG_State, field))
+#define G2GG(gl) \
+  ((GG_State *)(((char *)(gl))-((char *)(&((GG_State *)0)->g))))
+#define J2GG(j) \
+  ((GG_State *)(((char *)(j))-((char *)(&((GG_State *)0)->J))))
+#define L2GG(L)		G2GG(G(L))
+#define J2G(J)		(&J2GG(J)->g)
+#define G2J(gl)		(&G2GG(gl)->J)
+#define L2J(L)		(&L2GG(L)->J)
+#define GG_G2DISP	(GG_OFS(dispatch) - GG_OFS(g))
+#define GG_DISP2G	(GG_OFS(g) - GG_OFS(dispatch))
+#define GG_DISP2J	(GG_OFS(J) - GG_OFS(dispatch))
+#define GG_DISP2HOT	(GG_OFS(hotcount) - GG_OFS(dispatch))
+
+/* Hash the (4-byte aligned) PC into the hot counter table. */
+#define hotcount_get(gg, pc) \
+  (gg)->hotcount[(u32ptr(pc)>>2) & (HOTCOUNT_SIZE-1)]
+#define hotcount_set(gg, pc, val) \
+  (hotcount_get((gg), (pc)) = (HotCount)(val))
+
+/* Dispatch table management. */
+LJ_FUNC void lj_dispatch_init(GG_State *GG);
+LJ_FUNC void lj_dispatch_update(global_State *g);
+
+/* Instruction dispatch callback for instr/line hooks or when recording. */
+LJ_FUNCA void lj_dispatch_ins(lua_State *L, const BCIns *pc, uint32_t nres);
+
+#endif

+ 763 - 0
src/lj_err.c

@@ -0,0 +1,763 @@
+/*
+** Error handling and debugging API.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_err_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_func.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_bc.h"
+#include "lj_trace.h"
+#include "lj_vm.h"
+
+/* -- Error messages ------------------------------------------------------ */
+
+/* Error message strings. */
+/* All messages are concatenated into one \0-separated string. The ErrMsg
+** enum values (see lj_err.h) are byte offsets into this string, so
+** err2msg() is just pointer arithmetic.
+*/
+static const char *lj_err_allmsg =
+#define ERRDEF(name, msg)	msg "\0"
+#include "lj_errmsg.h"
+;
+
+#define err2msg(em)	(lj_err_allmsg+(int)(em))
+
+/* -- Frame and function introspection ------------------------------------ */
+
+/* Derive the current bytecode position of fn in the frame below nextframe.
+** Returns ~0 for non-Lua functions (no PC can be derived). nextframe==NULL
+** means fn is on top, which only happens during error/hook handling where
+** the PC was saved in the cframe.
+*/
+static BCPos currentpc(lua_State *L, GCfunc *fn, cTValue *nextframe)
+{
+  const BCIns *ins;
+  lua_assert(fn->c.gct == ~LJ_TFUNC || fn->c.gct == ~LJ_TTHREAD);
+  if (!isluafunc(fn)) {  /* Cannot derive a PC for non-Lua functions. */
+    return ~(BCPos)0;
+  } else if (nextframe == NULL) {  /* Lua function on top. */
+    ins = cframe_Lpc(L);  /* Only happens during error/hook handling. */
+  } else {
+    if (frame_islua(nextframe)) {
+      ins = frame_pc(nextframe);
+    } else if (frame_iscont(nextframe)) {
+      ins = frame_contpc(nextframe);
+    } else {
+      /* Lua function below errfunc/gc/hook: find cframe to get the PC. */
+      void *cf = cframe_raw(L->cframe);
+      TValue *f = L->base-1;
+      while (f > nextframe) {
+	if (frame_islua(f)) {
+	  f = frame_prevl(f);
+	} else {
+	  if (frame_isc(f))
+	    cf = cframe_raw(cframe_prev(cf));
+	  f = frame_prevd(f);
+	}
+      }
+      if (cframe_prev(cf))
+	cf = cframe_raw(cframe_prev(cf));
+      ins = cframe_pc(cf);
+    }
+  }
+  return (BCPos)((ins - funcproto(fn)->bc) - 1);
+}
+
+/* Current source line of fn, or -1 if no PC can be derived, or 0 if the
+** prototype was compiled without line info.
+*/
+static BCLine currentline(lua_State *L, GCfunc *fn, cTValue *nextframe)
+{
+  BCPos pc = currentpc(L, fn, nextframe);
+  if (pc != ~(BCPos)0) {
+    GCproto *pt = funcproto(fn);
+    lua_assert(pc < pt->sizebc);
+    return pt->lineinfo ? pt->lineinfo[pc] : 0;
+  } else {
+    return -1;
+  }
+}
+
+/* Look up the name of the local variable in the given slot at position pc.
+** Scans the (startpc-ordered) variable info; slot is decremented for each
+** variable live at pc until the requested one is reached. NULL if unknown.
+*/
+static const char *getvarname(const GCproto *pt, BCPos pc, BCReg slot)
+{
+  MSize i;
+  for (i = 0; i < pt->sizevarinfo && pt->varinfo[i].startpc <= pc; i++)
+    if (pc < pt->varinfo[i].endpc && slot-- == 0)
+      return strdata(pt->varinfo[i].name);
+  return NULL;
+}
+
+/* Derive the name and kind ("local", "global", "field", "method" or
+** "upvalue") of the object in the given slot by scanning backwards for the
+** instruction that stored into it. Sets *name and returns the kind string,
+** or returns NULL if the origin cannot be determined.
+*/
+static const char *getobjname(GCproto *pt, const BCIns *ip, BCReg slot,
+			      const char **name)
+{
+  const char *lname;
+restart:
+  lname = getvarname(pt, (BCPos)(ip - pt->bc), slot);
+  if (lname != NULL) { *name = lname; return "local"; }
+  while (--ip >= pt->bc) {
+    BCIns ins = *ip;
+    BCOp op = bc_op(ins);
+    BCReg ra = bc_a(ins);
+    if (bcmode_a(op) == BCMbase) {
+      if (slot >= ra && (op != BC_KNIL || slot <= bc_d(ins)))
+	return NULL;
+    } else if (bcmode_a(op) == BCMdst && ra == slot) {
+      switch (bc_op(ins)) {
+      case BC_MOV:
+	/* ra == slot is guaranteed by the enclosing condition, so follow
+	** the move source unconditionally (redundant re-check removed).
+	*/
+	slot = bc_d(ins); goto restart;
+      case BC_GGET:
+	*name = strdata(gco2str(gcref(pt->k.gc[~bc_d(ins)])));
+	return "global";
+      case BC_TGETS:
+	*name = strdata(gco2str(gcref(pt->k.gc[~bc_c(ins)])));
+	if (ip > pt->bc) {
+	  BCIns insp = ip[-1];
+	  /* obj:method() compiles to a MOV of the object just before TGETS. */
+	  if (bc_op(insp) == BC_MOV && bc_a(insp) == ra+1 &&
+	      bc_d(insp) == bc_b(ins))
+	    return "method";
+	}
+	return "field";
+      case BC_UGET:
+	*name = pt->uvname ? strdata(pt->uvname[bc_d(ins)]) : "?";
+	return "upvalue";
+      default:
+	return NULL;
+      }
+    }
+  }
+  return NULL;
+}
+
+/* Derive the name and kind of the function called by the given frame, by
+** looking at the call instruction in the parent frame. Returns the kind
+** string ("method", "metamethod", etc.) or NULL if unknown.
+*/
+static const char *getfuncname(lua_State *L, TValue *frame, const char **name)
+{
+  MMS mm;
+  const BCIns *ip;
+  TValue *pframe;
+  GCfunc *fn;
+  BCPos pc;
+  if (frame_isvarg(frame))
+    frame = frame_prevd(frame);  /* Skip vararg pseudo-frame. */
+  pframe = frame_prev(frame);
+  fn = frame_func(pframe);
+  pc = currentpc(L, fn, frame);
+  if (pc == ~(BCPos)0)
+    return NULL;
+  lua_assert(pc < funcproto(fn)->sizebc);
+  ip = &funcproto(fn)->bc[pc];
+  mm = bcmode_mm(bc_op(*ip));
+  if (mm == MM_call) {
+    BCReg slot = bc_a(*ip);
+    /* ITERC calls the iterator copied 3 slots up; adjust back. */
+    if (bc_op(*ip) == BC_ITERC) slot -= 3;
+    return getobjname(funcproto(fn), ip, slot, name);
+  } else if (mm != MM_MAX) {
+    *name = strdata(strref(G(L)->mmname[mm]));
+    return "metamethod";
+  } else {
+    return NULL;
+  }
+}
+
+/* Push a "chunkname:line" location string for the given prototype/pc onto
+** the Lua stack. '@' chunknames are reduced to their basename; '='
+** chunknames are used verbatim; plain sources are quoted (or replaced by
+** the prototype address if too long).
+*/
+void lj_err_pushloc(lua_State *L, GCproto *pt, BCPos pc)
+{
+  GCstr *name = pt->chunkname;
+  if (name) {
+    const char *s = strdata(name);
+    MSize i, len = name->len;
+    BCLine line;
+    if (pc)
+      line = pt->lineinfo ? pt->lineinfo[pc-1] : 0;
+    else
+      line = pt->linedefined;
+    if (*s == '@') {
+      s++; len--;
+      /* Find last path separator; s[len] is the NUL of the GCstr data. */
+      for (i = len; i > 0; i--)
+	if (s[i] == '/' || s[i] == '\\') {
+	  s += i+1;
+	  break;
+	}
+      lj_str_pushf(L, "%s:%d", s, line);
+    } else if (len > 40) {
+      lj_str_pushf(L, "%p:%d", pt, line);
+    } else if (*s == '=') {
+      lj_str_pushf(L, "%s:%d", s+1, line);
+    } else {
+      lj_str_pushf(L, "\"%s\":%d", s, line);
+    }
+  } else {
+    lj_str_pushf(L, "%p:%u", pt, pc);
+  }
+}
+
+/* Produce a bounded, display-friendly chunk id in out[LUA_IDSIZE], in the
+** style of the reference Lua luaO_chunkid().
+*/
+static void err_chunkid(char *out, const char *src)
+{
+  if (*src == '=') {
+    strncpy(out, src+1, LUA_IDSIZE);  /* remove first char */
+    out[LUA_IDSIZE-1] = '\0';  /* ensures null termination */
+  } else if (*src == '@') { /* out = "source", or "...source" */
+    size_t l = strlen(++src);  /* skip the `@' */
+    if (l >= LUA_IDSIZE) {
+      src += l-(LUA_IDSIZE-4);  /* get last part of file name */
+      strcpy(out, "...");
+      out += 3;
+    }
+    strcpy(out, src);
+  } else {  /* out = [string "string"] */
+    size_t len; /* Length, up to first control char. */
+    for (len = 0; len < LUA_IDSIZE-11; len++)
+      if (((const unsigned char *)src)[len] < ' ') break;
+    strcpy(out, "[string \""); out += 9;
+    if (src[len] != '\0') {  /* must truncate? */
+      if (len > LUA_IDSIZE-15) len = LUA_IDSIZE-15;
+      strncpy(out, src, len); out += len;
+      strcpy(out, "..."); out += 3;
+    } else {
+      strcpy(out, src); out += len;
+    }
+    strcpy(out, "\"]");
+  }
+}
+
+/* -- Public debug API ---------------------------------------------------- */
+
+/* Locate local variable number `slot` of the frame encoded in ar->i_ci.
+** i_ci encoding: low 16 bits = frame offset from L->stack, high 16 bits =
+** frame size (0 if topmost). Sets *name (NULL if not found) and returns
+** a pointer to the stack slot.
+*/
+static TValue *findlocal(lua_State *L, const lua_Debug *ar,
+			 const char **name, BCReg slot)
+{
+  uint32_t offset = (uint32_t)ar->i_ci & 0xffff;
+  uint32_t size = (uint32_t)ar->i_ci >> 16;
+  TValue *frame = L->stack + offset;
+  TValue *nextframe = size ? frame + size : NULL;
+  GCfunc *fn = frame_func(frame);
+  BCPos pc = currentpc(L, fn, nextframe);
+  if (pc != ~(BCPos)0 &&
+      (*name = getvarname(funcproto(fn), pc, slot-1)) != NULL)
+    ;  /* Found a named local: *name already set. */
+  else if (slot > 0 && frame + slot < (nextframe ? nextframe : L->top))
+    *name = "(*temporary)";
+  else
+    *name = NULL;
+  return frame+slot;
+}
+
+/* Push the value of local n of the frame in ar; returns its name or NULL. */
+LUA_API const char *lua_getlocal(lua_State *L, const lua_Debug *ar, int n)
+{
+  const char *name;
+  TValue *o = findlocal(L, ar, &name, (BCReg)n);
+  if (name) {
+    copyTV(L, L->top, o);
+    incr_top(L);
+  }
+  return name;
+}
+
+
+/* Assign the value on top of the stack to local n; the value is always
+** popped, even if the local was not found.
+*/
+LUA_API const char *lua_setlocal(lua_State *L, const lua_Debug *ar, int n)
+{
+  const char *name;
+  TValue *o = findlocal(L, ar, &name, (BCReg)n);
+  if (name)
+    copyTV(L, o, L->top-1);
+  L->top--;
+  return name;
+}
+
+/* Fill in lua_Debug fields selected by the option string `what`:
+** '>' = pop function from stack instead of using ar->i_ci, 'S' = source,
+** 'l' = currentline, 'u' = nups, 'n' = name/namewhat, 'f' = push function,
+** 'L' = push table of active lines. Returns 0 on a bad option character.
+*/
+LUA_API int lua_getinfo(lua_State *L, const char *what, lua_Debug *ar)
+{
+  int status = 1;
+  TValue *frame = NULL;
+  TValue *nextframe = NULL;
+  GCfunc *fn;
+  if (*what == '>') {
+    TValue *func = L->top - 1;
+    api_check(L, tvisfunc(func));
+    fn = funcV(func);
+    L->top--;
+    what++;
+  } else {
+    uint32_t offset = (uint32_t)ar->i_ci & 0xffff;
+    uint32_t size = (uint32_t)ar->i_ci >> 16;
+    lua_assert(offset != 0);
+    frame = L->stack + offset;
+    if (size) nextframe = frame + size;
+    lua_assert(frame<=L->maxstack && (!nextframe || nextframe<=L->maxstack));
+    fn = frame_func(frame);
+    lua_assert(fn->c.gct == ~LJ_TFUNC);
+  }
+  for (; *what; what++) {
+    switch (*what) {
+    case 'S':
+      if (isluafunc(fn)) {
+	ar->source = strdata(funcproto(fn)->chunkname);
+	ar->linedefined = cast_int(funcproto(fn)->linedefined);
+	ar->lastlinedefined = cast_int(funcproto(fn)->lastlinedefined);
+	ar->what = (ar->linedefined == 0) ? "main" : "Lua";
+      } else {
+	ar->source = "=[C]";
+	ar->linedefined = -1;
+	ar->lastlinedefined = -1;
+	ar->what = "C";
+      }
+      err_chunkid(ar->short_src, ar->source);
+      break;
+    case 'l':
+      ar->currentline = frame ? currentline(L, fn, nextframe) : -1;
+      break;
+    case 'u':
+      ar->nups = fn->c.nupvalues;
+      break;
+    case 'n':
+      ar->namewhat = frame ? getfuncname(L, frame, &ar->name) : NULL;
+      if (ar->namewhat == NULL) {
+	ar->namewhat = "";
+	ar->name = NULL;
+      }
+      break;
+    case 'f':
+      setfuncV(L, L->top, fn);
+      incr_top(L);
+      break;
+    case 'L':
+      if (isluafunc(fn)) {
+	GCtab *t = lj_tab_new(L, 0, 0);
+	BCLine *lineinfo = funcproto(fn)->lineinfo;
+	uint32_t i, szl = funcproto(fn)->sizelineinfo;
+	for (i = 0; i < szl; i++)
+	  setboolV(lj_tab_setint(L, t, lineinfo[i]), 1);
+	settabV(L, L->top, t);
+      } else {
+	setnilV(L->top);
+      }
+      incr_top(L);
+      break;
+    default:
+      status = 0;  /* Bad option. */
+      break;
+    }
+  }
+  return status;
+}
+
+/* Find the stack frame at the given level (0 = current). On success,
+** *size receives the frame size and the frame is returned. On failure,
+** returns NULL and *size holds the number of levels that were missing.
+*/
+cTValue *lj_err_getframe(lua_State *L, int level, int *size)
+{
+  cTValue *frame, *nextframe;
+  /* Traverse frames backwards. */
+  for (nextframe = frame = L->base-1; frame > L->stack; ) {
+    if (frame_gc(frame) == obj2gco(L))
+      level++;  /* Skip dummy frames. See lj_meta_call(). */
+    if (level-- == 0) {
+      *size = cast_int(nextframe - frame);
+      return frame;  /* Level found. */
+    }
+    nextframe = frame;
+    if (frame_islua(frame)) {
+      frame = frame_prevl(frame);
+    } else {
+      if (frame_isvarg(frame))
+	level++;  /* Skip vararg pseudo-frame. */
+      frame = frame_prevd(frame);
+    }
+  }
+  *size = level;
+  return NULL;  /* Level not found. */
+}
+
+/* Fill ar->i_ci with the frame at the given level: high 16 bits = frame
+** size, low 16 bits = frame offset. Returns 1 if the level exists.
+*/
+LUA_API int lua_getstack(lua_State *L, int level, lua_Debug *ar)
+{
+  int size;
+  cTValue *frame = lj_err_getframe(L, level, &size);
+  if (frame) {
+    ar->i_ci = (size << 16) + cast_int(frame - L->stack);
+    return 1;
+  } else {
+    ar->i_ci = level - size;
+    return 0;
+  }
+}
+
+/* -- Error handling ------------------------------------------------------ */
+
+/* Return string object for error message. */
+LJ_NOINLINE GCstr *lj_err_str(lua_State *L, ErrMsg em)
+{
+  return lj_str_newz(L, err2msg(em));
+}
+
+/* Unwind Lua stack and add error message on top. */
+/* Closes upvalues above `top`, places the error value at `top` (a fixed
+** message for OOM/errerr, otherwise the value at the old stack top) and
+** resets L->top and the stack limit.
+*/
+LJ_NOINLINE static void unwindstack(lua_State *L, TValue *top, int errcode)
+{
+  lj_func_closeuv(L, top);
+  switch (errcode) {
+  case LUA_ERRMEM:
+    setstrV(L, top, lj_err_str(L, LJ_ERR_ERRMEM));
+    break;
+  case LUA_ERRERR:
+    setstrV(L, top, lj_err_str(L, LJ_ERR_ERRERR));
+    break;
+  case LUA_ERRSYNTAX:
+  case LUA_ERRRUN:
+    copyTV(L, top, L->top - 1);
+    break;
+  default:
+    lua_assert(0);
+    break;
+  }
+  L->top = top+1;
+  lj_state_relimitstack(L);
+}
+
+/* Throw error. Find catch frame, unwind stack and continue. */
+/* Walks the frame chain (and the parallel C-frame chain) downwards until a
+** protected frame (FRAME_CP / FRAME_PCALL[H] / protected cframe) is found,
+** then unwinds to it via the assembler helpers lj_vm_unwind_c/_ff, which
+** do not return. With no catch frame, the error propagates to a resume,
+** the main thread, the panic handler, and finally exit().
+*/
+LJ_NOINLINE void lj_err_throw(lua_State *L, int errcode)
+{
+  TValue *frame = L->base-1;
+  void *cf = L->cframe;
+  global_State *g = G(L);
+  if (L->status == LUA_ERRERR+1) {  /* Don't touch the stack during lua_open. */
+    lj_vm_unwind_c(cf, errcode);
+    goto uncaught;  /* unreachable */
+  }
+  lj_trace_abort(g);
+  setgcrefnull(g->jit_L);
+  L->status = 0;
+  while (cf) {
+    if (cframe_nres(cframe_raw(cf)) < 0) {  /* cframe without frame? */
+      TValue *top = restorestack(L, -cframe_nres(cf));
+      if (frame < top) {
+	L->cframe = cframe_prev(cf);
+	L->base = frame+1;
+	unwindstack(L, top, errcode);
+	lj_vm_unwind_c(cf, errcode);
+	goto uncaught;  /* unreachable */
+      }
+    }
+    if (frame <= L->stack)
+      break;
+    switch (frame_typep(frame)) {
+    case FRAME_LUA:
+    case FRAME_LUAP:
+      frame = frame_prevl(frame);
+      break;
+    case FRAME_C:
+      if (cframe_canyield(cf)) goto uncaught;
+      cf = cframe_prev(cf);
+      /* fallthrough */
+    case FRAME_CONT:
+    case FRAME_VARG:
+      frame = frame_prevd(frame);
+      break;
+    case FRAME_CP:  /* Protected C frame: unwind to it. */
+      L->cframe = cframe_prev(cf);
+      L->base = frame_prevd(frame) + 1;
+      unwindstack(L, frame, errcode);
+      lj_vm_unwind_c(cf, errcode);
+      goto uncaught;  /* unreachable */
+    case FRAME_PCALL:
+      hook_leave(g);
+      /* fallthrough */
+    case FRAME_PCALLH:  /* pcall/xpcall frame: fast-function unwind. */
+      L->cframe = cf;
+      L->base = frame_prevd(frame) + 1;
+      unwindstack(L, L->base, errcode);
+      lj_vm_unwind_ff(cf);
+      goto uncaught;  /* unreachable */
+    default:
+      lua_assert(0);
+      goto uncaught;
+    }
+  }
+  /* No catch frame found. Must be a resume or an unprotected error. */
+uncaught:
+  L->status = cast_byte(errcode);
+  L->cframe = NULL;
+  if (cframe_canyield(cf)) {  /* Resume? */
+    unwindstack(L, L->top, errcode);
+    lj_vm_unwind_c(cf, errcode);
+  }
+  /* Better rethrow on main thread than panic. */
+  {
+    if (L != mainthread(g))
+      lj_err_throw(mainthread(g), errcode);
+    if (g->panic) {
+      L->base = L->stack+1;
+      unwindstack(L, L->base, errcode);
+      g->panic(L);
+    }
+  }
+  exit(EXIT_FAILURE);
+}
+
+/* Find error function for runtime errors. Requires an extra stack traversal. */
+/* Returns the stack-relative position of the active error handler (xpcall
+** errfunc or a cframe-registered handler), or 0 if there is none.
+*/
+static ptrdiff_t finderrfunc(lua_State *L)
+{
+  TValue *frame = L->base-1;
+  void *cf = L->cframe;
+  while (frame > L->stack) {
+    lua_assert(cf != NULL);
+    while (cframe_nres(cframe_raw(cf)) < 0) {  /* cframe without frame? */
+      if (frame >= restorestack(L, -cframe_nres(cf)))
+	break;
+      if (cframe_errfunc(cf) >= 0)  /* Error handler not inherited (-1)? */
+	return cframe_errfunc(cf);
+      cf = cframe_prev(cf);  /* Else unwind cframe and continue searching. */
+      if (cf == NULL)
+	return 0;
+    }
+    switch (frame_typep(frame)) {
+    case FRAME_LUA:
+    case FRAME_LUAP:
+      frame = frame_prevl(frame);
+      break;
+    case FRAME_C:
+      if (cframe_canyield(cf)) return 0;
+      cf = cframe_prev(cf);
+      /* fallthrough */
+    case FRAME_CONT:
+    case FRAME_VARG:
+      frame = frame_prevd(frame);
+      break;
+    case FRAME_CP:
+      if (cframe_errfunc(cf) >= 0)
+	return cframe_errfunc(cf);
+      frame = frame_prevd(frame);
+      break;
+    case FRAME_PCALL:
+    case FRAME_PCALLH:
+      if (frame_ftsz(frame) >= (ptrdiff_t)(2*sizeof(TValue)))  /* xpcall? */
+	return savestack(L, frame-1);  /* Point to xpcall's errorfunc. */
+      return 0;
+    default:
+      lua_assert(0);
+      return 0;
+    }
+  }
+  return 0;
+}
+
+/* Runtime error. */
+/* If an error handler is active, call it on the error value first (guarding
+** against recursive handler errors via LUA_ERRERR), then throw.
+*/
+LJ_NOINLINE void lj_err_run(lua_State *L)
+{
+  ptrdiff_t ef = finderrfunc(L);
+  if (ef) {
+    TValue *errfunc = restorestack(L, ef);
+    TValue *top = L->top;
+    lj_trace_abort(G(L));
+    if (!tvisfunc(errfunc) || L->status == LUA_ERRERR)
+      lj_err_throw(L, LUA_ERRERR);
+    L->status = LUA_ERRERR;
+    copyTV(L, top, top-1);
+    copyTV(L, top-1, errfunc);
+    L->top = top+1;
+    lj_vm_call(L, top, 1+1);  /* Stack: |errfunc|msg| -> |msg| */
+  }
+  lj_err_throw(L, LUA_ERRRUN);
+}
+
+/* Add location to error message. */
+/* Pushes "chunk:line: msg" when the frame is a Lua function, else just msg. */
+LJ_NOINLINE static void err_loc(lua_State *L, const char *msg,
+				cTValue *frame, cTValue *nextframe)
+{
+  if (frame) {
+    GCfunc *fn = frame_func(frame);
+    if (isluafunc(fn)) {
+      char buff[LUA_IDSIZE];
+      BCLine line = currentline(L, fn, nextframe);
+      err_chunkid(buff, strdata(funcproto(fn)->chunkname));
+      lj_str_pushf(L, "%s:%d: %s", buff, line, msg);
+      return;
+    }
+  }
+  lj_str_pushf(L, "%s", msg);
+}
+
+/* Formatted runtime error message. */
+/* Formats em with varargs, prepends the current location and throws.
+** Does not return.
+*/
+LJ_NORET LJ_NOINLINE static void err_msgv(lua_State *L, ErrMsg em, ...)
+{
+  const char *msg;
+  va_list argp;
+  va_start(argp, em);
+  if (curr_funcisL(L)) L->top = curr_topL(L);
+  msg = lj_str_pushvf(L, err2msg(em), argp);
+  va_end(argp);
+  err_loc(L, msg, L->base-1, NULL);
+  lj_err_run(L);
+}
+
+/* Non-vararg variant for better calling conventions. */
+LJ_NOINLINE void lj_err_msg(lua_State *L, ErrMsg em)
+{
+  err_msgv(L, em);
+}
+
+/* Lexer error. */
+/* Throws LUA_ERRSYNTAX with "chunk:line: msg [near 'tok']". */
+LJ_NOINLINE void lj_err_lex(lua_State *L, const char *src, const char *tok,
+			    BCLine line, ErrMsg em, va_list argp)
+{
+  char buff[LUA_IDSIZE];
+  const char *msg;
+  err_chunkid(buff, src);
+  msg = lj_str_pushvf(L, err2msg(em), argp);
+  msg = lj_str_pushf(L, "%s:%d: %s", buff, line, msg);
+  if (tok)
+    lj_str_pushf(L, err2msg(LJ_ERR_XNEAR), msg, tok);
+  lj_err_throw(L, LUA_ERRSYNTAX);
+}
+
+/* Typecheck error for operands. */
+/* Reports "attempt to <op> a <kind> '<name>' (a <type> value)" when the
+** operand's origin can be named, else the plain variant. Does not return.
+*/
+LJ_NOINLINE void lj_err_optype(lua_State *L, cTValue *o, ErrMsg opm)
+{
+  const char *tname = typename(o);
+  const char *oname = NULL;
+  const char *opname = err2msg(opm);
+  if (curr_funcisL(L)) {
+    GCproto *pt = curr_proto(L);
+    const BCIns *pc = cframe_Lpc(L) - 1;
+    const char *kind = getobjname(pt, pc, (BCReg)(o - L->base), &oname);
+    if (kind)
+      err_msgv(L, LJ_ERR_BADOPRT, opname, kind, oname, tname);
+  }
+  err_msgv(L, LJ_ERR_BADOPRV, opname, tname);
+}
+
+/* Typecheck error for ordered comparisons. */
+LJ_NOINLINE void lj_err_comp(lua_State *L, cTValue *o1, cTValue *o2)
+{
+  const char *t1 = typename(o1);
+  const char *t2 = typename(o2);
+  err_msgv(L, t1 == t2 ? LJ_ERR_BADCMPV : LJ_ERR_BADCMPT, t1, t2);
+  /* This assumes the two "boolean" entries are commoned by the C compiler. */
+}
+
+/* Typecheck error for __call. */
+LJ_NOINLINE void lj_err_optype_call(lua_State *L, TValue *o)
+{
+  /* Gross hack if lua_[p]call or pcall/xpcall fail for a non-callable object:
+  ** L->base still points to the caller. So add a dummy frame with L instead
+  ** of a function. See lua_getstack().
+  */
+  const BCIns *pc = cframe_Lpc(L);
+  if (((ptrdiff_t)pc & FRAME_TYPE) != FRAME_LUA) {
+    const char *tname = typename(o);
+    setframe_pc(o, pc);
+    setframe_gc(o, obj2gco(L));
+    L->top = L->base = o+1;
+    err_msgv(L, LJ_ERR_BADCALL, tname);
+  }
+  lj_err_optype(L, o, LJ_ERR_OPCALL);
+}
+
+/* Error in context of caller. */
+/* Attribute the error location to the caller's frame, not the current one. */
+LJ_NOINLINE void lj_err_callermsg(lua_State *L, const char *msg)
+{
+  cTValue *frame = L->base-1;
+  cTValue *pframe = frame_islua(frame) ? frame_prevl(frame) : NULL;
+  err_loc(L, msg, pframe, frame);
+  lj_err_run(L);
+}
+
+/* Formatted error in context of caller. */
+LJ_NOINLINE void lj_err_callerv(lua_State *L, ErrMsg em, ...)
+{
+  const char *msg;
+  va_list argp;
+  va_start(argp, em);
+  msg = lj_str_pushvf(L, err2msg(em), argp);
+  va_end(argp);
+  lj_err_callermsg(L, msg);
+}
+
+/* Error in context of caller. */
+LJ_NOINLINE void lj_err_caller(lua_State *L, ErrMsg em)
+{
+  lj_err_callermsg(L, err2msg(em));
+}
+
+/* Argument error message. */
+/* ftype[3] == 'h' singles out "method" among the kind strings returned by
+** getfuncname/getobjname; for a method, argument 1 is `self`, so shift the
+** argument number and use the "bad self" message.
+*/
+LJ_NORET LJ_NOINLINE static void err_argmsg(lua_State *L, int narg,
+					    const char *msg)
+{
+  const char *fname = "?";
+  const char *ftype = getfuncname(L, L->base - 1, &fname);
+  if (ftype && ftype[3] == 'h' && --narg == 0)  /* Check for "method". */
+    msg = lj_str_pushf(L, err2msg(LJ_ERR_BADSELF), fname, msg);
+  else
+    msg = lj_str_pushf(L, err2msg(LJ_ERR_BADARG), narg, fname, msg);
+  lj_err_callermsg(L, msg);
+}
+
+/* Formatted argument error. */
+LJ_NOINLINE void lj_err_argv(lua_State *L, int narg, ErrMsg em, ...)
+{
+  const char *msg;
+  va_list argp;
+  va_start(argp, em);
+  msg = lj_str_pushvf(L, err2msg(em), argp);
+  va_end(argp);
+  err_argmsg(L, narg, msg);
+}
+
+/* Argument error. */
+LJ_NOINLINE void lj_err_arg(lua_State *L, int narg, ErrMsg em)
+{
+  err_argmsg(L, narg, err2msg(em));
+}
+
+/* Typecheck error for arguments. */
+/* Reports "<xname> expected, got <type>"; "no value" if narg is past top. */
+LJ_NOINLINE void lj_err_argtype(lua_State *L, int narg, const char *xname)
+{
+  TValue *o = L->base + narg-1;
+  const char *tname = o < L->top ? typename(o) : lj_obj_typename[0];
+  const char *msg = lj_str_pushf(L, err2msg(LJ_ERR_BADTYPE), xname, tname);
+  err_argmsg(L, narg, msg);
+}
+
+/* Typecheck error for arguments. */
+LJ_NOINLINE void lj_err_argt(lua_State *L, int narg, int tt)
+{
+  lj_err_argtype(L, narg, lj_obj_typename[tt+1]);
+}
+
+/* -- Public error handling API ------------------------------------------- */
+
+/* Install a new panic handler; returns the previous one. */
+LUA_API lua_CFunction lua_atpanic(lua_State *L, lua_CFunction panicf)
+{
+  lua_CFunction old = G(L)->panic;
+  G(L)->panic = panicf;
+  return old;
+}
+
+/* Forwarders for the public API (C calling convention and no LJ_NORET). */
+LUA_API int lua_error(lua_State *L)
+{
+  lj_err_run(L);
+  return 0;  /* unreachable */
+}
+
+LUALIB_API int luaL_argerror(lua_State *L, int narg, const char *msg)
+{
+  err_argmsg(L, narg, msg);
+  return 0;  /* unreachable */
+}
+
+LUALIB_API int luaL_typerror(lua_State *L, int narg, const char *xname)
+{
+  lj_err_argtype(L, narg, xname);
+  return 0;  /* unreachable */
+}
+
+/* Push "chunk:line: " for the given stack level (empty if unavailable). */
+LUALIB_API void luaL_where(lua_State *L, int level)
+{
+  int size;
+  cTValue *frame = lj_err_getframe(L, level, &size);
+  err_loc(L, "", frame, size ? frame+size : NULL);
+}
+
+/* Format a message, prepend the caller's location and raise the error. */
+LUALIB_API int luaL_error(lua_State *L, const char *fmt, ...)
+{
+  const char *msg;
+  va_list argp;
+  va_start(argp, fmt);
+  msg = lj_str_pushvf(L, fmt, argp);
+  va_end(argp);
+  lj_err_callermsg(L, msg);
+  return 0;  /* unreachable */
+}
+

+ 40 - 0
src/lj_err.h

@@ -0,0 +1,40 @@
+/*
+** Error handling and debugging support.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_ERR_H
+#define _LJ_ERR_H
+
+#include <stdarg.h>
+
+#include "lj_obj.h"
+
+/* Error message IDs. Each visible ID is followed by a hidden enumerator
+** that advances by sizeof(msg)-1, so IDs double as byte offsets into one
+** concatenated message string (messages defined in lj_errmsg.h).
+*/
+typedef enum {
+#define ERRDEF(name, msg) \
+  LJ_ERR_##name, LJ_ERR_##name##_ = LJ_ERR_##name + sizeof(msg)-1,
+#include "lj_errmsg.h"
+  LJ_ERR__MAX
+} ErrMsg;
+
+/* All lj_err_* throwers below are LJ_FUNC_NORET: they never return. */
+LJ_FUNC GCstr *lj_err_str(lua_State *L, ErrMsg em);
+LJ_FUNC_NORET void lj_err_throw(lua_State *L, int errcode);
+LJ_FUNC_NORET void lj_err_run(lua_State *L);
+LJ_FUNC_NORET void lj_err_msg(lua_State *L, ErrMsg em);
+LJ_FUNC_NORET void lj_err_lex(lua_State *L, const char *src, const char *tok,
+			      BCLine line, ErrMsg em, va_list argp);
+LJ_FUNC_NORET void lj_err_optype(lua_State *L, cTValue *o, ErrMsg opm);
+LJ_FUNC_NORET void lj_err_comp(lua_State *L, cTValue *o1, cTValue *o2);
+LJ_FUNC_NORET void lj_err_optype_call(lua_State *L, TValue *o);
+LJ_FUNC_NORET void lj_err_callermsg(lua_State *L, const char *msg);
+LJ_FUNC_NORET void lj_err_callerv(lua_State *L, ErrMsg em, ...);
+LJ_FUNC_NORET void lj_err_caller(lua_State *L, ErrMsg em);
+LJ_FUNC_NORET void lj_err_arg(lua_State *L, int narg, ErrMsg em);
+LJ_FUNC_NORET void lj_err_argv(lua_State *L, int narg, ErrMsg em, ...);
+LJ_FUNC_NORET void lj_err_argtype(lua_State *L, int narg, const char *xname);
+LJ_FUNC_NORET void lj_err_argt(lua_State *L, int narg, int tt);
+
+LJ_FUNC void lj_err_pushloc(lua_State *L, GCproto *pt, BCPos pc);
+LJ_FUNC cTValue *lj_err_getframe(lua_State *L, int level, int *size);
+
+#endif

+ 134 - 0
src/lj_errmsg.h

@@ -0,0 +1,134 @@
+/*
+** VM error messages.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+/* This file may be included multiple times with different ERRDEF macros. */
+/* The includer defines ERRDEF(name, msg); it is #undef'd at the end so the
+** next inclusion can redefine it (X-macro pattern, see e.g. lj_err.h). */
+
+/* Basic error handling. */
+ERRDEF(ERRMEM,	"not enough memory")
+ERRDEF(ERRERR,	"error in error handling")
+
+/* Allocations. */
+ERRDEF(STROV,	"string length overflow")
+ERRDEF(UDATAOV,	"userdata length overflow")
+ERRDEF(STKOV,	"stack overflow")
+ERRDEF(STKOVM,	"stack overflow (%s)")
+ERRDEF(TABOV,	"table overflow")
+
+/* Table indexing. */
+ERRDEF(NANIDX,	"table index is NaN")
+ERRDEF(NILIDX,	"table index is nil")
+ERRDEF(NEXTIDX,	"invalid key to " LUA_QL("next"))
+
+/* Metamethod resolving. */
+ERRDEF(BADCALL,	"attempt to call a %s value")
+ERRDEF(BADOPRT,	"attempt to %s %s " LUA_QS " (a %s value)")
+ERRDEF(BADOPRV,	"attempt to %s a %s value")
+ERRDEF(BADCMPT,	"attempt to compare %s with %s")
+ERRDEF(BADCMPV,	"attempt to compare two %s values")
+ERRDEF(GETLOOP,	"loop in gettable")
+ERRDEF(SETLOOP,	"loop in settable")
+ERRDEF(OPCALL,	"call")
+ERRDEF(OPINDEX,	"index")
+ERRDEF(OPARITH,	"perform arithmetic on")
+ERRDEF(OPCAT,	"concatenate")
+ERRDEF(OPLEN,	"get length of")
+
+/* Type checks. */
+ERRDEF(BADSELF,	"calling " LUA_QS " on bad self (%s)")
+ERRDEF(BADARG,	"bad argument #%d to " LUA_QS " (%s)")
+ERRDEF(BADTYPE,	"%s expected, got %s")
+ERRDEF(BADVAL,	"invalid value")
+ERRDEF(NOVAL,	"value expected")
+ERRDEF(NOCORO,	"coroutine expected")
+ERRDEF(NOTABN,	"nil or table expected")
+ERRDEF(NOLFUNC,	"Lua function expected")
+ERRDEF(NOFUNCL,	"function or level expected")
+ERRDEF(NOSFT,	"string/function/table expected")
+ERRDEF(NOPROXY,	"boolean or proxy expected")
+ERRDEF(FORINIT,	LUA_QL("for") " initial value must be a number")
+ERRDEF(FORLIM,	LUA_QL("for") " limit must be a number")
+ERRDEF(FORSTEP,	LUA_QL("for") " step must be a number")
+
+/* C API checks. */
+ERRDEF(NOENV,	"no calling environment")
+ERRDEF(CYIELD,	"attempt to yield across C-call boundary")
+ERRDEF(BADLU,	"bad light userdata pointer")
+ERRDEF(NOGCMM,	"bad action while in __gc metamethod")
+
+/* Standard library function errors. */
+ERRDEF(ASSERT,	"assertion failed!")
+ERRDEF(PROTMT,	"cannot change a protected metatable")
+ERRDEF(UNPACK,	"too many results to unpack")
+ERRDEF(RDRSTR,	"reader function must return a string")
+ERRDEF(PRTOSTR,	LUA_QL("tostring") " must return a string to " LUA_QL("print"))
+ERRDEF(IDXRNG,	"index out of range")
+ERRDEF(BASERNG,	"base out of range")
+ERRDEF(LVLRNG,	"level out of range")
+ERRDEF(INVLVL,	"invalid level")
+ERRDEF(INVOPT,	"invalid option")
+ERRDEF(INVOPTM,	"invalid option " LUA_QS)
+ERRDEF(INVFMT,	"invalid format")
+ERRDEF(SETFENV,	LUA_QL("setfenv") " cannot change environment of given object")
+ERRDEF(CORUN,	"cannot resume running coroutine")
+ERRDEF(CODEAD,	"cannot resume dead coroutine")
+ERRDEF(COSUSP,	"cannot resume non-suspended coroutine")
+ERRDEF(TABINS,	"wrong number of arguments to " LUA_QL("insert"))
+ERRDEF(TABCAT,	"invalid value (%s) at index %d in table for " LUA_QL("concat"))
+ERRDEF(TABSORT,	"invalid order function for sorting")
+ERRDEF(IOCLFL,	"attempt to use a closed file")
+ERRDEF(IOSTDCL,	"standard file is closed")
+ERRDEF(OSUNIQF,	"unable to generate a unique filename")
+ERRDEF(OSDATEF,	"field " LUA_QS " missing in date table")
+ERRDEF(STRDUMP,	"cannot dump functions")
+ERRDEF(STRSLC,	"string slice too long")
+ERRDEF(STRPATB,	"missing " LUA_QL("[") " after " LUA_QL("%f") " in pattern")
+ERRDEF(STRPATC,	"invalid pattern capture")
+ERRDEF(STRPATE,	"malformed pattern (ends with " LUA_QL("%") ")")
+ERRDEF(STRPATM,	"malformed pattern (missing " LUA_QL("]") ")")
+ERRDEF(STRPATU,	"unbalanced pattern")
+ERRDEF(STRCAPI,	"invalid capture index")
+ERRDEF(STRCAPN,	"too many captures")
+ERRDEF(STRCAPU,	"unfinished capture")
+ERRDEF(STRFMTO,	"invalid option " LUA_QL("%%%c") " to " LUA_QL("format"))
+ERRDEF(STRFMTR,	"invalid format (repeated flags)")
+ERRDEF(STRFMTW,	"invalid format (width or precision too long)")
+ERRDEF(STRGSRV,	"invalid replacement value (a %s)")
+ERRDEF(BADMODN,	"name conflict for module " LUA_QS)
+ERRDEF(NOJIT,	"JIT compiler permanently disabled")
+ERRDEF(JITOPT,	"unknown or malformed optimization flag " LUA_QS)
+
+/* Lexer/parser errors. */
+ERRDEF(XNEAR,	"%s near " LUA_QS)
+ERRDEF(XELEM,	"lexical element too long")
+ERRDEF(XLINES,	"chunk has too many lines")
+ERRDEF(XLEVELS,	"chunk has too many syntax levels")
+ERRDEF(XNUMBER,	"malformed number")
+ERRDEF(XLSTR,	"unfinished long string")
+ERRDEF(XLCOM,	"unfinished long comment")
+ERRDEF(XSTR,	"unfinished string")
+ERRDEF(XESC,	"escape sequence too large")
+ERRDEF(XLDELIM,	"invalid long string delimiter")
+ERRDEF(XBCLOAD,	"cannot load Lua bytecode")
+ERRDEF(XTOKEN,	LUA_QS " expected")
+ERRDEF(XJUMP,	"control structure too long")
+ERRDEF(XSLOTS,	"function or expression too complex")
+ERRDEF(XLIMM,	"main function has more than %d %s")
+ERRDEF(XLIMF,	"function at line %d has more than %d %s")
+ERRDEF(XMATCH,	LUA_QS " expected (to close " LUA_QS " at line %d)")
+ERRDEF(XFIXUP,	"function too long for return fixup")
+ERRDEF(XPARAM,	"<name> or " LUA_QL("...") " expected")
+ERRDEF(XAMBIG,	"ambiguous syntax (function call x new statement)")
+ERRDEF(XFUNARG,	"function arguments expected")
+ERRDEF(XSYMBOL,	"unexpected symbol")
+ERRDEF(XDOTS,	"cannot use " LUA_QL("...") " outside a vararg function")
+ERRDEF(XSYNTAX,	"syntax error")
+ERRDEF(XBREAK,	"no loop to break")
+ERRDEF(XFOR,	LUA_QL("=") " or " LUA_QL("in") " expected")
+
+#undef ERRDEF
+
+/* Detecting unused error messages:
+   awk -F, '/^ERRDEF/ { gsub(/ERRDEF./, ""); printf "grep -q LJ_ERR_%s *.[ch] || echo %s\n", $1, $1}' lj_errmsg.h | sh
+*/
+ 18 - 0
src/lj_ff.h

@@ -0,0 +1,18 @@
+/*
+** Fast function IDs.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_FF_H
+#define _LJ_FF_H
+
+/* Fast function ID. Entries after FF_C are generated from lj_ffdef.h. */
+typedef enum {
+  FF_LUA_ = FF_LUA,	/* Lua function (must be 0). */
+  FF_C_ = FF_C,		/* Regular C function (must be 1). */
+#define FFDEF(name)	FF_##name,
+#include "lj_ffdef.h"
+  FF__MAX
+} FastFunc;
+
+#endif

+ 84 - 0
src/lj_frame.h

@@ -0,0 +1,84 @@
+/*
+** Stack frames.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_FRAME_H
+#define _LJ_FRAME_H
+
+#include "lj_obj.h"
+#include "lj_bc.h"
+
+/* -- Lua stack frame ----------------------------------------------------- */
+
+/* Frame type markers in callee function slot (callee base-1). */
+enum {
+  FRAME_LUA, FRAME_C, FRAME_CONT, FRAME_VARG,
+  FRAME_LUAP, FRAME_CP, FRAME_PCALL, FRAME_PCALLH
+};
+#define FRAME_TYPE		3	/* Mask for the low 2 type bits. */
+#define FRAME_P			4	/* Protected-frame flag bit. */
+#define FRAME_TYPEP		(FRAME_TYPE|FRAME_P)
+
+/* Macros to access and modify Lua frames. */
+#define frame_gc(f)		(gcref((f)->fr.func))
+#define frame_func(f)		(&frame_gc(f)->fn)
+#define frame_ftsz(f)		((f)->fr.tp.ftsz)
+
+#define frame_type(f)		(frame_ftsz(f) & FRAME_TYPE)
+#define frame_typep(f)		(frame_ftsz(f) & FRAME_TYPEP)
+#define frame_islua(f)		(frame_type(f) == FRAME_LUA)
+#define frame_isc(f)		(frame_type(f) == FRAME_C)
+#define frame_iscont(f)		(frame_typep(f) == FRAME_CONT)
+#define frame_isvarg(f)		(frame_typep(f) == FRAME_VARG)
+/* Mask 6 matches both FRAME_PCALL (6) and FRAME_PCALLH (7). */
+#define frame_ispcall(f)	((frame_ftsz(f) & 6) == FRAME_PCALL)
+
+#define frame_pc(f)		(mref((f)->fr.tp.pcr, const BCIns))
+#define frame_contpc(f)		(frame_pc((f)-1))
+#if LJ_64
+/* On 64 bit the continuation is stored as a 32 bit offset from the VM base. */
+#define frame_contf(f) \
+  ((ASMFunction)(void *)((intptr_t)lj_vm_asm_begin+(((f)-1)->u64 & 0xffffffff)))
+#else
+#define frame_contf(f)		((ASMFunction)gcrefp(((f)-1)->gcr, void))
+#endif
+/* ftsz >> 3: frame delta in slots -- assumes 8-byte slots; TODO confirm. */
+#define frame_delta(f)		(frame_ftsz(f) >> 3)
+#define frame_sized(f)		(frame_ftsz(f) & ~FRAME_TYPEP)
+
+#define frame_prevl(f)		((f) - (1+bc_a(frame_pc(f)[-1])))
+#define frame_prevd(f)		((TValue *)((char *)(f) - frame_sized(f)))
+#define frame_prev(f)		(frame_islua(f)?frame_prevl(f):frame_prevd(f))
+/* Note: this macro does not skip over FRAME_VARG. */
+
+#define setframe_pc(f, pc)	(setmref((f)->fr.tp.pcr, (pc)))
+#define setframe_gc(f, p)	(setgcref((f)->fr.func, (p)))
+
+/* -- C stack frame ------------------------------------------------------- */
+
+/* Macros to access and modify the C stack frame chain. */
+
+/* These definitions must match with the arch-specific *.dasc files. */
+#if LJ_TARGET_X86
+#define CFRAME_OFS_ERRF		(15*sizeof(void *))
+#define CFRAME_OFS_NRES		(14*sizeof(void *))
+#define CFRAME_OFS_PREV		(13*sizeof(void *))
+#define CFRAME_OFS_L		(12*sizeof(void *))
+#define CFRAME_OFS_PC		(6*sizeof(void *))
+#define CFRAME_SIZE		(12*sizeof(void *))
+#else
+#error "Missing CFRAME_* definitions for this architecture"
+#endif
+
+/* Low bit of a cframe pointer flags a frame that may yield. */
+#define CFRAME_RESUME		1
+#define CFRAME_CANYIELD		((intptr_t)(CFRAME_RESUME))
+#define CFRAME_RAWMASK		(~CFRAME_CANYIELD)
+
+#define cframe_errfunc(cf)	(*(ptrdiff_t *)(((char *)cf)+CFRAME_OFS_ERRF))
+#define cframe_nres(cf)		(*(ptrdiff_t *)(((char *)cf)+CFRAME_OFS_NRES))
+#define cframe_prev(cf)		(*(void **)(((char *)cf)+CFRAME_OFS_PREV))
+#define cframe_L(cf)		(*(lua_State **)(((char *)cf)+CFRAME_OFS_L))
+#define cframe_pc(cf)		(*(const BCIns **)(((char *)cf)+CFRAME_OFS_PC))
+#define cframe_canyield(cf)	((intptr_t)(cf) & CFRAME_CANYIELD)
+#define cframe_raw(cf)		((void *)((intptr_t)(cf) & CFRAME_RAWMASK))
+#define cframe_Lpc(L)		cframe_pc(cframe_raw(L->cframe))
+
+#endif

+ 185 - 0
src/lj_func.c

@@ -0,0 +1,185 @@
+/*
+** Function handling (prototypes, functions and upvalues).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_func_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_func.h"
+#include "lj_trace.h"
+#include "lj_vm.h"
+
+/* -- Prototypes ---------------------------------------------------------- */
+
+/* Allocate a new, empty prototype. All sizes/pointers start at zero/NULL. */
+GCproto *lj_func_newproto(lua_State *L)
+{
+  GCproto *pt = lj_mem_newobj(L, GCproto);
+  pt->gct = ~LJ_TPROTO;
+  pt->numparams = 0;
+  pt->framesize = 0;
+  pt->sizeuv = 0;
+  pt->flags = 0;
+  pt->trace = 0;
+  pt->k.n = NULL;
+  pt->bc = NULL;
+  pt->uv = NULL;
+  pt->sizebc = 0;
+  pt->sizekgc = 0;
+  pt->sizekn = 0;
+  pt->sizelineinfo = 0;
+  pt->sizevarinfo = 0;
+  pt->sizeuvname = 0;
+  pt->linedefined = 0;
+  pt->lastlinedefined = 0;
+  pt->lineinfo = NULL;
+  pt->varinfo = NULL;
+  pt->uvname = NULL;
+  pt->chunkname = NULL;
+  return pt;
+}
+
+/* Free a prototype and all its owned arrays (constants, bytecode, debug). */
+void LJ_FASTCALL lj_func_freeproto(global_State *g, GCproto *pt)
+{
+  /* GC constants are stored at negative offsets from pt->k (rounded count),
+  ** number constants at positive offsets: free the combined allocation. */
+  MSize nkgc = round_nkgc(pt->sizekgc);
+  MSize sizek = nkgc*(MSize)sizeof(GCobj *) +
+		pt->sizekn*(MSize)sizeof(lua_Number);
+  lj_mem_free(g, pt->k.gc - nkgc, sizek);
+  lj_mem_freevec(g, pt->bc, pt->sizebc, BCIns);
+  lj_mem_freevec(g, pt->uv, pt->sizeuv, int16_t);
+  lj_mem_freevec(g, pt->lineinfo, pt->sizelineinfo, int32_t);
+  lj_mem_freevec(g, pt->varinfo, pt->sizevarinfo, struct VarInfo);
+  lj_mem_freevec(g, pt->uvname, pt->sizeuvname, GCstr *);
+  lj_trace_freeproto(g, pt);  /* Also release attached traces. */
+  lj_mem_freet(g, pt);
+}
+
+/* -- Upvalues ------------------------------------------------------------ */
+
+/* Unlink an upvalue from the doubly-linked GC list (uvhead ring). */
+static void unlinkuv(GCupval *uv)
+{
+  lua_assert(uvprev(uvnext(uv)) == uv && uvnext(uvprev(uv)) == uv);
+  setgcrefr(uvnext(uv)->prev, uv->prev);
+  setgcrefr(uvprev(uv)->next, uv->next);
+}
+
+/* Find existing open upvalue for a stack slot or create a new one. */
+static GCupval *func_finduv(lua_State *L, TValue *slot)
+{
+  global_State *g = G(L);
+  GCRef *pp = &L->openupval;
+  GCupval *p;
+  GCupval *uv;
+  /* Search the sorted list of open upvalues (sorted by descending slot). */
+  while (gcref(*pp) != NULL && (p = gco2uv(gcref(*pp)))->v >= slot) {
+    lua_assert(!p->closed && p->v != &p->tv);
+    if (p->v == slot) {  /* Found open upvalue pointing to same slot? */
+      if (isdead(g, obj2gco(p)))  /* Resurrect it, if it's dead. */
+	flipwhite(obj2gco(p));
+      return p;
+    }
+    pp = &p->nextgc;
+  }
+  /* No matching upvalue found. Create a new one. */
+  uv = lj_mem_newt(L, sizeof(GCupval), GCupval);
+  newwhite(g, uv);
+  uv->gct = ~LJ_TUPVAL;
+  uv->closed = 0;  /* Still open. */
+  uv->v = slot;  /* Pointing to the stack slot. */
+  /* NOBARRIER: The GCupval is new (marked white) and open. */
+  setgcrefr(uv->nextgc, *pp);  /* Insert into sorted list of open upvalues. */
+  setgcref(*pp, obj2gco(uv));
+  setgcref(uv->prev, obj2gco(&g->uvhead));  /* Insert into GC list, too. */
+  setgcrefr(uv->next, g->uvhead.next);
+  setgcref(uvnext(uv)->prev, obj2gco(uv));
+  setgcref(g->uvhead.next, obj2gco(uv));
+  lua_assert(uvprev(uvnext(uv)) == uv && uvnext(uvprev(uv)) == uv);
+  return uv;
+}
+
+/* Close all open upvalues pointing to some stack level or above. */
+void lj_func_closeuv(lua_State *L, TValue *level)
+{
+  GCupval *uv;
+  global_State *g = G(L);
+  while (gcref(L->openupval) != NULL &&
+	 (uv = gco2uv(gcref(L->openupval)))->v >= level) {
+    GCobj *o = obj2gco(uv);
+    lua_assert(!isblack(o) && !uv->closed && uv->v != &uv->tv);
+    setgcrefr(L->openupval, uv->nextgc);  /* No longer in open list. */
+    if (isdead(g, o)) {
+      lj_func_freeuv(g, uv);  /* Dead: free immediately. */
+    } else {
+      unlinkuv(uv);
+      lj_gc_closeuv(g, uv);  /* Alive: copy value into uv->tv and close. */
+    }
+  }
+}
+
+/* Free an upvalue; open upvalues must first leave the uvhead ring. */
+void LJ_FASTCALL lj_func_freeuv(global_State *g, GCupval *uv)
+{
+  if (!uv->closed)
+    unlinkuv(uv);
+  lj_mem_freet(g, uv);
+}
+
+/* -- Functions (closures) ------------------------------------------------ */
+
+/* Create a new C closure with nelems upvalue slots and environment env. */
+GCfunc *lj_func_newC(lua_State *L, MSize nelems, GCtab *env)
+{
+  GCfunc *fn = cast(GCfunc *, lj_mem_newgco(L, sizeCfunc(nelems)));
+  fn->c.gct = ~LJ_TFUNC;
+  fn->c.ffid = FF_C;
+  fn->c.nupvalues = cast_byte(nelems);
+  /* NOBARRIER: The GCfunc is new (marked white). */
+  setgcref(fn->c.env, obj2gco(env));
+  fn->c.gate = lj_gate_c;  /* VM entry gate for C functions. */
+  return fn;
+}
+
+/* Create a new Lua closure for prototype pt; upvalue slots stay unset. */
+GCfunc *lj_func_newL(lua_State *L, GCproto *pt, GCtab *env)
+{
+  GCfunc *fn = cast(GCfunc *, lj_mem_newgco(L, sizeLfunc((MSize)pt->sizeuv)));
+  fn->l.gct = ~LJ_TFUNC;
+  fn->l.ffid = FF_LUA;
+  fn->l.nupvalues = cast_byte(pt->sizeuv);
+  /* NOBARRIER: The GCfunc is new (marked white). */
+  setgcref(fn->l.pt, obj2gco(pt));
+  setgcref(fn->l.env, obj2gco(env));
+  /* Pick the vararg or fixarg VM entry gate depending on the prototype. */
+  fn->l.gate = (pt->flags & PROTO_IS_VARARG) ? lj_gate_lv : lj_gate_lf;
+  return fn;
+}
+
+/* Do a GC check and create a new Lua function with inherited upvalues. */
+GCfunc *lj_func_newL_gc(lua_State *L, GCproto *pt, GCfuncL *parent)
+{
+  GCfunc *fn;
+  GCRef *puv;
+  uint32_t i, nuv;
+  TValue *base;
+  lj_gc_check_fixtop(L);
+  fn = lj_func_newL(L, pt, tabref(parent->env));
+  /* NOBARRIER: The GCfunc is new (marked white). */
+  puv = parent->uvptr;
+  nuv = fn->l.nupvalues;
+  base = L->base;
+  for (i = 0; i < nuv; i++) {
+    /* Negative uv index: inherit parent upvalue ~v; otherwise an open
+    ** upvalue for stack slot base+v (found or created). */
+    int v = pt->uv[i];
+    GCupval *uv = v < 0 ? &gcref(puv[~v])->uv : func_finduv(L, base + v);
+    setgcref(fn->l.uvptr[i], obj2gco(uv));
+  }
+  return fn;
+}
+
+/* Free a closure; size depends on whether it is a Lua or a C closure. */
+void LJ_FASTCALL lj_func_free(global_State *g, GCfunc *fn)
+{
+  MSize size = isluafunc(fn) ? sizeLfunc((MSize)fn->l.nupvalues) :
+			       sizeCfunc((MSize)fn->c.nupvalues);
+  lj_mem_free(g, fn, size);
+}
+

+ 25 - 0
src/lj_func.h

@@ -0,0 +1,25 @@
+/*
+** Function handling (prototypes, functions and upvalues).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_FUNC_H
+#define _LJ_FUNC_H
+
+#include "lj_obj.h"
+
+/* Prototypes. */
+LJ_FUNC GCproto *lj_func_newproto(lua_State *L);
+LJ_FUNC void LJ_FASTCALL lj_func_freeproto(global_State *g, GCproto *pt);
+
+/* Upvalues. */
+LJ_FUNCA void lj_func_closeuv(lua_State *L, TValue *level);
+LJ_FUNC void LJ_FASTCALL lj_func_freeuv(global_State *g, GCupval *uv);
+
+/* Functions (closures). */
+LJ_FUNC GCfunc *lj_func_newC(lua_State *L, MSize nelems, GCtab *env);
+LJ_FUNC GCfunc *lj_func_newL(lua_State *L, GCproto *pt, GCtab *env);
+LJ_FUNCA GCfunc *lj_func_newL_gc(lua_State *L, GCproto *pt, GCfuncL *parent);
+LJ_FUNC void LJ_FASTCALL lj_func_free(global_State *g, GCfunc *c);
+
+#endif

+ 800 - 0
src/lj_gc.c

@@ -0,0 +1,800 @@
+/*
+** Garbage collector.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_gc_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_func.h"
+#include "lj_udata.h"
+#include "lj_meta.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_trace.h"
+#include "lj_vm.h"
+
+/* GC step tuning constants (work units per incremental step). */
+#define GCSTEPSIZE	1024u
+#define GCSWEEPMAX	40
+#define GCSWEEPCOST	10
+#define GCFINALIZECOST	100
+
+/* Macros to set GCobj colors and flags. */
+#define white2gray(x)		((x)->gch.marked &= cast_byte(~LJ_GC_WHITES))
+#define black2gray(x)		((x)->gch.marked &= cast_byte(~LJ_GC_BLACK))
+#define gray2black(x)		((x)->gch.marked |= LJ_GC_BLACK)
+#define makewhite(g, x) \
+  ((x)->gch.marked = ((x)->gch.marked & cast_byte(~LJ_GC_COLORS)) | curwhite(g))
+#define isfinalized(u)		((u)->marked & LJ_GC_FINALIZED)
+#define markfinalized(u)	((u)->marked |= LJ_GC_FINALIZED)
+
+/* -- Mark phase ---------------------------------------------------------- */
+
+/* Mark a TValue (if needed). */
+#define gc_marktv(g, tv) \
+  { lua_assert(!tvisgcv(tv) || (~itype(tv) == gcval(tv)->gch.gct)); \
+    if (tviswhite(tv)) gc_mark(g, gcV(tv)); }
+
+/* Mark a GCobj (if needed). */
+#define gc_markobj(g, o) \
+  { if (iswhite(obj2gco(o))) gc_mark(g, obj2gco(o)); }
+
+/* Mark a string object. Strings are never gray, so just clear the whites. */
+#define gc_mark_str(s)		((s)->marked &= cast_byte(~LJ_GC_WHITES))
+
+/* Mark a white GCobj. Leaf-like objects go straight to black; objects with
+** children (functions, tables, threads, prototypes) go on the gray list. */
+static void gc_mark(global_State *g, GCobj *o)
+{
+  lua_assert(iswhite(o) && !isdead(g, o));
+  white2gray(o);
+  if (LJ_UNLIKELY(o->gch.gct == ~LJ_TUDATA)) {
+    GCtab *mt = tabref(gco2ud(o)->metatable);
+    gray2black(o);  /* Userdata are never gray. */
+    if (mt) gc_markobj(g, mt);
+    gc_markobj(g, tabref(gco2ud(o)->env));
+  } else if (LJ_UNLIKELY(o->gch.gct == ~LJ_TUPVAL)) {
+    GCupval *uv = gco2uv(o);
+    gc_marktv(g, uv->v);
+    if (uv->closed)
+      gray2black(o);  /* Closed upvalues are never gray. */
+  } else if (o->gch.gct != ~LJ_TSTR) {
+    lua_assert(o->gch.gct == ~LJ_TFUNC || o->gch.gct == ~LJ_TTAB ||
+	       o->gch.gct == ~LJ_TTHREAD || o->gch.gct == ~LJ_TPROTO);
+    setgcrefr(o->gch.gclist, g->gc.gray);  /* Defer traversal: push on gray list. */
+    setgcref(g->gc.gray, o);
+  }
+}
+
+/* Mark the base metatables (per-basic-type metatables in the global state). */
+static void gc_mark_basemt(global_State *g)
+{
+  int i;
+  for (i = 0; i < BASEMT_MAX; i++)
+    if (tabref(g->basemt[i]) != NULL)
+      gc_markobj(g, tabref(g->basemt[i]));
+}
+
+/* Start a GC cycle and mark the root set (main thread, env, registry,
+** base metatables), then switch to the propagation state. */
+static void gc_mark_start(global_State *g)
+{
+  setgcrefnull(g->gc.gray);
+  setgcrefnull(g->gc.grayagain);
+  setgcrefnull(g->gc.weak);
+  gc_markobj(g, mainthread(g));
+  gc_markobj(g, tabref(mainthread(g)->env));
+  gc_marktv(g, &g->registrytv);
+  gc_mark_basemt(g);
+  g->gc.state = GCSpropagate;
+}
+
+/* Mark the values of all open upvalues (walks the uvhead ring). */
+static void gc_mark_uv(global_State *g)
+{
+  GCupval *uv;
+  for (uv = uvnext(&g->uvhead); uv != &g->uvhead; uv = uvnext(uv)) {
+    lua_assert(uvprev(uvnext(uv)) == uv && uvnext(uvprev(uv)) == uv);
+    if (isgray(obj2gco(uv)))
+      gc_marktv(g, uv->v);
+  }
+}
+
+/* Mark all userdata on the circular mmudata (to-be-finalized) list. */
+static void gc_mark_mmudata(global_State *g)
+{
+  GCobj *root = gcref(g->gc.mmudata);
+  GCobj *u = root;
+  if (u) {
+    do {
+      u = gcnext(u);
+      makewhite(g, u);  /* Could be from previous GC. */
+      gc_mark(g, u);
+    } while (u != root);
+  }
+}
+
+/* Separate userdata which need finalization into the mmudata list.
+** Returns the total size of the userdata moved. With all != 0 every
+** unfinalized userdata is separated (used on shutdown). */
+size_t lj_gc_separateudata(global_State *g, int all)
+{
+  size_t m = 0;
+  GCRef *p = &mainthread(g)->nextgc;
+  GCobj *o;
+  while ((o = gcref(*p)) != NULL) {
+    if (!(iswhite(o) || all) || isfinalized(gco2ud(o))) {
+      p = &o->gch.nextgc;  /* Nothing to do. */
+    } else if (!lj_meta_fastg(g, tabref(gco2ud(o)->metatable), MM_gc)) {
+      markfinalized(gco2ud(o));  /* Done, as there's no __gc metamethod. */
+      p = &o->gch.nextgc;
+    } else {  /* Otherwise move userdata to be finalized to mmudata list. */
+      m += sizeudata(gco2ud(o));
+      markfinalized(gco2ud(o));
+      *p = o->gch.nextgc;
+      if (gcref(g->gc.mmudata)) {  /* Link to end of mmudata list. */
+	GCobj *root = gcref(g->gc.mmudata);
+	setgcrefr(o->gch.nextgc, root->gch.nextgc);
+	setgcref(root->gch.nextgc, o);
+	setgcref(g->gc.mmudata, o);
+      } else {  /* Create circular list. */
+	setgcref(o->gch.nextgc, o);
+	setgcref(g->gc.mmudata, o);
+      }
+    }
+  }
+  return m;
+}
+
+/* -- Propagation phase --------------------------------------------------- */
+
+/* Traverse a table. Returns non-zero if the table is weak (must stay gray);
+** fully-weak tables return early since nothing inside needs marking. */
+static int gc_traverse_tab(global_State *g, GCtab *t)
+{
+  int weak = 0;
+  cTValue *mode;
+  GCtab *mt = tabref(t->metatable);
+  if (mt)
+    gc_markobj(g, mt);
+  mode = lj_meta_fastg(g, mt, MM_mode);
+  if (mode && tvisstr(mode)) {  /* Valid __mode field? */
+    const char *modestr = strVdata(mode);
+    int c;
+    while ((c = *modestr++)) {
+      if (c == 'k') weak |= LJ_GC_WEAKKEY;
+      else if (c == 'v') weak |= LJ_GC_WEAKVAL;
+    }
+    if (weak) {  /* Weak tables are cleared in the atomic phase. */
+      t->marked = cast_byte((t->marked & ~LJ_GC_WEAK) | weak);
+      setgcrefr(t->gclist, g->gc.weak);
+      setgcref(g->gc.weak, obj2gco(t));
+    }
+  }
+  if (weak == LJ_GC_WEAK)  /* Nothing to mark if both keys/values are weak. */
+    return 1;
+  if (!(weak & LJ_GC_WEAKVAL)) {  /* Mark array part. */
+    MSize i, asize = t->asize;
+    for (i = 0; i < asize; i++)
+      gc_marktv(g, arrayslot(t, i));
+  }
+  if (t->hmask > 0) {  /* Mark hash part. */
+    Node *node = noderef(t->node);
+    MSize i, hmask = t->hmask;
+    for (i = 0; i <= hmask; i++) {
+      Node *n = &node[i];
+      lua_assert(itype(&n->key) != LJ_TDEADKEY || tvisnil(&n->val));
+      if (!tvisnil(&n->val)) {  /* Mark non-empty slot. */
+	lua_assert(!tvisnil(&n->key));
+	if (!(weak & LJ_GC_WEAKKEY)) gc_marktv(g, &n->key);
+	if (!(weak & LJ_GC_WEAKVAL)) gc_marktv(g, &n->val);
+      } else if (tvisgcv(&n->key)) {  /* Leave GC key in, but mark as dead. */
+	setitype(&n->key, LJ_TDEADKEY);
+      }
+    }
+  }
+  return weak;
+}
+
+/* Traverse a function: env, and either Lua upvalue objects plus the
+** prototype, or C closure upvalue values. */
+static void gc_traverse_func(global_State *g, GCfunc *fn)
+{
+  gc_markobj(g, tabref(fn->c.env));
+  if (isluafunc(fn)) {
+    uint32_t i;
+    lua_assert(fn->l.nupvalues == funcproto(fn)->sizeuv);
+    gc_markobj(g, funcproto(fn));
+    for (i = 0; i < fn->l.nupvalues; i++)  /* Mark Lua function upvalues. */
+      gc_markobj(g, &gcref(fn->l.uvptr[i])->uv);
+  } else {
+    uint32_t i;
+    for (i = 0; i < fn->c.nupvalues; i++)  /* Mark C function upvalues. */
+      gc_marktv(g, &fn->c.upvalue[i]);
+  }
+}
+
+#if LJ_HASJIT
+/* Traverse a trace: mark all GC constants referenced by its IR. */
+static void gc_traverse_trace(global_State *g, Trace *T)
+{
+  IRRef ref;
+  for (ref = T->nk; ref < REF_TRUE; ref++) {  /* Constants live below REF_TRUE. */
+    IRIns *ir = &T->ir[ref];
+    if (ir->o == IR_KGC)
+      gc_markobj(g, ir_kgc(ir));
+  }
+}
+
+/* The current trace is a GC root while not anchored in the prototype (yet). */
+#define gc_mark_curtrace(g) \
+  { if (G2J(g)->state != LJ_TRACE_IDLE && G2J(g)->curtrace != 0) \
+    gc_traverse_trace(g, &G2J(g)->cur); }
+#else
+#define gc_mark_curtrace(g)	UNUSED(g)
+#endif
+
+/* Traverse a prototype: chunk name, GC constants, debug-info strings,
+** and (with JIT) all root traces plus their attached side traces. */
+static void gc_traverse_proto(global_State *g, GCproto *pt)
+{
+  ptrdiff_t i;
+#if LJ_HASJIT
+  jit_State *J = G2J(g);
+  TraceNo root, side;
+  /* Mark all root traces and attached side traces. */
+  for (root = pt->trace; root != 0; root = J->trace[root]->nextroot) {
+    for (side = J->trace[root]->nextside; side != 0;
+	 side = J->trace[side]->nextside)
+      gc_traverse_trace(g, J->trace[side]);
+    gc_traverse_trace(g, J->trace[root]);
+  }
+#endif
+  /* GC during prototype creation could cause NULL fields. */
+  if (pt->chunkname)
+    gc_mark_str(pt->chunkname);
+  for (i = -(ptrdiff_t)pt->sizekgc; i < 0; i++)  /* Mark collectable consts. */
+    gc_markobj(g, gcref(pt->k.gc[i]));
+  for (i = 0; i < (ptrdiff_t)pt->sizeuvname; i++)  /* Mark upvalue names. */
+    if (pt->uvname[i])
+      gc_mark_str(pt->uvname[i]);
+  for (i = 0; i < (ptrdiff_t)pt->sizevarinfo; i++)  /* Mark names of locals. */
+    if (pt->varinfo[i].name)
+      gc_mark_str(pt->varinfo[i].name);
+}
+
+/* Traverse the frame structure of a stack. Marks the hidden function slot
+** of every frame and returns the maximum stack extent used by any frame
+** (clamped to maxstack), for the caller to clear slots above it. */
+static TValue *gc_traverse_frames(global_State *g, lua_State *th)
+{
+  TValue *frame, *top = th->top-1;
+  /* Note: extra vararg frame not skipped, marks function twice (harmless). */
+  for (frame = th->base-1; frame > th->stack; frame = frame_prev(frame)) {
+    GCfunc *fn = frame_func(frame);
+    TValue *ftop = frame;
+    if (isluafunc(fn)) ftop += funcproto(fn)->framesize;
+    if (ftop > top) top = ftop;
+    gc_markobj(g, frame_gc(frame));  /* Need to mark hidden function (or L). */
+  }
+  top++;  /* Correct bias of -1 (frame == base-1). */
+  if (top > th->maxstack) top = th->maxstack;
+  return top;
+}
+
+/* Traverse a thread object: env, all live stack slots and frames, then
+** clear dead slots up to the max frame extent and shrink the stack. */
+static void gc_traverse_thread(global_State *g, lua_State *th)
+{
+  TValue *o, *lim;
+  gc_markobj(g, tabref(th->env));
+  for (o = th->stack+1; o < th->top; o++)
+    gc_marktv(g, o);
+  lim = gc_traverse_frames(g, th);
+  /* Extra cleanup required to avoid this marking problem:
+  **
+  ** [aa[bb.X|   X created.
+  ** [aa[cc|     GC called from (small) inner frame, X destroyed.
+  ** [aa....X.|  GC called again in (larger) outer frame, X resurrected (ouch).
+  **
+  ** During GC in step 2 the stack must be cleaned up to the max. frame extent:
+  **
+  **       ***|  Slots cleaned
+  **    [cc|      from top of last frame
+  ** [aa......|   to max. frame extent.
+  */
+  for (; o <= lim; o++)
+    setnilV(o);
+  lj_state_shrinkstack(th, (MSize)(lim - th->stack));
+}
+
+/* Propagate one gray object. Traverse it and turn it black.
+** Returns an estimate of the traversed object's size (summed by callers
+** as a measure of GC work done). Threads are re-queued on grayagain. */
+static size_t propagatemark(global_State *g)
+{
+  GCobj *o = gcref(g->gc.gray);
+  lua_assert(isgray(o));
+  gray2black(o);
+  setgcrefr(g->gc.gray, o->gch.gclist);  /* Remove from gray list. */
+  if (LJ_LIKELY(o->gch.gct == ~LJ_TTAB)) {
+    GCtab *t = gco2tab(o);
+    if (gc_traverse_tab(g, t))
+      black2gray(o);  /* Keep weak tables gray. */
+    return sizeof(GCtab) + sizeof(TValue) * t->asize +
+			   sizeof(Node) * (t->hmask + 1);
+  } else if (LJ_LIKELY(o->gch.gct == ~LJ_TFUNC)) {
+    GCfunc *fn = gco2func(o);
+    gc_traverse_func(g, fn);
+    return isluafunc(fn) ? sizeLfunc((MSize)fn->l.nupvalues) :
+			   sizeCfunc((MSize)fn->c.nupvalues);
+  } else if (LJ_LIKELY(o->gch.gct == ~LJ_TPROTO)) {
+    GCproto *pt = gco2pt(o);
+    gc_traverse_proto(g, pt);
+    return sizeof(GCproto) + sizeof(BCIns) * pt->sizebc +
+			     sizeof(GCobj *) * pt->sizekgc +
+			     sizeof(lua_Number) * pt->sizekn +
+			     sizeof(int16_t) * pt->sizeuv +
+			     sizeof(int32_t) * pt->sizelineinfo +
+			     sizeof(VarInfo) * pt->sizevarinfo +
+			     sizeof(GCstr *) * pt->sizeuvname;
+  } else {
+    lua_State *th = gco2th(o);
+    setgcrefr(th->gclist, g->gc.grayagain);  /* Re-traverse in atomic phase. */
+    setgcref(g->gc.grayagain, o);
+    black2gray(o);  /* Threads are never black. */
+    gc_traverse_thread(g, th);
+    return sizeof(lua_State) + sizeof(TValue) * th->stacksize;
+  }
+}
+
+/* Propagate all gray objects; returns the summed traversal size estimate. */
+static size_t gc_propagate_gray(global_State *g)
+{
+  size_t m = 0;
+  while (gcref(g->gc.gray) != NULL)
+    m += propagatemark(g);
+  return m;
+}
+
+/* -- Sweep phase --------------------------------------------------------- */
+
+/* Try to shrink some common data structures when they are underused. */
+static void gc_shrink(global_State *g, lua_State *L)
+{
+  /* Halve the string table when under 25% full (but keep a minimum size). */
+  if (g->strnum <= (g->strmask >> 2) && g->strmask > LJ_MIN_STRTAB*2-1)
+    lj_str_resize(L, g->strmask >> 1);  /* Shrink string table. */
+  if (g->tmpbuf.sz > LJ_MIN_SBUF*2)
+    lj_str_resizebuf(L, &g->tmpbuf, g->tmpbuf.sz >> 1);  /* Shrink temp buf. */
+}
+
+/* Type of GC free functions. */
+typedef void (LJ_FASTCALL *GCFreeFunc)(global_State *g, GCobj *o);
+
+/* GC free functions for LJ_TSTR .. LJ_TUDATA. ORDER LJ_T
+** Indexed by gct - ~LJ_TSTR in gc_sweep; the NULL slot must be unreachable
+** (its type tag never appears on a sweep list). */
+static const GCFreeFunc gc_freefunc[] = {
+  (GCFreeFunc)lj_str_free,
+  (GCFreeFunc)lj_func_freeuv,
+  (GCFreeFunc)lj_state_free,
+  (GCFreeFunc)lj_func_freeproto,
+  (GCFreeFunc)lj_func_free,
+  (GCFreeFunc)0,
+  (GCFreeFunc)lj_tab_free,
+  (GCFreeFunc)lj_udata_free
+};
+
+/* Full sweep of a GC list. */
+#define gc_fullsweep(g, p)	gc_sweep(g, (p), LJ_MAX_MEM)
+
+/* Partial sweep of a GC list: visit at most lim objects, freeing dead ones
+** and flipping live ones to the current white. Returns the resume point. */
+static GCRef *gc_sweep(global_State *g, GCRef *p, uint32_t lim)
+{
+  /* Mask with other white and LJ_GC_FIXED. Or LJ_GC_SFIXED on shutdown. */
+  int ow = otherwhite(g);
+  GCobj *o;
+  while ((o = gcref(*p)) != NULL && lim-- > 0) {
+    if (o->gch.gct == ~LJ_TTHREAD)  /* Need to sweep open upvalues, too. */
+      gc_fullsweep(g, &gco2th(o)->openupval);
+    if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) {  /* Black or current white? */
+      lua_assert(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED));
+      makewhite(g, o);  /* Value is alive, change to the current white. */
+      p = &o->gch.nextgc;
+    } else {  /* Otherwise value is dead, free it. */
+      lua_assert(isdead(g, o) || ow == LJ_GC_SFIXED);
+      setgcrefr(*p, o->gch.nextgc);
+      if (o == gcref(g->gc.root))
+	setgcrefr(g->gc.root, o->gch.nextgc);  /* Adjust list anchor. */
+      gc_freefunc[o->gch.gct - ~LJ_TSTR](g, o);  /* Dispatch by type tag. */
+    }
+  }
+  return p;
+}
+
+/* Check whether we can clear a key or a value slot from a weak table.
+** val != 0 when checking a value slot (finalized userdata only drop there). */
+static int gc_mayclear(cTValue *o, int val)
+{
+  if (tvisgcv(o)) {  /* Only collectable objects can be weak references. */
+    if (tvisstr(o)) {  /* But strings cannot be used as weak references. */
+      gc_mark_str(strV(o));  /* And need to be marked. */
+      return 0;
+    }
+    if (iswhite(gcV(o)))
+      return 1;  /* Object is about to be collected. */
+    if (tvisudata(o) && val && isfinalized(udataV(o)))
+      return 1;  /* Finalized userdata is dropped only from values. */
+  }
+  return 0;  /* Cannot clear. */
+}
+
+/* Clear collected entries from all weak tables on the gclist chain o. */
+static void gc_clearweak(GCobj *o)
+{
+  while (o) {
+    GCtab *t = gco2tab(o);
+    lua_assert((t->marked & LJ_GC_WEAK));
+    if ((t->marked & LJ_GC_WEAKVAL)) {
+      MSize i, asize = t->asize;
+      for (i = 0; i < asize; i++) {
+	/* Clear array slot when value is about to be collected. */
+	TValue *tv = arrayslot(t, i);
+	if (gc_mayclear(tv, 1))
+	  setnilV(tv);
+      }
+    }
+    if (t->hmask > 0) {
+      Node *node = noderef(t->node);
+      MSize i, hmask = t->hmask;
+      for (i = 0; i <= hmask; i++) {
+	Node *n = &node[i];
+	/* Clear hash slot when key or value is about to be collected. */
+	if (!tvisnil(&n->val) && (gc_mayclear(&n->key, 0) ||
+				  gc_mayclear(&n->val, 1))) {
+	  setnilV(&n->val);
+	  if (tvisgcv(&n->key))  /* Leave GC key in, but mark as dead. */
+	    setitype(&n->key, LJ_TDEADKEY);
+	}
+      }
+    }
+    o = gcref(t->gclist);  /* Next weak table in the chain. */
+  }
+}
+
+/* Finalize one userdata object from mmudata list. */
+/* The mmudata list is circular; the object after the anchor is taken,
+** relinked onto the main userdata list, and its __gc metamethod (if any)
+** is invoked via a protected call with hooks, GC steps and tracing
+** temporarily disabled.
+*/
+static void gc_finalize(lua_State *L)
+{
+  global_State *g = G(L);
+  GCobj *o = gcnext(gcref(g->gc.mmudata));
+  GCudata *ud = gco2ud(o);
+  cTValue *mo;
+  /* Unchain from list of userdata to be finalized. */
+  if (o == gcref(g->gc.mmudata))
+    setgcrefnull(g->gc.mmudata);  /* It was the last entry. */
+  else
+    setgcrefr(gcref(g->gc.mmudata)->gch.nextgc, ud->nextgc);
+  /* Add it back to the main userdata list and make it white. */
+  setgcrefr(ud->nextgc, mainthread(g)->nextgc);
+  setgcref(mainthread(g)->nextgc, o);
+  makewhite(g, o);
+  /* Resolve the __gc metamethod. */
+  mo = lj_meta_fastg(g, tabref(ud->metatable), MM_gc);
+  if (mo) {
+    /* Save and restore lots of state around the __gc callback. */
+    uint8_t oldh = hook_save(g);
+    MSize oldt = g->gc.threshold;
+    GCobj *oldjl = gcref(g->jit_L);
+    MSize oldjs = 0;
+    ptrdiff_t oldjb = 0;
+    int errcode;
+    TValue *top;
+    if (oldjl) {  /* Save JIT thread state; the callback may reallocate it. */
+      oldjs = gco2th(oldjl)->stacksize;
+      oldjb = savestack(gco2th(oldjl), mref(g->jit_base, TValue ));
+      setgcrefnull(g->jit_L);
+    }
+    lj_trace_abort(g);
+    top = L->top;
+    L->top = top+2;
+    hook_entergc(g);  /* Disable hooks and new traces during __gc. */
+    g->gc.threshold = LJ_MAX_MEM;  /* Prevent GC steps. */
+    copyTV(L, top, mo);
+    setudataV(L, top+1, ud);
+    errcode = lj_vm_pcall(L, top+1, 1+0, -1);  /* Stack: |mo|ud| -> | */
+    hook_restore(g, oldh);
+    g->gc.threshold = oldt;  /* Restore GC threshold. */
+    if (oldjl) {  /* Restore JIT thread state saved above. */
+      if (gco2th(oldjl)->stacksize < oldjs)
+	lj_state_growstack(gco2th(oldjl), oldjs - gco2th(oldjl)->stacksize);
+      setgcref(g->jit_L, oldjl);
+      setmref(g->jit_base, restorestack(gco2th(oldjl), oldjb));
+    }
+    if (errcode)
+      lj_err_throw(L, errcode);  /* Propagate errors. */
+  }
+}
+
+/* Finalize all userdata objects from mmudata list. */
+/* Loops until the list is empty; each gc_finalize() unchains one entry. */
+void lj_gc_finalizeudata(lua_State *L)
+{
+  while (gcref(G(L)->gc.mmudata) != NULL)
+    gc_finalize(L);
+}
+
+/* Free all remaining GC objects. */
+/* Called on state shutdown. Setting both whites plus LJ_GC_SFIXED in the
+** current white makes every non-super-fixed object look dead to the sweep.
+*/
+void lj_gc_freeall(global_State *g)
+{
+  MSize i, strmask;
+  /* Free everything, except super-fixed objects (the main thread). */
+  g->gc.currentwhite = LJ_GC_WHITES | LJ_GC_SFIXED;
+  gc_fullsweep(g, &g->gc.root);
+  strmask = g->strmask;
+  for (i = 0; i <= strmask; i++)  /* Free all string hash chains. */
+    gc_fullsweep(g, &g->strhash[i]);
+}
+
+/* -- Collector ----------------------------------------------------------- */
+
+/* Atomic part of the GC cycle, transitioning from mark to sweep phase. */
+/* Must run without interruption: it remarks the roots that may have
+** changed during incremental propagation, separates finalizable userdata,
+** clears weak tables and flips the current white.
+*/
+static void atomic(global_State *g, lua_State *L)
+{
+  size_t udsize;
+
+  gc_mark_uv(g);  /* Need to remark open upvalues (the thread may be dead). */
+  gc_propagate_gray(g);  /* Propagate any left-overs. */
+
+  setgcrefr(g->gc.gray, g->gc.weak);  /* Empty the list of weak tables. */
+  setgcrefnull(g->gc.weak);
+  lua_assert(!iswhite(obj2gco(mainthread(g))));
+  gc_markobj(g, L);  /* Mark running thread. */
+  gc_mark_curtrace(g);  /* Mark current trace. */
+  gc_mark_basemt(g);  /* Mark base metatables (again). */
+  gc_propagate_gray(g);  /* Propagate all of the above. */
+
+  setgcrefr(g->gc.gray, g->gc.grayagain);  /* Empty the 2nd chance list. */
+  setgcrefnull(g->gc.grayagain);
+  gc_propagate_gray(g);  /* Propagate it. */
+
+  udsize = lj_gc_separateudata(g, 0);  /* Separate userdata to be finalized. */
+  gc_mark_mmudata(g);  /* Mark them. */
+  udsize += gc_propagate_gray(g);  /* And propagate the marks. */
+
+  /* All marking done, clear weak tables. */
+  gc_clearweak(gcref(g->gc.weak));
+
+  /* Prepare for sweep phase. */
+  g->gc.currentwhite = cast_byte(otherwhite(g));  /* Flip current white. */
+  g->gc.sweepstr = 0;
+  g->gc.sweep = &g->gc.root;
+  g->gc.state = GCSsweepstring;
+  g->gc.estimate = g->gc.total - (MSize)udsize;  /* Initial estimate. */
+}
+
+/* GC state machine. Returns a cost estimate for each step performed. */
+/* State order: GCSpause -> GCSpropagate -> (atomic) -> GCSsweepstring ->
+** GCSsweep -> GCSfinalize -> GCSpause.
+*/
+static size_t gc_onestep(lua_State *L)
+{
+  global_State *g = G(L);
+  switch (g->gc.state) {
+  case GCSpause:
+    gc_mark_start(g);  /* Start a new GC cycle by marking all GC roots. */
+    return 0;
+  case GCSpropagate:
+    if (gcref(g->gc.gray) != NULL)
+      return propagatemark(g);  /* Propagate one gray object. */
+    atomic(g, L);  /* End of mark phase. */
+    return 0;
+  case GCSsweepstring: {
+    MSize old = g->gc.total;
+    gc_fullsweep(g, &g->strhash[g->gc.sweepstr++]);  /* Sweep one chain. */
+    if (g->gc.sweepstr > g->strmask)
+      g->gc.state = GCSsweep;  /* All string hash chains sweeped. */
+    lua_assert(old >= g->gc.total);
+    g->gc.estimate -= old - g->gc.total;  /* Credit the freed memory. */
+    return GCSWEEPCOST;
+    }
+  case GCSsweep: {
+    MSize old = g->gc.total;
+    g->gc.sweep = gc_sweep(g, g->gc.sweep, GCSWEEPMAX);  /* Partial sweep. */
+    if (gcref(*g->gc.sweep) == NULL) {  /* Reached end of the root list? */
+      gc_shrink(g, L);
+      g->gc.state = GCSfinalize;  /* End of sweep phase. */
+    }
+    lua_assert(old >= g->gc.total);
+    g->gc.estimate -= old - g->gc.total;  /* Credit the freed memory. */
+    return GCSWEEPMAX*GCSWEEPCOST;
+    }
+  case GCSfinalize:
+    if (gcref(g->gc.mmudata) != NULL) {
+      gc_finalize(L);  /* Finalize one userdata object. */
+      if (g->gc.estimate > GCFINALIZECOST)
+	g->gc.estimate -= GCFINALIZECOST;
+      return GCFINALIZECOST;
+    }
+    g->gc.state = GCSpause;  /* End of GC cycle. */
+    g->gc.debt = 0;
+    return 0;
+  default:
+    lua_assert(0);
+    return 0;
+  }
+}
+
+/* Perform a limited amount of incremental GC steps. */
+/* Returns 1 when a complete GC cycle finished during this call, else 0.
+** The work limit is scaled by the user-configurable stepmul.
+*/
+int lj_gc_step(lua_State *L)
+{
+  global_State *g = G(L);
+  MSize lim;
+  int32_t ostate = g->vmstate;
+  setvmstate(g, GC);
+  lim = (GCSTEPSIZE/100) * g->gc.stepmul;
+  if (lim == 0)
+    lim = LJ_MAX_MEM;  /* stepmul 0 means: run until the cycle completes. */
+  g->gc.debt += g->gc.total - g->gc.threshold;
+  do {
+    lim -= (MSize)gc_onestep(L);
+    if (g->gc.state == GCSpause) {
+      lua_assert(g->gc.total >= g->gc.estimate);
+      g->gc.threshold = (g->gc.estimate/100) * g->gc.pause;
+      g->vmstate = ostate;
+      return 1;  /* Finished a GC cycle. */
+    }
+  } while ((int32_t)lim > 0);
+  /* Cycle not finished: set the next threshold based on accumulated debt. */
+  if (g->gc.debt < GCSTEPSIZE) {
+    g->gc.threshold = g->gc.total + GCSTEPSIZE;
+  } else {
+    g->gc.debt -= GCSTEPSIZE;
+    g->gc.threshold = g->gc.total;
+  }
+  g->vmstate = ostate;
+  return 0;
+}
+
+/* Ditto, but fix the stack top first. */
+/* Needed when called from contexts where L->top is not up to date. */
+void lj_gc_step_fixtop(lua_State *L)
+{
+  if (curr_funcisL(L)) L->top = curr_topL(L);
+  lj_gc_step(L);
+}
+
+/* Perform multiple GC steps. Called from JIT-compiled code. */
+/* Records the current PC in the C frame so errors unwind correctly,
+** then runs up to 'steps' increments, stopping early if a cycle finishes.
+*/
+void lj_gc_step_jit(lua_State *L, const BCIns *pc, MSize steps)
+{
+  cframe_pc(cframe_raw(L->cframe)) = pc;
+  L->top = curr_topL(L);
+  while (steps-- > 0 && lj_gc_step(L) == 0)
+    ;
+}
+
+/* Perform a full GC cycle. */
+/* If caught mid-propagation, the partial mark is discarded and the sweep
+** phase is entered directly (which keeps everything alive), then a complete
+** mark+sweep cycle is run from scratch.
+*/
+void lj_gc_fullgc(lua_State *L)
+{
+  global_State *g = G(L);
+  int32_t ostate = g->vmstate;
+  setvmstate(g, GC);
+  if (g->gc.state <= GCSpropagate) {  /* Caught somewhere in the middle. */
+    g->gc.sweepstr = 0;
+    g->gc.sweep = &g->gc.root;  /* Sweep everything (preserving it). */
+    setgcrefnull(g->gc.gray);  /* Reset lists from partial propagation. */
+    setgcrefnull(g->gc.grayagain);
+    setgcrefnull(g->gc.weak);
+    g->gc.state = GCSsweepstring;  /* Fast forward to the sweep phase. */
+  }
+  lua_assert(g->gc.state != GCSpause && g->gc.state != GCSpropagate);
+  while (g->gc.state != GCSfinalize) {  /* Finish sweep. */
+    lua_assert(g->gc.state == GCSsweepstring || g->gc.state == GCSsweep);
+    gc_onestep(L);
+  }
+  /* Now perform a full GC. */
+  gc_mark_start(g);
+  while (g->gc.state != GCSpause)
+    gc_onestep(L);
+  g->gc.threshold = (g->gc.estimate/100) * g->gc.pause;
+  g->vmstate = ostate;
+}
+
+/* -- Write barriers ------------------------------------------------------ */
+
+/* Move the GC propagation frontier back for tables (make it gray again). */
+/* The table is re-queued on the grayagain list, to be re-traversed in the
+** atomic phase.
+*/
+void lj_gc_barrierback(global_State *g, GCtab *t)
+{
+  GCobj *o = obj2gco(t);
+  lua_assert(isblack(o) && !isdead(g, o));
+  lua_assert(g->gc.state != GCSfinalize && g->gc.state != GCSpause);
+  black2gray(o);
+  setgcrefr(t->gclist, g->gc.grayagain);
+  setgcref(g->gc.grayagain, o);
+}
+
+/* Move the GC propagation frontier forward. */
+/* o is black, v is white: either mark v now (during propagation) or make o
+** white so the sweep handles it. Tables use lj_gc_barrierback() instead.
+*/
+void lj_gc_barrierf(global_State *g, GCobj *o, GCobj *v)
+{
+  lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
+  lua_assert(g->gc.state != GCSfinalize && g->gc.state != GCSpause);
+  lua_assert(o->gch.gct != ~LJ_TTAB);
+  /* Preserve invariant during propagation. Otherwise it doesn't matter. */
+  if (g->gc.state == GCSpropagate)
+    gc_mark(g, v);  /* Move frontier forward. */
+  else
+    makewhite(g, o);  /* Make it white to avoid the following barrier. */
+}
+
+/* The reason for duplicating this is that it needs to be visible from ASM. */
+/* Same logic as lj_gc_barrierf(), restricted to upvalue objects. */
+void lj_gc_barrieruv(global_State *g, GCobj *o, GCobj *v)
+{
+  lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
+  lua_assert(g->gc.state != GCSfinalize && g->gc.state != GCSpause);
+  lua_assert(o->gch.gct == ~LJ_TUPVAL);
+  /* Preserve invariant during propagation. Otherwise it doesn't matter. */
+  if (g->gc.state == GCSpropagate)
+    gc_mark(g, v);  /* Move frontier forward. */
+  else
+    makewhite(g, o);  /* Make it white to avoid the following barrier. */
+}
+
+/* Close upvalue. Also needs a write barrier. */
+/* Copies the referenced stack slot into the upvalue, marks it closed and
+** links it onto the main GC root list.
+*/
+void lj_gc_closeuv(global_State *g, GCupval *uv)
+{
+  GCobj *o = obj2gco(uv);
+  /* Copy stack slot to upvalue itself and point to the copy. */
+  copyTV(mainthread(g), &uv->tv, uv->v);
+  uv->v = &uv->tv;
+  uv->closed = 1;
+  setgcrefr(o->gch.nextgc, g->gc.root);
+  setgcref(g->gc.root, o);
+  if (isgray(o)) {  /* A closed upvalue is never gray, so fix this. */
+    if (g->gc.state == GCSpropagate) {
+      gray2black(o);  /* Make it black and preserve invariant. */
+      if (tviswhite(uv->v))
+	lj_gc_barrierf(g, o, gcV(uv->v));
+    } else {
+      makewhite(g, o);  /* Make it white, i.e. sweep the upvalue. */
+      lua_assert(g->gc.state != GCSfinalize && g->gc.state != GCSpause);
+    }
+  }
+}
+
+#if LJ_HASJIT
+/* Mark a trace if it's saved during the propagation phase. */
+/* T is passed as void* to avoid exposing the Trace type in the header. */
+void lj_gc_barriertrace(global_State *g, void *T)
+{
+  if (g->gc.state == GCSpropagate)
+    gc_traverse_trace(g, (Trace *)T);
+}
+#endif
+
+/* -- Allocator ----------------------------------------------------------- */
+
+/* Call pluggable memory allocator to allocate or resize a fragment. */
+/* osz == 0 allocates, nsz == 0 frees, otherwise resizes. Throws
+** LUA_ERRMEM on allocation failure and keeps gc.total in sync.
+*/
+void *lj_mem_realloc(lua_State *L, void *p, MSize osz, MSize nsz)
+{
+  global_State *g = G(L);
+  lua_assert((osz == 0) == (p == NULL));
+  p = g->allocf(g->allocd, p, osz, nsz);
+  if (p == NULL && nsz > 0)
+    lj_err_throw(L, LUA_ERRMEM);
+  lua_assert((nsz == 0) == (p == NULL));
+  g->gc.total = (g->gc.total - osz) + nsz;  /* Account for the size change. */
+  return p;
+}
+
+/* Allocate new GC object and link it to the root set. */
+/* Throws LUA_ERRMEM on failure. The new object starts out current-white. */
+void *lj_mem_newgco(lua_State *L, MSize size)
+{
+  global_State *g = G(L);
+  GCobj *o = (GCobj *)g->allocf(g->allocd, NULL, 0, size);
+  if (o == NULL)
+    lj_err_throw(L, LUA_ERRMEM);
+  g->gc.total += size;
+  setgcrefr(o->gch.nextgc, g->gc.root);  /* Prepend to the GC root list. */
+  setgcref(g->gc.root, o);
+  newwhite(g, o);
+  return o;
+}
+
+/* Resize growable vector. */
+/* Doubles *szp (clamped to [LJ_MIN_VECSZ, lim]) and reallocates p to the
+** new element count; esz is the element size. Updates *szp on success.
+*/
+void *lj_mem_grow(lua_State *L, void *p, MSize *szp, MSize lim, MSize esz)
+{
+  MSize sz = (*szp) << 1;
+  if (sz < LJ_MIN_VECSZ)
+    sz = LJ_MIN_VECSZ;
+  if (sz > lim)
+    sz = lim;
+  p = lj_mem_realloc(L, p, (*szp)*esz, sz*esz);
+  *szp = sz;
+  return p;
+}
+

+ 102 - 0
src/lj_gc.h

@@ -0,0 +1,102 @@
+/*
+** Garbage collector.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_GC_H
+#define _LJ_GC_H
+
+#include "lj_obj.h"
+
+/* Garbage collector states. Order matters. */
+enum { GCSpause, GCSpropagate, GCSsweepstring, GCSsweep, GCSfinalize };
+
+/* Bitmasks for marked field of GCobj. */
+#define LJ_GC_WHITE0	0x01
+#define LJ_GC_WHITE1	0x02
+#define LJ_GC_BLACK	0x04
+#define LJ_GC_FINALIZED	0x08
+#define LJ_GC_WEAKKEY	0x08
+#define LJ_GC_WEAKVAL	0x10
+#define LJ_GC_FIXED	0x20
+#define LJ_GC_SFIXED	0x40
+
+#define LJ_GC_WHITES	(LJ_GC_WHITE0 | LJ_GC_WHITE1)
+#define LJ_GC_COLORS	(LJ_GC_WHITES | LJ_GC_BLACK)
+#define LJ_GC_WEAK	(LJ_GC_WEAKKEY | LJ_GC_WEAKVAL)
+
+/* Macros to test and set GCobj colors. */
+#define iswhite(x)	((x)->gch.marked & LJ_GC_WHITES)
+#define isblack(x)	((x)->gch.marked & LJ_GC_BLACK)
+#define isgray(x)	(!((x)->gch.marked & (LJ_GC_BLACK|LJ_GC_WHITES)))
+#define tviswhite(x)	(tvisgcv(x) && iswhite(gcV(x)))
+#define otherwhite(g)	(g->gc.currentwhite ^ LJ_GC_WHITES)
+#define isdead(g, v)	((v)->gch.marked & otherwhite(g) & LJ_GC_WHITES)
+
+#define curwhite(g)	((g)->gc.currentwhite & LJ_GC_WHITES)
+#define newwhite(g, x)	(obj2gco(x)->gch.marked = (uint8_t)curwhite(g))
+#define flipwhite(x)	((x)->gch.marked ^= LJ_GC_WHITES)
+#define fixstring(s)	((s)->marked |= LJ_GC_FIXED)
+
+/* Collector. */
+LJ_FUNC size_t lj_gc_separateudata(global_State *g, int all);
+LJ_FUNC void lj_gc_finalizeudata(lua_State *L);
+LJ_FUNC void lj_gc_freeall(global_State *g);
+LJ_FUNCA int lj_gc_step(lua_State *L);
+LJ_FUNCA void lj_gc_step_fixtop(lua_State *L);
+LJ_FUNCA void lj_gc_step_jit(lua_State *L, const BCIns *pc, MSize steps);
+LJ_FUNC void lj_gc_fullgc(lua_State *L);
+
+/* GC check: drive collector forward if the GC threshold has been reached. */
+#define lj_gc_check(L) \
+  { if (LJ_UNLIKELY(G(L)->gc.total >= G(L)->gc.threshold)) \
+      lj_gc_step(L); }
+#define lj_gc_check_fixtop(L) \
+  { if (LJ_UNLIKELY(G(L)->gc.total >= G(L)->gc.threshold)) \
+      lj_gc_step_fixtop(L); }
+
+/* Write barriers. */
+LJ_FUNC void lj_gc_barrierback(global_State *g, GCtab *t);
+LJ_FUNC void lj_gc_barrierf(global_State *g, GCobj *o, GCobj *v);
+LJ_FUNCA void lj_gc_barrieruv(global_State *g, GCobj *o, GCobj *v);
+LJ_FUNC void lj_gc_closeuv(global_State *g, GCupval *uv);
+LJ_FUNC void lj_gc_barriertrace(global_State *g, void *T);
+
+/* Barrier for stores to table objects. TValue and GCobj variant. */
+#define lj_gc_barriert(L, t, tv) \
+  { if (tviswhite(tv) && isblack(obj2gco(t))) \
+      lj_gc_barrierback(G(L), (t)); }
+#define lj_gc_objbarriert(L, t, o)  \
+  { if (iswhite(obj2gco(o)) && isblack(obj2gco(t))) \
+      lj_gc_barrierback(G(L), (t)); }
+
+/* Barrier for stores to any other object. TValue and GCobj variant. */
+#define lj_gc_barrier(L, p, tv) \
+  { if (tviswhite(tv) && isblack(obj2gco(p))) \
+      lj_gc_barrierf(G(L), obj2gco(p), gcV(tv)); }
+#define lj_gc_objbarrier(L, p, o) \
+  { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \
+      lj_gc_barrierf(G(L), obj2gco(p), obj2gco(o)); }
+
+/* Allocator. */
+LJ_FUNC void *lj_mem_realloc(lua_State *L, void *p, MSize osz, MSize nsz);
+LJ_FUNC void *lj_mem_newgco(lua_State *L, MSize size);
+LJ_FUNC void *lj_mem_grow(lua_State *L, void *p,
+			  MSize *szp, MSize lim, MSize esz);
+
+#define lj_mem_new(L, s)	lj_mem_realloc(L, NULL, 0, (s))
+#define lj_mem_free(g, p, osize) \
+  (g->gc.total -= (MSize)(osize), g->allocf(g->allocd, (p), (osize), 0))
+
+#define lj_mem_newvec(L, n, t)	((t *)lj_mem_new(L, (MSize)((n)*sizeof(t))))
+#define lj_mem_reallocvec(L, p, on, n, t) \
+  ((p) = (t *)lj_mem_realloc(L, p, (on)*sizeof(t), (MSize)((n)*sizeof(t))))
+#define lj_mem_growvec(L, p, n, m, t) \
+  ((p) = (t *)lj_mem_grow(L, (p), &(n), (m), (MSize)sizeof(t)))
+#define lj_mem_freevec(g, p, n, t)	lj_mem_free(g, (p), (n)*sizeof(t))
+
+#define lj_mem_newobj(L, t)	((t *)lj_mem_newgco(L, sizeof(t)))
+#define lj_mem_newt(L, s, t)	((t *)lj_mem_new(L, (s)))
+#define lj_mem_freet(g, p)	lj_mem_free(g, (p), sizeof(*(p)))
+
+#endif

+ 739 - 0
src/lj_gdbjit.c

@@ -0,0 +1,739 @@
+/*
+** Client for the GDB JIT API.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_gdbjit_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_frame.h"
+#include "lj_jit.h"
+#include "lj_dispatch.h"
+
+/* This is not compiled in by default.
+** Enable with -DLUAJIT_USE_GDBJIT in the Makefile and recompile everything.
+*/
+#ifdef LUAJIT_USE_GDBJIT
+
+/* The GDB JIT API allows JIT compilers to pass debug information about
+** JIT-compiled code back to GDB. You need at least GDB 7.0 or higher
+** to see it in action.
+**
+** This is a passive API, so it works even when not running under GDB
+** or when attaching to an already running process. Alas, this implies
+** enabling it always has a non-negligible overhead -- do not use in
+** release mode!
+**
+** The LuaJIT GDB JIT client is rather minimal at the moment. It gives
+** each trace a symbol name and adds a source location and frame unwind
+** information. Obviously LuaJIT itself and any embedding C application
+** should be compiled with debug symbols, too (see the Makefile).
+**
+** Traces are named TRACE_1, TRACE_2, ... these correspond to the trace
+** numbers from -jv or -jdump. Use "break TRACE_1" or "tbreak TRACE_1" etc.
+** to set breakpoints on specific traces (even ahead of their creation).
+**
+** The source location for each trace allows listing the corresponding
+** source lines with the GDB command "list" (but only if the Lua source
+** has been loaded from a file). Currently this is always set to the
+** location where the trace has been started.
+**
+** Frame unwind information can be inspected with the GDB command
+** "info frame". This also allows proper backtraces across JIT-compiled
+** code with the GDB command "bt".
+**
+** You probably want to add the following settings to a .gdbinit file
+** (or add them to ~/.gdbinit):
+**   set disassembly-flavor intel
+**   set breakpoint pending on
+**
+** Here's a sample GDB session:
+** ------------------------------------------------------------------------
+
+$ cat >x.lua
+for outer=1,100 do
+  for inner=1,100 do end
+end
+^D
+
+$ luajit -jv x.lua
+[TRACE   1 x.lua:2]
+[TRACE   2 (1/3) x.lua:1 -> 1]
+
+$ gdb --quiet --args luajit x.lua
+(gdb) tbreak TRACE_1
+Function "TRACE_1" not defined.
+Temporary breakpoint 1 (TRACE_1) pending.
+(gdb) run
+Starting program: luajit x.lua
+
+Temporary breakpoint 1, TRACE_1 () at x.lua:2
+2	  for inner=1,100 do end
+(gdb) list
+1	for outer=1,100 do
+2	  for inner=1,100 do end
+3	end
+(gdb) bt
+#0  TRACE_1 () at x.lua:2
+#1  0x08053690 in lua_pcall [...]
+[...]
+#7  0x0806ff90 in main [...]
+(gdb) disass TRACE_1
+Dump of assembler code for function TRACE_1:
+0xf7fd9fba <TRACE_1+0>:	mov    DWORD PTR ds:0xf7e0e2a0,0x1
+0xf7fd9fc4 <TRACE_1+10>:	movsd  xmm7,QWORD PTR [edx+0x20]
+[...]
+0xf7fd9ff8 <TRACE_1+62>:	jmp    0xf7fd2014
+End of assembler dump.
+(gdb) tbreak TRACE_2
+Function "TRACE_2" not defined.
+Temporary breakpoint 2 (TRACE_2) pending.
+(gdb) cont
+Continuing.
+
+Temporary breakpoint 2, TRACE_2 () at x.lua:1
+1	for outer=1,100 do
+(gdb) info frame
+Stack level 0, frame at 0xffffd7c0:
+ eip = 0xf7fd9f60 in TRACE_2 (x.lua:1); saved eip 0x8053690
+ called by frame at 0xffffd7e0
+ source language unknown.
+ Arglist at 0xffffd78c, args:
+ Locals at 0xffffd78c, Previous frame's sp is 0xffffd7c0
+ Saved registers:
+  ebx at 0xffffd7ac, ebp at 0xffffd7b8, esi at 0xffffd7b0, edi at 0xffffd7b4,
+  eip at 0xffffd7bc
+(gdb)
+
+** ------------------------------------------------------------------------
+*/
+
+/* -- GDB JIT API --------------------------------------------------------- */
+
+/* GDB JIT actions, as defined by the GDB JIT compilation interface. */
+enum {
+  GDBJIT_NOACTION = 0,
+  GDBJIT_REGISTER,
+  GDBJIT_UNREGISTER
+};
+
+/* GDB JIT entry: one node of the doubly-linked list of symbol files. */
+typedef struct GDBJITentry {
+  struct GDBJITentry *next_entry;
+  struct GDBJITentry *prev_entry;
+  const char *symfile_addr;	/* In-memory ELF object for this entry. */
+  uint64_t symfile_size;
+} GDBJITentry;
+
+/* GDB JIT descriptor. */
+typedef struct GDBJITdesc {
+  uint32_t version;
+  uint32_t action_flag;
+  GDBJITentry *relevant_entry;
+  GDBJITentry *first_entry;
+} GDBJITdesc;
+
+/* Global descriptor inspected by GDB. Interface version 1, initially empty. */
+GDBJITdesc __jit_debug_descriptor = {
+  1, GDBJIT_NOACTION, NULL, NULL
+};
+
+/* GDB sets a breakpoint at this function. */
+void LJ_NOINLINE __jit_debug_register_code(void)
+{
+  __asm__ __volatile__("");  /* Prevent the call from being optimized away. */
+}
+
+/* -- In-memory ELF object definitions ------------------------------------ */
+
+/* ELF definitions. */
+/* Minimal, hand-rolled subset of the ELF file format structures.
+** uintptr_t fields make the same definitions serve ELF32 and ELF64.
+*/
+typedef struct ELFheader {
+  uint8_t emagic[4];
+  uint8_t eclass;
+  uint8_t eendian;
+  uint8_t eversion;
+  uint8_t eosabi;
+  uint8_t eabiversion;
+  uint8_t epad[7];
+  uint16_t type;
+  uint16_t machine;
+  uint32_t version;
+  uintptr_t entry;
+  uintptr_t phofs;
+  uintptr_t shofs;
+  uint32_t flags;
+  uint16_t ehsize;
+  uint16_t phentsize;
+  uint16_t phnum;
+  uint16_t shentsize;
+  uint16_t shnum;
+  uint16_t shstridx;
+} ELFheader;
+
+typedef struct ELFsectheader {
+  uint32_t name;
+  uint32_t type;
+  uintptr_t flags;
+  uintptr_t addr;
+  uintptr_t ofs;
+  uintptr_t size;
+  uint32_t link;
+  uint32_t info;
+  uintptr_t align;
+  uintptr_t entsize;
+} ELFsectheader;
+
+#define ELFSECT_IDX_ABS		0xfff1
+
+enum {
+  ELFSECT_TYPE_PROGBITS = 1,
+  ELFSECT_TYPE_SYMTAB = 2,
+  ELFSECT_TYPE_STRTAB = 3,
+  ELFSECT_TYPE_NOBITS = 8
+};
+
+#define ELFSECT_FLAGS_WRITE	1
+#define ELFSECT_FLAGS_ALLOC	2
+#define ELFSECT_FLAGS_EXEC	4
+
+/* Symbol table entry; field order differs between ELF32 and ELF64. */
+typedef struct ELFsymbol {
+#if LJ_64
+  uint32_t name;
+  uint8_t info;
+  uint8_t other;
+  uint16_t sectidx;
+  uintptr_t value;
+  uint64_t size;
+#else
+  uint32_t name;
+  uintptr_t value;
+  uint32_t size;
+  uint8_t info;
+  uint8_t other;
+  uint16_t sectidx;
+#endif
+} ELFsymbol;
+
+enum {
+  ELFSYM_TYPE_FUNC = 2,
+  ELFSYM_TYPE_FILE = 4,
+  ELFSYM_BIND_LOCAL = 0 << 4,
+  ELFSYM_BIND_GLOBAL = 1 << 4,
+};
+
+/* DWARF definitions. */
+/* Minimal subset of DWARF constants needed for .eh_frame, .debug_info,
+** .debug_abbrev and .debug_line emission below.
+*/
+#define DW_CIE_VERSION	1
+
+enum {
+  DW_CFA_nop = 0x0,
+  DW_CFA_def_cfa = 0xc,
+  DW_CFA_def_cfa_offset = 0xe,
+  DW_CFA_advance_loc = 0x40,
+  DW_CFA_offset = 0x80
+};
+
+enum {
+  DW_EH_PE_udata4 = 3,
+  DW_EH_PE_textrel = 0x20
+};
+
+enum {
+  DW_TAG_compile_unit = 0x11
+};
+
+enum {
+  DW_children_no = 0,
+  DW_children_yes = 1
+};
+
+enum {
+  DW_AT_name = 0x03,
+  DW_AT_stmt_list = 0x10,
+  DW_AT_low_pc = 0x11,
+  DW_AT_high_pc = 0x12
+};
+
+enum {
+  DW_FORM_addr = 0x01,
+  DW_FORM_data4 = 0x06,
+  DW_FORM_string = 0x08
+};
+
+enum {
+  DW_LNS_extended_op = 0,
+  DW_LNS_copy = 1,
+  DW_LNS_advance_pc = 2,
+  DW_LNS_advance_line = 3
+};
+
+enum {
+  DW_LNE_end_sequence = 1,
+  DW_LNE_set_address = 2
+};
+
+/* DWARF register numbering for the target architecture. */
+enum {
+#if LJ_TARGET_X86
+  DW_REG_AX, DW_REG_CX, DW_REG_DX, DW_REG_BX,
+  DW_REG_SP, DW_REG_BP, DW_REG_SI, DW_REG_DI,
+  DW_REG_RA,
+#elif LJ_TARGET_X64
+  /* Yes, the order is strange, but correct. */
+  DW_REG_AX, DW_REG_DX, DW_REG_CX, DW_REG_BX,
+  DW_REG_SI, DW_REG_DI, DW_REG_BP, DW_REG_SP,
+  DW_REG_8, DW_REG_9, DW_REG_10, DW_REG_11,
+  DW_REG_12, DW_REG_13, DW_REG_14, DW_REG_15,
+  DW_REG_RA,
+#else
+#error "Unsupported target architecture"
+#endif
+};
+
+/* Minimal list of sections for the in-memory ELF object. */
+enum {
+  GDBJIT_SECT_NULL,
+  GDBJIT_SECT_text,
+  GDBJIT_SECT_eh_frame,
+  GDBJIT_SECT_shstrtab,
+  GDBJIT_SECT_strtab,
+  GDBJIT_SECT_symtab,
+  GDBJIT_SECT_debug_info,
+  GDBJIT_SECT_debug_abbrev,
+  GDBJIT_SECT_debug_line,
+  GDBJIT_SECT__MAX
+};
+
+/* Symbol table layout: one file symbol plus one function symbol per trace. */
+enum {
+  GDBJIT_SYM_UNDEF,
+  GDBJIT_SYM_FILE,
+  GDBJIT_SYM_FUNC,
+  GDBJIT_SYM__MAX
+};
+
+/* In-memory ELF object. */
+typedef struct GDBJITobj {
+  ELFheader hdr;			/* ELF header. */
+  ELFsectheader sect[GDBJIT_SECT__MAX];	/* ELF sections. */
+  ELFsymbol sym[GDBJIT_SYM__MAX];	/* ELF symbol table. */
+  uint8_t space[4096];			/* Space for various section data. */
+} GDBJITobj;
+
+/* Combined structure for GDB JIT entry and ELF object. */
+typedef struct GDBJITentryobj {
+  GDBJITentry entry;
+  size_t sz;
+  GDBJITobj obj;
+} GDBJITentryobj;
+
+/* Template for in-memory ELF header. */
+/* Uses C99 designated initializers; fields not listed default to zero. */
+static const ELFheader elfhdr_template = {
+  .emagic = { 0x7f, 'E', 'L', 'F' },
+  .eclass = LJ_64 ? 2 : 1,
+  .eendian = LJ_ENDIAN_SELECT(1, 2),
+  .eversion = 1,
+#if defined(__linux__)
+  .eosabi = 0,  /* Nope, it's not 3. */
+#elif defined(__FreeBSD__)
+  .eosabi = 9,
+#elif defined(__NetBSD__)
+  .eosabi = 2,
+#elif defined(__OpenBSD__)
+  .eosabi = 12,
+#elif defined(__solaris__)
+  .eosabi = 6,
+#else
+  .eosabi = 0,
+#endif
+  .eabiversion = 0,
+  .epad = { 0, 0, 0, 0, 0, 0, 0 },
+  .type = 1,  /* Relocatable object. */
+#if LJ_TARGET_X86
+  .machine = 3,
+#elif LJ_TARGET_X64
+  .machine = 62,
+#else
+#error "Unsupported target architecture"
+#endif
+  .version = 1,
+  .entry = 0,
+  .phofs = 0,
+  .shofs = offsetof(GDBJITobj, sect),
+  .flags = 0,
+  .ehsize = sizeof(ELFheader),
+  .phentsize = 0,
+  .phnum = 0,
+  .shentsize = sizeof(ELFsectheader),
+  .shnum = GDBJIT_SECT__MAX,
+  .shstridx = GDBJIT_SECT_shstrtab
+};
+
+/* -- In-memory ELF object generation ------------------------------------- */
+
+/* Context for generating the ELF object for the GDB JIT API. */
+/* All section payloads are appended sequentially into obj.space via p. */
+typedef struct GDBJITctx {
+  uint8_t *p;		/* Pointer to next address in obj.space. */
+  uint8_t *startp;	/* Pointer to start address in obj.space. */
+  Trace *T;		/* Generate symbols for this trace. */
+  uintptr_t mcaddr;	/* Machine code address. */
+  MSize szmcode;	/* Size of machine code. */
+  MSize spadjp;		/* Stack adjustment for parent trace or interpreter. */
+  MSize spadj;		/* Stack adjustment for trace itself. */
+  BCLine lineno;	/* Starting line number. */
+  const char *filename;	/* Starting file name. */
+  const char *trname;	/* Name of trace. */
+  size_t objsize;	/* Final size of ELF object. */
+  GDBJITobj obj;	/* In-memory ELF object. */
+} GDBJITctx;
+
+/* Add a zero-terminated string. */
+/* Copies str (including its terminator) at ctx->p and returns the offset
+** of the copy relative to ctx->startp, i.e. a string-table offset.
+*/
+static uint32_t gdbjit_strz(GDBJITctx *ctx, const char *str)
+{
+  uint8_t *p = ctx->p;
+  uint32_t ofs = (uint32_t)(p - ctx->startp);
+  do {
+    *p++ = (uint8_t)*str;
+  } while (*str++);
+  ctx->p = p;
+  return ofs;
+}
+
+/* Add a ULEB128 value. */
+/* 7 bits per byte, LSB first; high bit set on all but the last byte. */
+static void gdbjit_uleb128(GDBJITctx *ctx, uint32_t v)
+{
+  uint8_t *p = ctx->p;
+  for (; v >= 0x80; v >>= 7)
+    *p++ = (uint8_t)((v & 0x7f) | 0x80);
+  *p++ = (uint8_t)v;
+  ctx->p = p;
+}
+
+/* Add a SLEB128 value. */
+/* Signed variant: emits bytes until the remaining value sign-extends. */
+static void gdbjit_sleb128(GDBJITctx *ctx, int32_t v)
+{
+  uint8_t *p = ctx->p;
+  for (; (uint32_t)(v+0x40) >= 0x80; v >>= 7)
+    *p++ = (uint8_t)((v & 0x7f) | 0x80);
+  *p++ = (uint8_t)(v & 0x7f);
+  ctx->p = p;
+}
+
+/* Shortcuts to generate DWARF structures. */
+/* These operate on a local 'p' cursor; DUV/DSV/DSTR sync it with ctx->p.
+** DSECT emits a 32-bit length prefix that is back-patched at the end.
+*/
+#define DB(x)		(*p++ = (x))
+#define DI8(x)		(*(int8_t *)p = (x), p++)
+#define DU16(x)		(*(uint16_t *)p = (x), p += 2)
+#define DU32(x)		(*(uint32_t *)p = (x), p += 4)
+#define DADDR(x)	(*(uintptr_t *)p = (x), p += sizeof(uintptr_t))
+#define DUV(x)		(ctx->p = p, gdbjit_uleb128(ctx, (x)), p = ctx->p)
+#define DSV(x)		(ctx->p = p, gdbjit_sleb128(ctx, (x)), p = ctx->p)
+#define DSTR(str)	(ctx->p = p, gdbjit_strz(ctx, (str)), p = ctx->p)
+#define DALIGNNOP(s)	while ((uintptr_t)p & ((s)-1)) *p++ = DW_CFA_nop
+#define DSECT(name, stmt) \
+  { uint32_t *szp_##name = (uint32_t *)p; p += 4; stmt \
+    *szp_##name = (uint32_t)((p-(uint8_t *)szp_##name)-4); } \
+
+/* Initialize ELF section headers. */
+/* Also writes the section name string table (.shstrtab) as it goes,
+** since each section's 'name' field is an offset into it.
+*/
+static void LJ_FASTCALL gdbjit_secthdr(GDBJITctx *ctx)
+{
+  ELFsectheader *sect;
+
+  *ctx->p++ = '\0';  /* Empty string at start of string table. */
+
+#define SECTDEF(id, tp, al) \
+  sect = &ctx->obj.sect[GDBJIT_SECT_##id]; \
+  sect->name = gdbjit_strz(ctx, "." #id); \
+  sect->type = ELFSECT_TYPE_##tp; \
+  sect->align = (al)
+
+  SECTDEF(text, NOBITS, 16);  /* NOBITS: the code lives at mcaddr, not here. */
+  sect->flags = ELFSECT_FLAGS_ALLOC|ELFSECT_FLAGS_EXEC;
+  sect->addr = ctx->mcaddr;
+  sect->ofs = 0;
+  sect->size = ctx->szmcode;
+
+  SECTDEF(eh_frame, PROGBITS, sizeof(uintptr_t));
+  sect->flags = ELFSECT_FLAGS_ALLOC;
+
+  SECTDEF(shstrtab, STRTAB, 1);
+  SECTDEF(strtab, STRTAB, 1);
+
+  SECTDEF(symtab, SYMTAB, sizeof(uintptr_t));
+  sect->ofs = offsetof(GDBJITobj, sym);
+  sect->size = sizeof(ctx->obj.sym);
+  sect->link = GDBJIT_SECT_strtab;
+  sect->entsize = sizeof(ELFsymbol);
+  sect->info = GDBJIT_SYM_FUNC;
+
+  SECTDEF(debug_info, PROGBITS, 1);
+  SECTDEF(debug_abbrev, PROGBITS, 1);
+  SECTDEF(debug_line, PROGBITS, 1);
+
+#undef SECTDEF
+}
+
+/* Initialize symbol table. */
+/* Emits a file symbol ("JIT mcode") and a global function symbol named
+** after the trace, pointing at the .text section.
+*/
+static void LJ_FASTCALL gdbjit_symtab(GDBJITctx *ctx)
+{
+  ELFsymbol *sym;
+
+  *ctx->p++ = '\0';  /* Empty string at start of string table. */
+
+  sym = &ctx->obj.sym[GDBJIT_SYM_FILE];
+  sym->name = gdbjit_strz(ctx, "JIT mcode");
+  sym->sectidx = ELFSECT_IDX_ABS;
+  sym->info = ELFSYM_TYPE_FILE|ELFSYM_BIND_LOCAL;
+
+  sym = &ctx->obj.sym[GDBJIT_SYM_FUNC];
+  sym->name = gdbjit_strz(ctx, ctx->trname);
+  sym->sectidx = GDBJIT_SECT_text;
+  sym->value = 0;  /* Relative to .text, which is based at mcaddr. */
+  sym->size = ctx->szmcode;
+  sym->info = ELFSYM_TYPE_FUNC|ELFSYM_BIND_GLOBAL;
+}
+
+/* Initialize .eh_frame section. */
+/* Emits one CIE describing the common frame layout, then one FDE covering
+** the trace's machine code, so GDB can unwind through JIT frames.
+*/
+static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
+{
+  uint8_t *p = ctx->p;
+  uint8_t *framep = p;
+
+  /* Emit DWARF EH CIE. */
+  DSECT(CIE,
+    DU32(0);			/* Offset to CIE itself. */
+    DB(DW_CIE_VERSION);
+    DSTR("zR");			/* Augmentation. */
+    DUV(1);			/* Code alignment factor. */
+    DSV(-(int32_t)sizeof(uintptr_t));  /* Data alignment factor. */
+    DB(DW_REG_RA);		/* Return address register. */
+    DB(1); DB(DW_EH_PE_textrel|DW_EH_PE_udata4);  /* Augmentation data. */
+    DB(DW_CFA_def_cfa); DUV(DW_REG_SP); DUV(sizeof(uintptr_t));
+    DB(DW_CFA_offset|DW_REG_RA); DUV(1);
+    DALIGNNOP(sizeof(uintptr_t));
+  )
+
+  /* Emit DWARF EH FDE. */
+  DSECT(FDE,
+    DU32((uint32_t)(p-framep));	/* Offset to CIE. */
+    DU32(0);			/* Machine code offset relative to .text. */
+    DU32(ctx->szmcode);		/* Machine code length. */
+    DB(0);			/* Augmentation data. */
+    /* Registers saved in CFRAME. */
+#if LJ_TARGET_X86
+    DB(DW_CFA_offset|DW_REG_BP); DUV(2);
+    DB(DW_CFA_offset|DW_REG_DI); DUV(3);
+    DB(DW_CFA_offset|DW_REG_SI); DUV(4);
+    DB(DW_CFA_offset|DW_REG_BX); DUV(5);
+#elif LJ_TARGET_X64
+    /* Add saved registers for x64 CFRAME. */
+#else
+#error "Unsupported target architecture"
+#endif
+    if (ctx->spadjp != ctx->spadj) {  /* Parent/interpreter stack frame size. */
+      DB(DW_CFA_def_cfa_offset); DUV(ctx->spadjp);
+      DB(DW_CFA_advance_loc|1);  /* Only an approximation. */
+    }
+    DB(DW_CFA_def_cfa_offset); DUV(ctx->spadj);  /* Trace stack frame size. */
+    DALIGNNOP(sizeof(uintptr_t));
+  )
+
+  ctx->p = p;
+}
+
+/* Initialize .debug_info section. */
+/* Emits a single DW_TAG_compile_unit DIE covering the trace's code range,
+** matching abbreviation #1 from .debug_abbrev.
+*/
+static void LJ_FASTCALL gdbjit_debuginfo(GDBJITctx *ctx)
+{
+  uint8_t *p = ctx->p;
+
+  DSECT(info,
+    DU16(2);			/* DWARF version. */
+    DU32(0);			/* Abbrev offset. */
+    DB(sizeof(uintptr_t));	/* Pointer size. */
+
+    DUV(1);			/* Abbrev #1: DW_TAG_compile_unit. */
+    DSTR(ctx->filename);	/* DW_AT_name. */
+    DADDR(ctx->mcaddr);		/* DW_AT_low_pc. */
+    DADDR(ctx->mcaddr + ctx->szmcode);  /* DW_AT_high_pc. */
+    DU32(0);			/* DW_AT_stmt_list. */
+  )
+
+  ctx->p = p;
+}
+
+/* Initialize .debug_abbrev section. */
+/* Declares attribute/form pairs for abbreviation #1 used by .debug_info. */
+static void LJ_FASTCALL gdbjit_debugabbrev(GDBJITctx *ctx)
+{
+  uint8_t *p = ctx->p;
+
+  /* Abbrev #1: DW_TAG_compile_unit. */
+  DUV(1); DUV(DW_TAG_compile_unit);
+  DB(DW_children_no);
+  DUV(DW_AT_name);	DUV(DW_FORM_string);
+  DUV(DW_AT_low_pc);	DUV(DW_FORM_addr);
+  DUV(DW_AT_high_pc);	DUV(DW_FORM_addr);
+  DUV(DW_AT_stmt_list);	DUV(DW_FORM_data4);
+  DB(0); DB(0);  /* End of abbreviations. */
+
+  ctx->p = p;
+}
+
+/* Emit a DWARF extended line-number opcode with payload size s. */
+#define DLNE(op, s)	(DB(DW_LNS_extended_op), DUV(1+(s)), DB((op)))
+
+/* Initialize .debug_line section. */
+/* Emits a minimal line-number program mapping the whole trace to its
+** starting source line, so GDB's "list" works on the trace.
+*/
+static void LJ_FASTCALL gdbjit_debugline(GDBJITctx *ctx)
+{
+  uint8_t *p = ctx->p;
+
+  DSECT(line,
+    DU16(2);			/* DWARF version. */
+    DSECT(header,
+      DB(1);			/* Minimum instruction length. */
+      DB(1);			/* is_stmt. */
+      DI8(0);			/* Line base for special opcodes. */
+      DB(2);			/* Line range for special opcodes. */
+      DB(3+1);			/* Opcode base at DW_LNS_advance_line+1. */
+      DB(0); DB(1); DB(1);	/* Standard opcode lengths. */
+      /* Directory table. */
+      DB(0);
+      /* File name table. */
+      DSTR(ctx->filename); DUV(0); DUV(0); DUV(0);
+      DB(0);
+    )
+
+    DLNE(DW_LNE_set_address, sizeof(uintptr_t)); DADDR(ctx->mcaddr);
+    if (ctx->lineno) {
+      DB(DW_LNS_advance_line); DSV(ctx->lineno-1);
+    }
+    DB(DW_LNS_copy);
+    DB(DW_LNS_advance_pc); DUV(ctx->szmcode);
+    DLNE(DW_LNE_end_sequence, 0);
+  )
+
+  ctx->p = p;
+}
+
+#undef DLNE
+
+/* Undef shortcuts. */
+#undef DB
+#undef DI8
+#undef DU16
+#undef DU32
+#undef DADDR
+#undef DUV
+#undef DSV
+#undef DSTR
+#undef DALIGNNOP
+#undef DSECT
+
+/* Type of a section initializer callback. */
+typedef void (LJ_FASTCALL *GDBJITinitf)(GDBJITctx *ctx);
+
+/* Call section initializer and set the section offset and size. */
+static void gdbjit_initsect(GDBJITctx *ctx, int sect, GDBJITinitf initf)
+{
+  ctx->startp = ctx->p;
+  /* Section offset is relative to the start of the in-memory ELF object. */
+  ctx->obj.sect[sect].ofs = (uintptr_t)((char *)ctx->p - (char *)&ctx->obj);
+  initf(ctx);
+  /* Size is however far the initializer advanced the write cursor. */
+  ctx->obj.sect[sect].size = (uintptr_t)(ctx->p - ctx->startp);
+}
+
+/* Round pointer p up to an a-byte boundary (a must be a power of 2). */
+#define SECTALIGN(p, a) \
+  ((p) = (uint8_t *)(((uintptr_t)(p) + ((a)-1)) & ~(uintptr_t)((a)-1)))
+
+/* Build in-memory ELF object. */
+static void gdbjit_buildobj(GDBJITctx *ctx)
+{
+  GDBJITobj *obj = &ctx->obj;
+  /* Fill in ELF header and clear structures. */
+  memcpy(&obj->hdr, &elfhdr_template, sizeof(ELFheader));
+  memset(&obj->sect, 0, sizeof(ELFsectheader)*GDBJIT_SECT__MAX);
+  memset(&obj->sym, 0, sizeof(ELFsymbol)*GDBJIT_SYM__MAX);
+  /* Initialize sections. */
+  ctx->p = obj->space;
+  gdbjit_initsect(ctx, GDBJIT_SECT_shstrtab, gdbjit_secthdr);
+  gdbjit_initsect(ctx, GDBJIT_SECT_strtab, gdbjit_symtab);
+  gdbjit_initsect(ctx, GDBJIT_SECT_debug_info, gdbjit_debuginfo);
+  gdbjit_initsect(ctx, GDBJIT_SECT_debug_abbrev, gdbjit_debugabbrev);
+  gdbjit_initsect(ctx, GDBJIT_SECT_debug_line, gdbjit_debugline);
+  SECTALIGN(ctx->p, sizeof(uintptr_t));  /* .eh_frame needs pointer alignment. */
+  gdbjit_initsect(ctx, GDBJIT_SECT_eh_frame, gdbjit_ehframe);
+  ctx->objsize = (size_t)((char *)ctx->p - (char *)obj);
+  lua_assert(ctx->objsize < sizeof(GDBJITobj));  /* Fixed buffer must suffice. */
+}
+
+#undef SECTALIGN
+
+/* -- Interface to GDB JIT API -------------------------------------------- */
+
+/* Add new entry to GDB JIT symbol chain. */
+/* Copies the finished ELF object into heap memory, links it at the head
+** of the global __jit_debug_descriptor list and notifies the debugger
+** via the __jit_debug_register_code() hook.
+*/
+static void gdbjit_newentry(lua_State *L, GDBJITctx *ctx)
+{
+  /* Allocate memory for GDB JIT entry and ELF object. */
+  MSize sz = (MSize)(sizeof(GDBJITentryobj) - sizeof(GDBJITobj) + ctx->objsize);
+  GDBJITentryobj *eo = lj_mem_newt(L, sz, GDBJITentryobj);
+  memcpy(&eo->obj, &ctx->obj, ctx->objsize);  /* Copy ELF object. */
+  eo->sz = sz;
+  ctx->T->gdbjit_entry = (void *)eo;  /* Remembered for lj_gdbjit_deltrace. */
+  /* Link new entry to chain and register it. */
+  eo->entry.prev_entry = NULL;
+  eo->entry.next_entry = __jit_debug_descriptor.first_entry;
+  if (eo->entry.next_entry)
+    eo->entry.next_entry->prev_entry = &eo->entry;
+  eo->entry.symfile_addr = (const char *)&eo->obj;
+  eo->entry.symfile_size = ctx->objsize;
+  __jit_debug_descriptor.first_entry = &eo->entry;
+  __jit_debug_descriptor.relevant_entry = &eo->entry;
+  __jit_debug_descriptor.action_flag = GDBJIT_REGISTER;
+  __jit_debug_register_code();
+}
+
+/* Add debug info for newly compiled trace and notify GDB. */
+void lj_gdbjit_addtrace(jit_State *J, Trace *T, TraceNo traceno)
+{
+  GDBJITctx ctx;
+  lua_State *L = J->L;
+  GCproto *pt = &gcref(T->startpt)->pt;
+  TraceNo parent = T->ir[REF_BASE].op1;  /* Parent trace number (0 = root). */
+  uintptr_t pcofs = (uintptr_t)(T->snap[0].mapofs+T->snap[0].nslots);
+  const BCIns *startpc = (const BCIns *)(uintptr_t)T->snapmap[pcofs];
+  ctx.T = T;
+  ctx.mcaddr = (uintptr_t)T->mcode;
+  ctx.szmcode = T->szmcode;
+  /* Stack adjustment of the parent (or interpreter) vs. this trace. */
+  ctx.spadjp = CFRAME_SIZE + (MSize)(parent ? J->trace[parent]->spadjust : 0);
+  ctx.spadj = CFRAME_SIZE + T->spadjust;
+  ctx.lineno = pt->lineinfo ? pt->lineinfo[startpc - pt->bc] : 0;
+  ctx.filename = strdata(pt->chunkname);
+  if (*ctx.filename == '@' || *ctx.filename == '=')
+    ctx.filename++;  /* Strip Lua chunk name marker to get a plain filename. */
+  else
+    ctx.filename = "(string)";  /* Chunk was compiled from a string. */
+  ctx.trname = lj_str_pushf(L, "TRACE_%d", traceno);
+  L->top--;  /* Pop the temporary trace name string again. */
+  gdbjit_buildobj(&ctx);
+  gdbjit_newentry(L, &ctx);
+}
+
+/* Delete debug info for trace and notify GDB. */
+void lj_gdbjit_deltrace(jit_State *J, Trace *T)
+{
+  GDBJITentryobj *eo = (GDBJITentryobj *)T->gdbjit_entry;
+  if (eo) {  /* No-op if no debug info was registered for this trace. */
+    /* Unlink entry from the doubly-linked descriptor chain. */
+    if (eo->entry.prev_entry)
+      eo->entry.prev_entry->next_entry = eo->entry.next_entry;
+    else
+      __jit_debug_descriptor.first_entry = eo->entry.next_entry;
+    if (eo->entry.next_entry)
+      eo->entry.next_entry->prev_entry = eo->entry.prev_entry;
+    __jit_debug_descriptor.relevant_entry = &eo->entry;
+    __jit_debug_descriptor.action_flag = GDBJIT_UNREGISTER;
+    __jit_debug_register_code();  /* Notify the debugger before freeing. */
+    lj_mem_free(J2G(J), eo, eo->sz);
+  }
+}
+
+#endif
+#endif

+ 22 - 0
src/lj_gdbjit.h

@@ -0,0 +1,22 @@
+/*
+** Client for the GDB JIT API.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_GDBJIT_H
+#define _LJ_GDBJIT_H
+
+#include "lj_obj.h"
+#include "lj_jit.h"
+
+#if LJ_HASJIT && defined(LUAJIT_USE_GDBJIT)
+
+LJ_FUNC void lj_gdbjit_addtrace(jit_State *J, Trace *T, TraceNo traceno);
+LJ_FUNC void lj_gdbjit_deltrace(jit_State *J, Trace *T);
+
+#else
+#define lj_gdbjit_addtrace(J, T, tn)	UNUSED(T)
+#define lj_gdbjit_deltrace(J, T)	UNUSED(T)
+#endif
+
+#endif

+ 461 - 0
src/lj_ir.c

@@ -0,0 +1,461 @@
+/*
+** SSA IR (Intermediate Representation) emitter.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_ir_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_str.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)			(&J->cur.ir[(ref)])
+#define fins			(&J->fold.ins)
+
+/* Pass IR on to next optimization in chain (FOLD). */
+#define emitir(ot, a, b)        (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
+
+/* -- IR tables ----------------------------------------------------------- */
+
+/* IR instruction modes. */
+LJ_DATADEF const uint8_t lj_ir_mode[IR__MAX+1] = {
+IRDEF(IRMODE)
+  0
+};
+
+/* -- IR emitter ---------------------------------------------------------- */
+
+/* Grow IR buffer at the top. */
+/* Doubles the buffer, or allocates the initial LJ_MIN_IRSZ-sized one.
+** J->irbuf is kept rebased so it can be indexed directly by IRRef.
+*/
+void LJ_FASTCALL lj_ir_growtop(jit_State *J)
+{
+  IRIns *baseir = J->irbuf + J->irbotlim;
+  MSize szins = J->irtoplim - J->irbotlim;
+  if (szins) {
+    baseir = (IRIns *)lj_mem_realloc(J->L, baseir, szins*sizeof(IRIns),
+				     2*szins*sizeof(IRIns));
+    J->irtoplim = J->irbotlim + 2*szins;
+  } else {
+    baseir = (IRIns *)lj_mem_realloc(J->L, NULL, 0, LJ_MIN_IRSZ*sizeof(IRIns));
+    J->irbotlim = REF_BASE - LJ_MIN_IRSZ/4;  /* 1/4 of the space for constants. */
+    J->irtoplim = J->irbotlim + LJ_MIN_IRSZ;
+  }
+  J->cur.ir = J->irbuf = baseir - J->irbotlim;  /* Rebase the buffer pointer. */
+}
+
+/* Grow IR buffer at the bottom or shift it up. */
+/* Constants grow downwards from REF_BIAS; when they hit the buffer bottom,
+** either shift the whole IR up inside the existing buffer, or reallocate
+** and split the doubled space between bottom and top.
+*/
+static void lj_ir_growbot(jit_State *J)
+{
+  IRIns *baseir = J->irbuf + J->irbotlim;
+  MSize szins = J->irtoplim - J->irbotlim;
+  lua_assert(szins != 0);
+  lua_assert(J->cur.nk == J->irbotlim);  /* Only called when bottom is full. */
+  if (J->cur.nins + (szins >> 1) < J->irtoplim) {
+    /* More than half of the buffer is free on top: shift up by a quarter. */
+    MSize ofs = szins >> 2;
+    memmove(baseir + ofs, baseir, (J->cur.nins - J->irbotlim)*sizeof(IRIns));
+    J->irbotlim -= ofs;
+    J->irtoplim -= ofs;
+    J->cur.ir = J->irbuf = baseir - J->irbotlim;
+  } else {
+    /* Double the buffer size, but split the growth amongst top/bottom. */
+    IRIns *newbase = lj_mem_newt(J->L, 2*szins*sizeof(IRIns), IRIns);
+    MSize ofs = szins >= 256 ? 128 : (szins >> 1);  /* Limit bottom growth. */
+    memcpy(newbase + ofs, baseir, (J->cur.nins - J->irbotlim)*sizeof(IRIns));
+    lj_mem_free(G(J->L), baseir, szins*sizeof(IRIns));
+    J->irbotlim -= ofs;
+    J->irtoplim = J->irbotlim + 2*szins;
+    J->cur.ir = J->irbuf = newbase - J->irbotlim;  /* Rebase the pointer. */
+  }
+}
+
+/* Emit IR without any optimizations. */
+/* Takes the staged instruction from J->fold.ins (see fins macro) and
+** appends it to the IR, linking it into the per-opcode chain.
+*/
+TRef LJ_FASTCALL lj_ir_emit(jit_State *J)
+{
+  IRRef ref = lj_ir_nextins(J);  /* May reallocate the IR buffer. */
+  IRIns *ir = IR(ref);
+  IROp op = fins->o;
+  ir->prev = J->chain[op];	/* Chain to previous ins with same opcode. */
+  J->chain[op] = (IRRef1)ref;
+  ir->o = op;
+  ir->op1 = fins->op1;
+  ir->op2 = fins->op2;
+  J->guardemit.irt |= fins->t.irt;  /* Accumulate type bits of emitted ins. */
+  return TREF(ref, irt_t((ir->t = fins->t)));
+}
+
+/* -- Interning of constants ---------------------------------------------- */
+
+/*
+** IR instructions for constants are kept in the range
+** J->cur.nk <= ref < REF_BIAS.
+** They are chained like all other instructions, but grow downwards.
+** They are interned (like strings in the VM) to facilitate reference
+** comparisons. The same constant must get the same reference.
+*/
+
+/* Get ref of next IR constant and optionally grow IR.
+** Note: this may invalidate all IRIns *!
+*/
+static LJ_AINLINE IRRef ir_nextk(jit_State *J)
+{
+  IRRef ref = J->cur.nk;
+  if (LJ_UNLIKELY(ref <= J->irbotlim)) lj_ir_growbot(J);  /* Constants grow down. */
+  J->cur.nk = --ref;
+  return ref;
+}
+
+/* Intern int32_t constant. */
+TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k)
+{
+  IRIns *ir, *cir = J->cur.ir;
+  IRRef ref;
+  /* Linear search of the KINT chain for an existing constant. */
+  for (ref = J->chain[IR_KINT]; ref; ref = cir[ref].prev)
+    if (cir[ref].i == k)
+      goto found;
+  ref = ir_nextk(J);  /* Not found: add it. May invalidate cir! */
+  ir = IR(ref);
+  ir->i = k;
+  ir->t.irt = IRT_INT;
+  ir->o = IR_KINT;
+  ir->prev = J->chain[IR_KINT];
+  J->chain[IR_KINT] = (IRRef1)ref;
+found:
+  return TREF(ref, IRT_INT);
+}
+
+/* The MRef inside the KNUM IR instruction holds the address of the constant
+** (an aligned double or a special 64 bit pattern). The KNUM constants
+** themselves are stored in a chained array and shared across traces.
+**
+** Rationale for choosing this data structure:
+** - The address of the constants is embedded in the generated machine code
+**   and must never move. A resizable array or hash table wouldn't work.
+** - Most apps need very few non-integer constants (less than a dozen).
+** - Linear search is hard to beat in terms of speed and low complexity.
+*/
+/* Arrays are fixed-size, so addresses of stored TValues never move. */
+typedef struct KNumArray {
+  MRef next;			/* Pointer to next list. */
+  MSize numk;			/* Number of used elements in this array. */
+  TValue k[LJ_MIN_KNUMSZ];	/* Array of constants. */
+} KNumArray;
+
+/* Free all chained arrays. */
+/* Walks the KNumArray chain starting at J->knum and frees each array. */
+void lj_ir_knum_freeall(jit_State *J)
+{
+  KNumArray *kn;
+  for (kn = mref(J->knum, KNumArray); kn; ) {
+    KNumArray *next = mref(kn->next, KNumArray);  /* Save before freeing. */
+    lj_mem_free(J2G(J), kn, sizeof(KNumArray));
+    kn = next;
+  }
+}
+
+/* Find KNUM constant in chained array or add it. */
+/* Returns a stable address for the constant with bit pattern nn. */
+static cTValue *ir_knum_find(jit_State *J, uint64_t nn)
+{
+  KNumArray *kn, *knp = NULL;
+  TValue *ntv;
+  MSize idx;
+  /* Search for the constant in the whole chain of arrays. */
+  for (kn = mref(J->knum, KNumArray); kn; kn = mref(kn->next, KNumArray)) {
+    knp = kn;  /* Remember previous element in list. */
+    for (idx = 0; idx < kn->numk; idx++) {  /* Search one array. */
+      TValue *tv = &kn->k[idx];
+      if (tv->u64 == nn)  /* Needed for +-0/NaN/absmask. */
+	return tv;
+    }
+  }
+  /* Constant was not found, need to add it. */
+  if (!(knp && knp->numk < LJ_MIN_KNUMSZ)) {  /* Allocate a new array. */
+    KNumArray *nkn = lj_mem_newt(J->L, sizeof(KNumArray), KNumArray);
+    setmref(nkn->next, NULL);
+    nkn->numk = 0;
+    if (knp)
+      setmref(knp->next, nkn);  /* Chain to the end of the list. */
+    else
+      setmref(J->knum, nkn);  /* Link first array. */
+    knp = nkn;
+  }
+  ntv = &knp->k[knp->numk++];  /* Add to current array. */
+  ntv->u64 = nn;
+  return ntv;
+}
+
+/* Intern FP constant, given by its address. */
+/* The KNUM chain is interned by TValue address, which is stable because
+** constants live in the chained KNumArray storage (see above).
+*/
+TRef lj_ir_knum_addr(jit_State *J, cTValue *tv)
+{
+  IRIns *ir, *cir = J->cur.ir;
+  IRRef ref;
+  for (ref = J->chain[IR_KNUM]; ref; ref = cir[ref].prev)
+    if (ir_knum(&cir[ref]) == tv)
+      goto found;
+  ref = ir_nextk(J);  /* May invalidate cir! */
+  ir = IR(ref);
+  setmref(ir->ptr, tv);
+  ir->t.irt = IRT_NUM;
+  ir->o = IR_KNUM;
+  ir->prev = J->chain[IR_KNUM];
+  J->chain[IR_KNUM] = (IRRef1)ref;
+found:
+  return TREF(ref, IRT_NUM);
+}
+
+/* Intern FP constant, given by its 64 bit pattern. */
+/* Bit-pattern interning keeps +0/-0 and different NaN payloads distinct. */
+TRef lj_ir_knum_nn(jit_State *J, uint64_t nn)
+{
+  return lj_ir_knum_addr(J, ir_knum_find(J, nn));
+}
+
+/* Special 16 byte aligned SIMD constants. */
+/* Two double-wide masks: 0x7ff..f (all bits except sign) and 0x800..0
+** (sign bit only) -- presumably FP abs/negate masks for the code
+** generator; confirm against the machine code emitter.
+*/
+LJ_DATADEF LJ_ALIGN(16) cTValue lj_ir_knum_tv[4] = {
+  { U64x(7fffffff,ffffffff) }, { U64x(7fffffff,ffffffff) },
+  { U64x(80000000,00000000) }, { U64x(80000000,00000000) }
+};
+
+/* Check whether a number is int and return it. -0 is NOT considered an int. */
+static int numistrueint(lua_Number n, int32_t *kp)
+{
+  int32_t k = lj_num2int(n);
+  if (n == cast_num(k)) {
+    if (kp) *kp = k;
+    if (k == 0) {  /* Special check for -0. */
+      TValue tv;
+      setnumV(&tv, n);
+      if (tv.u32.hi != 0)  /* -0 has the sign bit set in the high word. */
+	return 0;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+/* Intern number as int32_t constant if possible, otherwise as FP constant. */
+TRef lj_ir_knumint(jit_State *J, lua_Number n)
+{
+  int32_t k;
+  if (numistrueint(n, &k))  /* Exact integer (and not -0)? */
+    return lj_ir_kint(J, k);
+  else
+    return lj_ir_knum(J, n);
+}
+
+/* Intern GC object "constant". */
+/* Interned by object identity, not by value. */
+TRef lj_ir_kgc(jit_State *J, GCobj *o, IRType t)
+{
+  IRIns *ir, *cir = J->cur.ir;
+  IRRef ref;
+  for (ref = J->chain[IR_KGC]; ref; ref = cir[ref].prev)
+    if (ir_kgc(&cir[ref]) == o)
+      goto found;
+  ref = ir_nextk(J);  /* May invalidate cir! */
+  ir = IR(ref);
+  /* NOBARRIER: Current trace is a GC root. */
+  setgcref(ir->gcr, o);
+  ir->t.irt = (uint8_t)t;
+  ir->o = IR_KGC;
+  ir->prev = J->chain[IR_KGC];
+  J->chain[IR_KGC] = (IRRef1)ref;
+found:
+  return TREF(ref, t);
+}
+
+/* Intern 32 bit pointer constant. */
+TRef lj_ir_kptr(jit_State *J, void *ptr)
+{
+  IRIns *ir, *cir = J->cur.ir;
+  IRRef ref;
+  /* Pointer must round-trip through 32 bits (see i32ptr). */
+  lua_assert((void *)(intptr_t)i32ptr(ptr) == ptr);
+  for (ref = J->chain[IR_KPTR]; ref; ref = cir[ref].prev)
+    if (mref(cir[ref].ptr, void) == ptr)
+      goto found;
+  ref = ir_nextk(J);  /* May invalidate cir! */
+  ir = IR(ref);
+  setmref(ir->ptr, ptr);
+  ir->t.irt = IRT_PTR;
+  ir->o = IR_KPTR;
+  ir->prev = J->chain[IR_KPTR];
+  J->chain[IR_KPTR] = (IRRef1)ref;
+found:
+  return TREF(ref, IRT_PTR);
+}
+
+/* Intern typed NULL constant. */
+/* One KNULL per IRType; interned by type alone since the value is fixed. */
+TRef lj_ir_knull(jit_State *J, IRType t)
+{
+  IRIns *ir, *cir = J->cur.ir;
+  IRRef ref;
+  for (ref = J->chain[IR_KNULL]; ref; ref = cir[ref].prev)
+    if (irt_t(cir[ref].t) == t)
+      goto found;
+  ref = ir_nextk(J);  /* May invalidate cir! */
+  ir = IR(ref);
+  ir->i = 0;
+  ir->t.irt = (uint8_t)t;
+  ir->o = IR_KNULL;
+  ir->prev = J->chain[IR_KNULL];
+  J->chain[IR_KNULL] = (IRRef1)ref;
+found:
+  return TREF(ref, t);
+}
+
+/* Intern key slot. */
+/* Packs a constant key ref (op1) and a slot literal (op2) into op12. */
+TRef lj_ir_kslot(jit_State *J, TRef key, IRRef slot)
+{
+  IRIns *ir, *cir = J->cur.ir;
+  IRRef2 op12 = IRREF2((IRRef1)key, (IRRef1)slot);
+  IRRef ref;
+  /* Const part is not touched by CSE/DCE, so 0-65535 is ok for IRMlit here. */
+  lua_assert(tref_isk(key) && slot == (IRRef)(IRRef1)slot);
+  for (ref = J->chain[IR_KSLOT]; ref; ref = cir[ref].prev)
+    if (cir[ref].op12 == op12)
+      goto found;
+  ref = ir_nextk(J);  /* May invalidate cir! */
+  ir = IR(ref);
+  ir->op12 = op12;
+  ir->t.irt = IRT_PTR;
+  ir->o = IR_KSLOT;
+  ir->prev = J->chain[IR_KSLOT];
+  J->chain[IR_KSLOT] = (IRRef1)ref;
+found:
+  return TREF(ref, IRT_PTR);
+}
+
+/* -- Access to IR constants ---------------------------------------------- */
+
+/* Copy value of IR constant. */
+/* Materializes the constant held by ir into the TValue *tv. */
+void lj_ir_kvalue(lua_State *L, TValue *tv, const IRIns *ir)
+{
+  UNUSED(L);
+  lua_assert(ir->o != IR_KSLOT);  /* Common mistake. */
+  if (irt_isint(ir->t)) {
+    lua_assert(ir->o == IR_KINT);
+    setintV(tv, ir->i);
+  } else if (irt_isnum(ir->t)) {
+    lua_assert(ir->o == IR_KNUM);
+    setnumV(tv, ir_knum(ir)->n);
+  } else if (irt_ispri(ir->t)) {
+    lua_assert(ir->o == IR_KPRI);
+    setitype(tv, irt_toitype(ir->t));  /* Primitive: type tag is the value. */
+  } else {
+    if (ir->o == IR_KGC) {
+      lua_assert(irt_isgcv(ir->t));
+      setgcV(L, tv, &ir_kgc(ir)->gch, irt_toitype(ir->t));
+    } else {
+      lua_assert(ir->o == IR_KPTR || ir->o == IR_KNULL);
+      setlightudV(tv, mref(ir->ptr, void));
+    }
+  }
+}
+
+/* -- Convert IR operand types -------------------------------------------- */
+
+/* Convert from integer or string to number. */
+/* Aborts the trace with a type error for non-coercible operands. */
+TRef LJ_FASTCALL lj_ir_tonum(jit_State *J, TRef tr)
+{
+  if (!tref_isnum(tr)) {
+    if (tref_isinteger(tr))
+      tr = emitir(IRTN(IR_TONUM), tr, 0);
+    else if (tref_isstr(tr))
+      tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);  /* Guarded conversion. */
+    else
+      lj_trace_err(J, LJ_TRERR_BADTYPE);
+  }
+  return tr;
+}
+
+/* Convert from integer or number to string. */
+/* Aborts the trace with a type error for non-numeric operands. */
+TRef LJ_FASTCALL lj_ir_tostr(jit_State *J, TRef tr)
+{
+  if (!tref_isstr(tr)) {
+    if (!tref_isnumber(tr))
+      lj_trace_err(J, LJ_TRERR_BADTYPE);
+    tr = emitir(IRT(IR_TOSTR, IRT_STR), tr, 0);
+  }
+  return tr;
+}
+
+/* Convert from number or string to bitop operand (overflow wrapped). */
+TRef LJ_FASTCALL lj_ir_tobit(jit_State *J, TRef tr)
+{
+  if (!tref_isinteger(tr)) {
+    if (tref_isstr(tr))
+      tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);  /* String -> number first. */
+    else if (!tref_isnum(tr))
+      lj_trace_err(J, LJ_TRERR_BADTYPE);
+    tr = emitir(IRTI(IR_TOBIT), tr, lj_ir_knum_tobit(J));
+  }
+  return tr;
+}
+
+/* Convert from number or string to integer (overflow undefined). */
+TRef LJ_FASTCALL lj_ir_toint(jit_State *J, TRef tr)
+{
+  if (!tref_isinteger(tr)) {
+    if (tref_isstr(tr))
+      tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);  /* String -> number first. */
+    else if (!tref_isnum(tr))
+      lj_trace_err(J, LJ_TRERR_BADTYPE);
+    tr = emitir(IRTI(IR_TOINT), tr, IRTOINT_ANY);  /* Any FP number is ok. */
+  }
+  return tr;
+}
+
+/* -- Miscellaneous IR ops ------------------------------------------------ */
+
+/* Evaluate numeric comparison. */
+/* The unordered variants (IR_ULT etc.) are written as negations of the
+** opposite ordered compare, so they yield true when either operand is NaN.
+*/
+int lj_ir_numcmp(lua_Number a, lua_Number b, IROp op)
+{
+  switch (op) {
+  case IR_EQ: return (a == b);
+  case IR_NE: return (a != b);
+  case IR_LT: return (a < b);
+  case IR_GE: return (a >= b);
+  case IR_LE: return (a <= b);
+  case IR_GT: return (a > b);
+  case IR_ULT: return !(a >= b);
+  case IR_UGE: return !(a < b);
+  case IR_ULE: return !(a > b);
+  case IR_UGT: return !(a <= b);
+  default: lua_assert(0); return 0;
+  }
+}
+
+/* Evaluate string comparison. */
+/* Maps the three-way result of lj_str_cmp onto the ordered compare ops. */
+int lj_ir_strcmp(GCstr *a, GCstr *b, IROp op)
+{
+  int res = lj_str_cmp(a, b);
+  switch (op) {
+  case IR_LT: return (res < 0);
+  case IR_GE: return (res >= 0);
+  case IR_LE: return (res <= 0);
+  case IR_GT: return (res > 0);
+  default: lua_assert(0); return 0;  /* EQ/NE are not handled here. */
+  }
+}
+
+/* Rollback IR to previous state. */
+/* Discards every instruction emitted after ref, unlinking each from its
+** per-opcode chain (ir->prev restores the older chain head).
+*/
+void lj_ir_rollback(jit_State *J, IRRef ref)
+{
+  IRRef nins = J->cur.nins;
+  while (nins > ref) {  /* Walk backwards from the current top. */
+    IRIns *ir;
+    nins--;
+    ir = IR(nins);
+    J->chain[ir->o] = ir->prev;
+  }
+  J->cur.nins = nins;
+}
+
+#undef IR
+#undef fins
+#undef emitir
+
+#endif

+ 429 - 0
src/lj_ir.h

@@ -0,0 +1,429 @@
+/*
+** SSA IR (Intermediate Representation) format.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_IR_H
+#define _LJ_IR_H
+
+#include "lj_obj.h"
+
+/* IR instruction definition. Order matters, see below. */
+#define IRDEF(_) \
+  /* Miscellaneous ops. */ \
+  _(NOP,	N , ___, ___) \
+  _(BASE,	N , lit, lit) \
+  _(LOOP,	G , ___, ___) \
+  _(PHI,	S , ref, ref) \
+  _(RENAME,	S , ref, lit) \
+  \
+  /* Constants. */ \
+  _(KPRI,	N , ___, ___) \
+  _(KINT,	N , cst, ___) \
+  _(KGC,	N , cst, ___) \
+  _(KPTR,	N , cst, ___) \
+  _(KNULL,	N , cst, ___) \
+  _(KNUM,	N , cst, ___) \
+  _(KSLOT,	N , ref, lit) \
+  \
+  /* Guarded assertions. */ \
+  /* Must be properly aligned to flip opposites (^1) and (un)ordered (^4). */ \
+  _(EQ,		GC, ref, ref) \
+  _(NE,		GC, ref, ref) \
+  \
+  _(ABC,	G , ref, ref) \
+  _(FRAME,	G , ref, ref) \
+  \
+  _(LT,		G , ref, ref) \
+  _(GE,		G , ref, ref) \
+  _(LE,		G , ref, ref) \
+  _(GT,		G , ref, ref) \
+  \
+  _(ULT,	G , ref, ref) \
+  _(UGE,	G , ref, ref) \
+  _(ULE,	G , ref, ref) \
+  _(UGT,	G , ref, ref) \
+  \
+  /* Bit ops. */ \
+  _(BNOT,	N , ref, ___) \
+  _(BSWAP,	N , ref, ___) \
+  _(BAND,	C , ref, ref) \
+  _(BOR,	C , ref, ref) \
+  _(BXOR,	C , ref, ref) \
+  _(BSHL,	N , ref, ref) \
+  _(BSHR,	N , ref, ref) \
+  _(BSAR,	N , ref, ref) \
+  _(BROL,	N , ref, ref) \
+  _(BROR,	N , ref, ref) \
+  \
+  /* Arithmetic ops. ORDER ARITH (FPMATH/POWI take the space for MOD/POW). */ \
+  _(ADD,	C , ref, ref) \
+  _(SUB,	N , ref, ref) \
+  _(MUL,	C , ref, ref) \
+  _(DIV,	N , ref, ref) \
+  \
+  _(FPMATH,	N , ref, lit) \
+  _(POWI,	N , ref, ref) \
+  \
+  _(NEG,	N , ref, ref) \
+  _(ABS,	N , ref, ref) \
+  _(ATAN2,	N , ref, ref) \
+  _(LDEXP,	N , ref, ref) \
+  _(MIN,	C , ref, ref) \
+  _(MAX,	C , ref, ref) \
+  \
+  /* Overflow-checking arithmetic ops. */ \
+  _(ADDOV,	GC, ref, ref) \
+  _(SUBOV,	G , ref, ref) \
+  \
+  /* Memory ops. A = array, H = hash, U = upvalue, F = field, S = stack. */ \
+  \
+  /* Memory references. */ \
+  _(AREF,	R , ref, ref) \
+  _(HREFK,	RG, ref, ref) \
+  _(HREF,	L , ref, ref) \
+  _(NEWREF,	S , ref, ref) \
+  _(UREFO,	LG, ref, lit) \
+  _(UREFC,	LG, ref, lit) \
+  _(FREF,	R , ref, lit) \
+  _(STRREF,	N , ref, ref) \
+  \
+  /* Loads and Stores. These must be in the same order. */ \
+  _(ALOAD,	LG, ref, ___) \
+  _(HLOAD,	LG, ref, ___) \
+  _(ULOAD,	LG, ref, ___) \
+  _(FLOAD,	L , ref, lit) \
+  _(SLOAD,	LG, lit, lit) \
+  _(XLOAD,	L , ref, lit) \
+  \
+  _(ASTORE,	S , ref, ref) \
+  _(HSTORE,	S , ref, ref) \
+  _(USTORE,	S , ref, ref) \
+  _(FSTORE,	S , ref, ref) \
+  \
+  /* String ops. */ \
+  _(SNEW,	N , ref, ref) \
+  \
+  /* Table ops. */ \
+  _(TNEW,	A , lit, lit) \
+  _(TDUP,	A , ref, ___) \
+  _(TLEN,	L , ref, ___) \
+  _(TBAR,	S , ref, ___) \
+  _(OBAR,	S , ref, ref) \
+  \
+  /* Type conversions. */ \
+  _(TONUM,	N , ref, ___) \
+  _(TOINT,	N , ref, lit) \
+  _(TOBIT,	N , ref, ref) \
+  _(TOSTR,	N , ref, ___) \
+  _(STRTO,	G , ref, ___) \
+  \
+  /* End of list. */
+
+/* IR opcodes (max. 256). */
+typedef enum {
+#define IRENUM(name, m, m1, m2)	IR_##name,
+IRDEF(IRENUM)
+#undef IRENUM
+  IR__MAX
+} IROp;
+
+/* Stored opcode. */
+typedef uint8_t IROp1;
+
+LJ_STATIC_ASSERT(((int)IR_EQ^1) == (int)IR_NE);
+LJ_STATIC_ASSERT(((int)IR_LT^1) == (int)IR_GE);
+LJ_STATIC_ASSERT(((int)IR_LE^1) == (int)IR_GT);
+LJ_STATIC_ASSERT(((int)IR_LT^3) == (int)IR_GT);
+LJ_STATIC_ASSERT(((int)IR_LT^4) == (int)IR_ULT);
+
+/* Delta between xLOAD and xSTORE. */
+#define IRDELTA_L2S		((int)IR_ASTORE - (int)IR_ALOAD)
+
+LJ_STATIC_ASSERT((int)IR_HLOAD + IRDELTA_L2S == (int)IR_HSTORE);
+LJ_STATIC_ASSERT((int)IR_ULOAD + IRDELTA_L2S == (int)IR_USTORE);
+LJ_STATIC_ASSERT((int)IR_FLOAD + IRDELTA_L2S == (int)IR_FSTORE);
+
+/* FPMATH sub-functions. ORDER FPM. */
+#define IRFPMDEF(_) \
+  _(FLOOR) _(CEIL) _(TRUNC)  /* Must be first and in this order. */ \
+  _(SQRT) _(EXP) _(EXP2) _(LOG) _(LOG2) _(LOG10) \
+  _(SIN) _(COS) _(TAN) \
+  _(OTHER)
+
+typedef enum {
+#define FPMENUM(name)		IRFPM_##name,
+IRFPMDEF(FPMENUM)
+#undef FPMENUM
+  IRFPM__MAX
+} IRFPMathOp;
+
+/* FLOAD field IDs. */
+#define IRFLDEF(_) \
+  _(STR_LEN,	GCstr, len) \
+  _(FUNC_ENV,	GCfunc, l.env) \
+  _(TAB_META,	GCtab, metatable) \
+  _(TAB_ARRAY,	GCtab, array) \
+  _(TAB_NODE,	GCtab, node) \
+  _(TAB_ASIZE,	GCtab, asize) \
+  _(TAB_HMASK,	GCtab, hmask) \
+  _(TAB_NOMM,	GCtab, nomm) \
+  _(UDATA_META,	GCudata, metatable)
+
+typedef enum {
+#define FLENUM(name, type, field)	IRFL_##name,
+IRFLDEF(FLENUM)
+#undef FLENUM
+  IRFL__MAX
+} IRFieldID;
+
+/* SLOAD mode bits, stored in op2. */
+#define IRSLOAD_INHERIT		1	/* Inherited by exits/side traces. */
+#define IRSLOAD_READONLY	2	/* Read-only, omit slot store. */
+#define IRSLOAD_PARENT		4	/* Coalesce with parent trace. */
+
+/* XLOAD mode, stored in op2. */
+#define IRXLOAD_UNALIGNED	1
+
+/* TOINT mode, stored in op2. Ordered by strength of the checks. */
+#define IRTOINT_CHECK		0	/* Number checked for integerness. */
+#define IRTOINT_INDEX		1	/* Checked + special backprop rules. */
+#define IRTOINT_ANY		2	/* Any FP number is ok. */
+#define IRTOINT_TOBIT		3	/* Cache only: TOBIT conversion. */
+
+/* IR operand mode (2 bit). */
+typedef enum {
+  IRMref,		/* IR reference. */
+  IRMlit,		/* 16 bit unsigned literal. */
+  IRMcst,		/* Constant literal: i, gcr or ptr. */
+  IRMnone		/* Unused operand. */
+} IRMode;
+#define IRM___		IRMnone
+
+/* Mode bits: Commutative, {Normal/Ref, Alloc, Load, Store}, Guard. */
+#define IRM_C			0x10
+
+#define IRM_N			0x00
+#define IRM_R			IRM_N
+#define IRM_A			0x20
+#define IRM_L			0x40
+#define IRM_S			0x60
+
+#define IRM_G			0x80
+
+#define IRM_GC			(IRM_G|IRM_C)
+#define IRM_RG			(IRM_R|IRM_G)
+#define IRM_LG			(IRM_L|IRM_G)
+
+#define irm_op1(m)		(cast(IRMode, (m)&3))
+#define irm_op2(m)		(cast(IRMode, ((m)>>2)&3))
+#define irm_iscomm(m)		((m) & IRM_C)
+#define irm_kind(m)		((m) & IRM_S)
+#define irm_isguard(m)		((m) & IRM_G)
+/* Stores or any other op with a guard has a side-effect. */
+#define irm_sideeff(m)		((m) >= IRM_S)
+
+#define IRMODE(name, m, m1, m2)	((IRM##m1)|((IRM##m2)<<2)|(IRM_##m)),
+
+LJ_DATA const uint8_t lj_ir_mode[IR__MAX+1];
+
+/* IR result type and flags (8 bit). */
+typedef enum {
+  /* Map of itypes to non-negative numbers. ORDER LJ_T */
+  IRT_NIL,
+  IRT_FALSE,
+  IRT_TRUE,
+  IRT_LIGHTUD,
+  /* GCobj types are from here ... */
+  IRT_STR,
+  IRT_PTR,		/* IRT_PTR never escapes the IR (map of LJ_TUPVAL). */
+  IRT_THREAD,
+  IRT_PROTO,
+  IRT_FUNC,
+  IRT_9,		/* LJ_TDEADKEY is never used in the IR. */
+  IRT_TAB,
+  IRT_UDATA,
+  /* ... until here. */
+  IRT_NUM,
+  /* The various integers are only used in the IR and can only escape to
+  ** a TValue after implicit or explicit conversion (TONUM). Their types
+  ** must be contiguous and next to IRT_NUM (see the typerange macros below).
+  */
+  IRT_INT,
+  IRT_I8,
+  IRT_U8,
+  IRT_I16,
+  IRT_U16,
+  /* There is room for 14 more types. */
+
+  /* Additional flags. */
+  IRT_MARK = 0x20,	/* Marker for misc. purposes. */
+  IRT_GUARD = 0x40,	/* Instruction is a guard. */
+  IRT_ISPHI = 0x80,	/* Instruction is left or right PHI operand. */
+
+  /* Masks. */
+  IRT_TYPE = 0x1f,
+  IRT_T = 0xff
+} IRType;
+
+#define irtype_ispri(irt)	((uint32_t)(irt) <= IRT_TRUE)
+
+/* Stored IRType. */
+typedef struct IRType1 { uint8_t irt; } IRType1;
+
+#define IRT(o, t)		((uint32_t)(((o)<<8) | (t)))
+#define IRTI(o)			(IRT((o), IRT_INT))
+#define IRTN(o)			(IRT((o), IRT_NUM))
+#define IRTG(o, t)		(IRT((o), IRT_GUARD|(t)))
+#define IRTGI(o)		(IRT((o), IRT_GUARD|IRT_INT))
+
+#define irt_t(t)		(cast(IRType, (t).irt))
+#define irt_type(t)		(cast(IRType, (t).irt & IRT_TYPE))
+#define irt_sametype(t1, t2)	((((t1).irt ^ (t2).irt) & IRT_TYPE) == 0)
+#define irt_typerange(t, first, last) \
+  ((uint32_t)((t).irt & IRT_TYPE) - (uint32_t)(first) <= (uint32_t)(last-first))
+
+#define irt_isnil(t)		(irt_type(t) == IRT_NIL)
+#define irt_ispri(t)		((uint32_t)irt_type(t) <= IRT_TRUE)
+#define irt_isstr(t)		(irt_type(t) == IRT_STR)
+#define irt_isfunc(t)		(irt_type(t) == IRT_FUNC)
+#define irt_istab(t)		(irt_type(t) == IRT_TAB)
+#define irt_isnum(t)		(irt_type(t) == IRT_NUM)
+#define irt_isint(t)		(irt_type(t) == IRT_INT)
+#define irt_isi8(t)		(irt_type(t) == IRT_I8)
+#define irt_isu8(t)		(irt_type(t) == IRT_U8)
+#define irt_isi16(t)		(irt_type(t) == IRT_I16)
+#define irt_isu16(t)		(irt_type(t) == IRT_U16)
+
+#define irt_isinteger(t)	(irt_typerange((t), IRT_INT, IRT_U16))
+#define irt_isgcv(t)		(irt_typerange((t), IRT_STR, IRT_UDATA))
+#define irt_isaddr(t)		(irt_typerange((t), IRT_LIGHTUD, IRT_UDATA))
+
+#define itype2irt(tv) \
+  (~uitype(tv) < IRT_NUM ? cast(IRType, ~uitype(tv)) : IRT_NUM)
+#define irt_toitype(t)		((int32_t)~(uint32_t)irt_type(t))
+
+#define irt_isguard(t)		((t).irt & IRT_GUARD)
+#define irt_ismarked(t)		((t).irt & IRT_MARK)
+#define irt_setmark(t)		((t).irt |= IRT_MARK)
+#define irt_clearmark(t)	((t).irt &= ~IRT_MARK)
+#define irt_isphi(t)		((t).irt & IRT_ISPHI)
+#define irt_setphi(t)		((t).irt |= IRT_ISPHI)
+#define irt_clearphi(t)		((t).irt &= ~IRT_ISPHI)
+
+/* Stored combined IR opcode and type. */
+typedef uint16_t IROpT;
+
+/* IR references. */
+typedef uint16_t IRRef1;	/* One stored reference. */
+typedef uint32_t IRRef2;	/* Two stored references. */
+typedef uint32_t IRRef;		/* Used to pass around references. */
+
+/* Fixed references. */
+enum {
+  REF_BIAS =	0x8000,
+  REF_TRUE =	REF_BIAS-3,
+  REF_FALSE =	REF_BIAS-2,
+  REF_NIL =	REF_BIAS-1,	/* \--- Constants grow downwards. */
+  REF_BASE =	REF_BIAS,	/* /--- IR grows upwards. */
+  REF_FIRST =	REF_BIAS+1,
+  REF_DROP =	0xffff
+};
+
+/* Note: IRMlit operands must be < REF_BIAS, too!
+** This allows for fast and uniform manipulation of all operands
+** without looking up the operand mode in lj_ir_mode:
+** - CSE calculates the maximum reference of two operands.
+**   This must work with mixed reference/literal operands, too.
+** - DCE marking only checks for operand >= REF_BIAS.
+** - LOOP needs to substitute reference operands.
+**   Constant references and literals must not be modified.
+*/
+
+#define IRREF2(lo, hi)		((IRRef2)(lo) | ((IRRef2)(hi) << 16))
+
+#define irref_isk(ref)		((ref) < REF_BIAS)
+
+/* Tagged IR references. */
+typedef uint32_t TRef;
+
+#define TREF(ref, t)		(cast(TRef, (ref) + ((t)<<16)))
+
+#define tref_ref(tr)		(cast(IRRef1, (tr)))
+#define tref_t(tr)		(cast(IRType, (tr)>>16))
+#define tref_type(tr)		(cast(IRType, ((tr)>>16) & IRT_TYPE))
+#define tref_typerange(tr, first, last) \
+  ((((tr)>>16) & IRT_TYPE) - (TRef)(first) <= (TRef)(last-first))
+
+#define tref_istype(tr, t)	(((tr) & (IRT_TYPE<<16)) == ((t)<<16))
+#define tref_isnil(tr)		(tref_istype((tr), IRT_NIL))
+#define tref_isfalse(tr)	(tref_istype((tr), IRT_FALSE))
+#define tref_istrue(tr)		(tref_istype((tr), IRT_TRUE))
+#define tref_isstr(tr)		(tref_istype((tr), IRT_STR))
+#define tref_isfunc(tr)		(tref_istype((tr), IRT_FUNC))
+#define tref_istab(tr)		(tref_istype((tr), IRT_TAB))
+#define tref_isudata(tr)	(tref_istype((tr), IRT_UDATA))
+#define tref_isnum(tr)		(tref_istype((tr), IRT_NUM))
+#define tref_isint(tr)		(tref_istype((tr), IRT_INT))
+
+#define tref_isbool(tr)		(tref_typerange((tr), IRT_FALSE, IRT_TRUE))
+#define tref_ispri(tr)		(tref_typerange((tr), IRT_NIL, IRT_TRUE))
+#define tref_istruecond(tr)	(!tref_typerange((tr), IRT_NIL, IRT_FALSE))
+#define tref_isinteger(tr)	(tref_typerange((tr), IRT_INT, IRT_U16))
+#define tref_isnumber(tr)	(tref_typerange((tr), IRT_NUM, IRT_U16))
+#define tref_isnumber_str(tr)	(tref_isnumber((tr)) || tref_isstr((tr)))
+#define tref_isgcv(tr)		(tref_typerange((tr), IRT_STR, IRT_UDATA))
+
+#define tref_isk(tr)		(irref_isk(tref_ref((tr))))
+#define tref_isk2(tr1, tr2)	(irref_isk(tref_ref((tr1) | (tr2))))
+
+#define TREF_PRI(t)		(TREF(REF_NIL-(t), (t)))
+#define TREF_NIL		(TREF_PRI(IRT_NIL))
+#define TREF_FALSE		(TREF_PRI(IRT_FALSE))
+#define TREF_TRUE		(TREF_PRI(IRT_TRUE))
+
+/* IR instruction format (64 bit).
+**
+**    16      16     8   8   8   8
+** +-------+-------+---+---+---+---+
+** |  op1  |  op2  | t | o | r | s |
+** +-------+-------+---+---+---+---+
+** |  op12/i/gco   |   ot  | prev  | (alternative fields in union)
+** +---------------+-------+-------+
+**        32           16      16
+**
+** prev is only valid prior to register allocation and then reused for r + s.
+*/
+
+typedef union IRIns {
+  struct {
+    LJ_ENDIAN_LOHI(
+      IRRef1 op1;	/* IR operand 1. */
+    , IRRef1 op2;	/* IR operand 2. */
+    )
+    IROpT ot;		/* IR opcode and type (overlaps t and o). */
+    IRRef1 prev;	/* Previous ins in same chain (overlaps r and s). */
+  };
+  struct {
+    IRRef2 op12;	/* IR operand 1 and 2 (overlaps op1 and op2). */
+    LJ_ENDIAN_LOHI(
+      IRType1 t;	/* IR type. */
+    , IROp1 o;		/* IR opcode. */
+    )
+    LJ_ENDIAN_LOHI(
+      uint8_t r;	/* Register allocation (overlaps prev). */
+    , uint8_t s;	/* Spill slot allocation (overlaps prev). */
+    )
+  };
+  int32_t i;		/* 32 bit signed integer literal (overlaps op12). */
+  GCRef gcr;		/* GCobj constant (overlaps op12). */
+  MRef ptr;		/* Pointer constant (overlaps op12). */
+} IRIns;
+
+#define ir_kgc(ir)	(gcref((ir)->gcr))
+#define ir_kstr(ir)	(gco2str(ir_kgc((ir))))
+#define ir_ktab(ir)	(gco2tab(ir_kgc((ir))))
+#define ir_kfunc(ir)	(gco2func(ir_kgc((ir))))
+#define ir_knum(ir)	(mref((ir)->ptr, cTValue))
+
+#endif

+ 128 - 0
src/lj_iropt.h

@@ -0,0 +1,128 @@
+/*
+** Common header for IR emitter and optimizations.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_IROPT_H
+#define _LJ_IROPT_H
+
+#include "lj_obj.h"
+#include "lj_jit.h"
+
+#if LJ_HASJIT
+/* IR emitter. */
+LJ_FUNC void LJ_FASTCALL lj_ir_growtop(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_ir_emit(jit_State *J);
+
+/* Save current IR in J->fold.ins, but do not emit it (yet).
+** Stages opcode/type and both operands; the caller later runs the staged
+** instruction through the fold engine or emits it directly.
+*/
+static LJ_AINLINE void lj_ir_set_(jit_State *J, uint16_t ot, IRRef1 a, IRRef1 b)
+{
+  J->fold.ins.ot = ot; J->fold.ins.op1 = a; J->fold.ins.op2 = b;
+}
+
+#define lj_ir_set(J, ot, a, b) \
+  lj_ir_set_(J, (uint16_t)(ot), (IRRef1)(a), (IRRef1)(b))
+
+/* Get ref of next IR instruction and optionally grow IR.
+** Note: this may invalidate all IRIns*!
+*/
+static LJ_AINLINE IRRef lj_ir_nextins(jit_State *J)
+{
+  IRRef ref = J->cur.nins;
+  /* Grow the IR buffer first; growtop may reallocate the backing store. */
+  if (LJ_UNLIKELY(ref >= J->irtoplim)) lj_ir_growtop(J);
+  J->cur.nins = ref + 1;
+  return ref;
+}
+
+/* Interning of constants. */
+LJ_FUNC TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k);
+LJ_FUNC void lj_ir_knum_freeall(jit_State *J);
+LJ_FUNC TRef lj_ir_knum_addr(jit_State *J, cTValue *tv);
+LJ_FUNC TRef lj_ir_knum_nn(jit_State *J, uint64_t nn);
+LJ_FUNC TRef lj_ir_knumint(jit_State *J, lua_Number n);
+LJ_FUNC TRef lj_ir_kgc(jit_State *J, GCobj *o, IRType t);
+LJ_FUNC TRef lj_ir_kptr(jit_State *J, void *ptr);
+LJ_FUNC TRef lj_ir_knull(jit_State *J, IRType t);
+LJ_FUNC TRef lj_ir_kslot(jit_State *J, TRef key, IRRef slot);
+
+/* Intern a number constant. The number is passed on to lj_ir_knum_nn()
+** as its raw 64 bit pattern, so constants are keyed bit-exactly.
+*/
+static LJ_AINLINE TRef lj_ir_knum(jit_State *J, lua_Number n)
+{
+  TValue tv;
+  tv.n = n;
+  return lj_ir_knum_nn(J, tv.u64);
+}
+
+#define lj_ir_kstr(J, str)	lj_ir_kgc(J, obj2gco((str)), IRT_STR)
+#define lj_ir_ktab(J, tab)	lj_ir_kgc(J, obj2gco((tab)), IRT_TAB)
+#define lj_ir_kfunc(J, func)	lj_ir_kgc(J, obj2gco((func)), IRT_FUNC)
+
+/* Special FP constants. */
+#define lj_ir_knum_zero(J)	lj_ir_knum_nn(J, U64x(00000000,00000000))
+#define lj_ir_knum_one(J)	lj_ir_knum_nn(J, U64x(3ff00000,00000000))
+#define lj_ir_knum_tobit(J)	lj_ir_knum_nn(J, U64x(43380000,00000000))
+
+/* Special 16 byte aligned SIMD constants. */
+LJ_DATA LJ_ALIGN(16) cTValue lj_ir_knum_tv[4];
+#define lj_ir_knum_abs(J)	lj_ir_knum_addr(J, &lj_ir_knum_tv[0])
+#define lj_ir_knum_neg(J)	lj_ir_knum_addr(J, &lj_ir_knum_tv[2])
+
+/* Access to constants. */
+LJ_FUNC void lj_ir_kvalue(lua_State *L, TValue *tv, const IRIns *ir);
+
+/* Convert IR operand types. */
+LJ_FUNC TRef LJ_FASTCALL lj_ir_tonum(jit_State *J, TRef tr);
+LJ_FUNC TRef LJ_FASTCALL lj_ir_tostr(jit_State *J, TRef tr);
+LJ_FUNC TRef LJ_FASTCALL lj_ir_tobit(jit_State *J, TRef tr);
+LJ_FUNC TRef LJ_FASTCALL lj_ir_toint(jit_State *J, TRef tr);
+
+/* Miscellaneous IR ops. */
+LJ_FUNC int lj_ir_numcmp(lua_Number a, lua_Number b, IROp op);
+LJ_FUNC int lj_ir_strcmp(GCstr *a, GCstr *b, IROp op);
+LJ_FUNC void lj_ir_rollback(jit_State *J, IRRef ref);
+
+/* Emit IR instructions with on-the-fly optimizations. */
+LJ_FUNC TRef LJ_FASTCALL lj_opt_fold(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_cse(jit_State *J);
+
+/* Special return values for the fold functions. */
+enum {
+  NEXTFOLD,		/* Couldn't fold, pass on. */
+  RETRYFOLD,		/* Retry fold with modified fins. */
+  KINTFOLD,		/* Return ref for int constant in fins->i. */
+  FAILFOLD,		/* Guard would always fail. */
+  DROPFOLD,		/* Guard eliminated. */
+  MAX_FOLD
+};
+
+#define INTFOLD(k)	((J->fold.ins.i = (k)), (TRef)KINTFOLD)
+#define CONDFOLD(cond)	((TRef)FAILFOLD + (TRef)(cond))
+#define LEFTFOLD	(J->fold.ins.op1)
+#define RIGHTFOLD	(J->fold.ins.op2)
+#define CSEFOLD		(lj_opt_cse(J))
+#define EMITFOLD	(lj_ir_emit(J))
+
+/* Load/store forwarding. */
+LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_aload(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_hload(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_uload(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_fload(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_tlen(jit_State *J);
+LJ_FUNC int lj_opt_fwd_wasnonnil(jit_State *J, IROpT loadop, IRRef xref);
+
+/* Dead-store elimination. */
+LJ_FUNC TRef LJ_FASTCALL lj_opt_dse_ahstore(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_dse_ustore(jit_State *J);
+LJ_FUNC TRef LJ_FASTCALL lj_opt_dse_fstore(jit_State *J);
+
+/* Narrowing. */
+LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J);
+LJ_FUNC TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc);
+LJ_FUNC TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc);
+LJ_FUNC IRType lj_opt_narrow_forl(cTValue *forbase);
+
+/* Optimization passes. */
+LJ_FUNC void lj_opt_dce(jit_State *J);
+LJ_FUNC int lj_opt_loop(jit_State *J);
+#endif
+
+#endif

+ 279 - 0
src/lj_jit.h

@@ -0,0 +1,279 @@
+/*
+** Common definitions for the JIT compiler.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_JIT_H
+#define _LJ_JIT_H
+
+#include "lj_obj.h"
+#include "lj_ir.h"
+
+/* JIT engine flags. */
+#define JIT_F_ON		0x00000001
+
+/* CPU-specific JIT engine flags. */
+#if LJ_TARGET_X86ORX64
+#define JIT_F_CMOV		0x00000100
+#define JIT_F_SSE2		0x00000200
+#define JIT_F_SSE4_1		0x00000400
+#define JIT_F_P4		0x00000800
+#define JIT_F_PREFER_IMUL	0x00001000
+#define JIT_F_SPLIT_XMM		0x00002000
+#define JIT_F_LEA_AGU		0x00004000
+
+/* Names for the CPU-specific flags. Must match the order above. */
+#define JIT_F_CPU_FIRST		JIT_F_CMOV
+#define JIT_F_CPUSTRING		"\4CMOV\4SSE2\6SSE4.1\2P4\3AMD\2K8\4ATOM"
+#else
+#error "Missing CPU-specific JIT engine flags"
+#endif
+
+/* Optimization flags. */
+#define JIT_F_OPT_MASK		0x00ff0000
+
+#define JIT_F_OPT_FOLD		0x00010000
+#define JIT_F_OPT_CSE		0x00020000
+#define JIT_F_OPT_DCE		0x00040000
+#define JIT_F_OPT_FWD		0x00080000
+#define JIT_F_OPT_DSE		0x00100000
+#define JIT_F_OPT_NARROW	0x00200000
+#define JIT_F_OPT_LOOP		0x00400000
+#define JIT_F_OPT_FUSE		0x00800000
+
+/* Optimizations names for -O. Must match the order above. */
+#define JIT_F_OPT_FIRST		JIT_F_OPT_FOLD
+#define JIT_F_OPTSTRING	\
+  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\4fuse"
+
+/* Optimization levels set a fixed combination of flags. */
+#define JIT_F_OPT_0	0
+#define JIT_F_OPT_1	(JIT_F_OPT_FOLD|JIT_F_OPT_CSE|JIT_F_OPT_DCE)
+#define JIT_F_OPT_2	(JIT_F_OPT_1|JIT_F_OPT_NARROW|JIT_F_OPT_LOOP)
+#define JIT_F_OPT_3	(JIT_F_OPT_2|JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_FUSE)
+#define JIT_F_OPT_DEFAULT	JIT_F_OPT_3
+
+#ifdef LUA_USE_WIN
+/* See: http://blogs.msdn.com/oldnewthing/archive/2003/10/08/55239.aspx */
+#define JIT_P_sizemcode_DEFAULT		64
+#else
+/* Could go as low as 4K, but the mmap() overhead would be rather high. */
+#define JIT_P_sizemcode_DEFAULT		32
+#endif
+
+/* Optimization parameters and their defaults. Length is a char in octal! */
+#define JIT_PARAMDEF(_) \
+  _(\010, maxtrace,	1000)	/* Max. # of traces in cache. */ \
+  _(\011, maxrecord,	2000)	/* Max. # of recorded IR instructions. */ \
+  _(\012, maxirconst,	500)	/* Max. # of IR constants of a trace. */ \
+  _(\007, maxside,	100)	/* Max. # of side traces of a root trace. */ \
+  _(\007, maxsnap,	100)	/* Max. # of snapshots for a trace. */ \
+  \
+  _(\007, hotloop,	57)	/* # of iterations to detect a hot loop. */ \
+  _(\007, hotexit,	10)	/* # of taken exits to start a side trace. */ \
+  _(\007, tryside,	4)	/* # of attempts to compile a side trace. */ \
+  \
+  _(\012, instunroll,	4)	/* Max. unroll for instable loops. */ \
+  _(\012, loopunroll,	7)	/* Max. unroll for loop ops in side traces. */ \
+  _(\012, callunroll,	3)	/* Max. unroll for recursive calls. */ \
+  _(\011, recunroll,	0)	/* Max. unroll for true recursion. */ \
+  \
+  /* Size of each machine code area (in KBytes). */ \
+  _(\011, sizemcode,	JIT_P_sizemcode_DEFAULT) \
+  /* Max. total size of all machine code areas (in KBytes). */ \
+  _(\010, maxmcode,	512) \
+  /* End of list. */
+
+enum {
+#define JIT_PARAMENUM(len, name, value)	JIT_P_##name,
+JIT_PARAMDEF(JIT_PARAMENUM)
+#undef JIT_PARAMENUM
+  JIT_P__MAX
+};
+
+#define JIT_PARAMSTR(len, name, value)	#len #name
+#define JIT_P_STRING	JIT_PARAMDEF(JIT_PARAMSTR)
+
+/* Trace compiler state. */
+typedef enum {
+  LJ_TRACE_IDLE,	/* Trace compiler idle. */
+  LJ_TRACE_ACTIVE = 0x10,
+  LJ_TRACE_RECORD,	/* Bytecode recording active. */
+  LJ_TRACE_START,	/* New trace started. */
+  LJ_TRACE_END,		/* End of trace. */
+  LJ_TRACE_ASM,		/* Assemble trace. */
+  LJ_TRACE_ERR,		/* Trace aborted with error. */
+} TraceState;
+
+/* Machine code type. */
+typedef uint8_t MCode;
+
+/* Stack snapshot header. */
+typedef struct SnapShot {
+  uint16_t mapofs;	/* Offset into snapshot map. */
+  IRRef1 ref;		/* First IR ref for this snapshot. */
+  uint8_t nslots;	/* Number of stack slots. */
+  uint8_t nframelinks;	/* Number of frame links. */
+  uint8_t count;	/* Count of taken exits for this snapshot. */
+  uint8_t unused1;
+} SnapShot;
+
+#define SNAPCOUNT_DONE	255	/* Already compiled and linked a side trace. */
+#define snap_ref(sn)	((IRRef)(IRRef1)(sn))
+#define snap_ridsp(sn)	((sn) >> 16)
+
+/* Snapshot and exit numbers. */
+typedef uint32_t SnapNo;
+typedef uint32_t ExitNo;
+
+/* Trace number. */
+typedef uint32_t TraceNo;	/* Used to pass around trace numbers. */
+typedef uint16_t TraceNo1;	/* Stored trace number. */
+
+#define TRACE_INTERP	0	/* Fallback to interpreter. */
+
+/* Trace anchor. */
+typedef struct Trace {
+  IRIns *ir;		/* IR instructions/constants. Biased with REF_BIAS. */
+  IRRef nins;		/* Next IR instruction. Biased with REF_BIAS. */
+  IRRef nk;		/* Lowest IR constant. Biased with REF_BIAS. */
+  SnapShot *snap;	/* Snapshot array. */
+  IRRef2 *snapmap;	/* Snapshot map. */
+  uint16_t nsnap;	/* Number of snapshots. */
+  uint16_t nsnapmap;	/* Number of snapshot map elements. */
+  GCRef startpt;	/* Starting prototype. */
+  BCIns startins;	/* Original bytecode of starting instruction. */
+  MCode *mcode;		/* Start of machine code. */
+  MSize szmcode;	/* Size of machine code. */
+  MSize mcloop;		/* Offset of loop start in machine code. */
+  TraceNo1 link;	/* Linked trace (or self for loops). */
+  TraceNo1 root;	/* Root trace of side trace (or 0 for root traces). */
+  TraceNo1 nextroot;	/* Next root trace for same prototype. */
+  TraceNo1 nextside;	/* Next side trace of same root trace. */
+  uint16_t nchild;	/* Number of child traces (root trace only). */
+  uint16_t spadjust;	/* Stack pointer adjustment (offset in bytes). */
+#ifdef LUAJIT_USE_GDBJIT
+  void *gdbjit_entry;	/* GDB JIT entry. */
+#endif
+} Trace;
+
+/* Round-robin penalty cache for bytecodes leading to aborted traces. */
+typedef struct HotPenalty {
+  const BCIns *pc;	/* Starting bytecode PC. */
+  uint16_t val;		/* Penalty value, i.e. hotcount start. */
+  uint16_t reason;	/* Abort reason (really TraceErr). */
+} HotPenalty;
+
+/* Number of slots for the penalty cache. Must be a power of 2. */
+#define PENALTY_SLOTS	16
+
+/* Round-robin backpropagation cache for narrowing conversions. */
+typedef struct BPropEntry {
+  IRRef1 key;		/* Key: original reference. */
+  IRRef1 val;		/* Value: reference after conversion. */
+  IRRef mode;		/* Mode for this entry (currently IRTOINT_*). */
+} BPropEntry;
+
+/* Number of slots for the backpropagation cache. Must be a power of 2. */
+#define BPROP_SLOTS	16
+
+/* Fold state is used to fold instructions on-the-fly. */
+typedef struct FoldState {
+  IRIns ins;		/* Currently emitted instruction. */
+  IRIns left;		/* Instruction referenced by left operand. */
+  IRIns right;		/* Instruction referenced by right operand. */
+} FoldState;
+
+/* JIT compiler state. */
+typedef struct jit_State {
+  Trace cur;		/* Current trace. */
+
+  lua_State *L;		/* Current Lua state. */
+  const BCIns *pc;	/* Current PC. */
+  BCReg maxslot;	/* Relative to baseslot. */
+
+  uint32_t flags;	/* JIT engine flags. */
+  TRef *base;		/* Current frame base, points into J->slots. */
+  BCReg baseslot;	/* Current frame base, offset into J->slots. */
+  GCfunc *fn;		/* Current function. */
+  GCproto *pt;		/* Current prototype. */
+
+  FoldState fold;	/* Fold state. */
+
+  uint8_t mergesnap;	/* Allowed to merge with next snapshot. */
+  uint8_t needsnap;	/* Need snapshot before recording next bytecode. */
+  IRType1 guardemit;	/* Accumulated IRT_GUARD for emitted instructions. */
+  uint8_t unused1;
+
+  const BCIns *bc_min;	/* Start of allowed bytecode range for root trace. */
+  MSize bc_extent;	/* Extent of the range. */
+
+  TraceState state;	/* Trace compiler state. */
+
+  int32_t instunroll;	/* Unroll counter for unstable loops. */
+  int32_t loopunroll;	/* Unroll counter for loop ops in side traces. */
+  int32_t tailcalled;	/* Number of successive tailcalls. */
+  int32_t framedepth;	/* Current frame depth. */
+
+  MRef knum;		/* Pointer to chained array of KNUM constants. */
+
+  IRIns *irbuf;		/* Temp. IR instruction buffer. Biased with REF_BIAS. */
+  IRRef irtoplim;	/* Upper limit of instruction buffer (biased). */
+  IRRef irbotlim;	/* Lower limit of instruction buffer (biased). */
+  IRRef loopref;	/* Last loop reference or ref of final LOOP (or 0). */
+
+  SnapShot *snapbuf;	/* Temp. snapshot buffer. */
+  IRRef2 *snapmapbuf;	/* Temp. snapshot map buffer. */
+  MSize sizesnap;	/* Size of temp. snapshot buffer. */
+  MSize sizesnapmap;	/* Size of temp. snapshot map buffer. */
+
+  Trace **trace;	/* Array of traces. */
+  TraceNo curtrace;	/* Current trace number (if not 0). Kept in J->cur. */
+  TraceNo freetrace;	/* Start of scan for next free trace. */
+  MSize sizetrace;	/* Size of trace array. */
+
+  IRRef1 chain[IR__MAX];  /* IR instruction skip-list chain anchors. */
+  TRef slot[LJ_MAX_JSLOTS+LJ_STACK_EXTRA];  /* Stack slot map. */
+
+  int32_t param[JIT_P__MAX];  /* JIT engine parameters. */
+
+  MCode *exitstubgroup[LJ_MAX_EXITSTUBGR];  /* Exit stub group addresses. */
+
+  HotPenalty penalty[PENALTY_SLOTS];  /* Penalty slots. */
+  uint32_t penaltyslot;	/* Round-robin index into penalty slots. */
+
+  BPropEntry bpropcache[BPROP_SLOTS];  /* Backpropagation cache slots. */
+  uint32_t bpropslot;	/* Round-robin index into bpropcache slots. */
+
+  const BCIns *startpc;	/* Bytecode PC of starting instruction. */
+  TraceNo parent;	/* Parent of current side trace (0 for root traces). */
+  ExitNo exitno;	/* Exit number in parent of current side trace. */
+
+  TValue errinfo;	/* Additional info element for trace errors. */
+
+  MCode *mcarea;	/* Base of current mcode area. */
+  MCode *mctop;		/* Top of current mcode area. */
+  MCode *mcbot;		/* Bottom of current mcode area. */
+  size_t szmcarea;	/* Size of current mcode area. */
+  size_t szallmcarea;	/* Total size of all allocated mcode areas. */
+  int mcprot;		/* Protection of current mcode area. */
+} jit_State;
+
+/* Exit stubs. */
+#if LJ_TARGET_X86ORX64
+/* Limited by the range of a short fwd jump (127): (2+2)*(32-1)-2 = 122. */
+#define EXITSTUB_SPACING	(2+2)
+#define EXITSTUBS_PER_GROUP	32
+#else
+#error "Missing CPU-specific exit stub definitions"
+#endif
+
+/* Return the address of an exit stub.
+** Stubs are allocated in groups of EXITSTUBS_PER_GROUP, spaced
+** EXITSTUB_SPACING bytes apart within their group.
+*/
+static LJ_AINLINE MCode *exitstub_addr(jit_State *J, ExitNo exitno)
+{
+  lua_assert(J->exitstubgroup[exitno / EXITSTUBS_PER_GROUP] != NULL);
+  return J->exitstubgroup[exitno / EXITSTUBS_PER_GROUP] +
+	 EXITSTUB_SPACING*(exitno % EXITSTUBS_PER_GROUP);
+}
+
+#endif

+ 393 - 0
src/lj_lex.c

@@ -0,0 +1,393 @@
+/*
+** Lexical analyzer.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_lex_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_lex.h"
+#include "lj_parse.h"
+#include "lj_ctype.h"
+
+/* Lua lexer token names. */
+static const char *const tokennames[] = {
+#define TKSTR1(name)		#name,
+#define TKSTR2(name, sym)	#sym,
+TKDEF(TKSTR1, TKSTR2)
+#undef TKSTR1
+#undef TKSTR2
+  NULL
+};
+
+/* -- Buffer handling ----------------------------------------------------- */
+
+#define char2int(c)		cast(int, cast(uint8_t, (c)))
+#define next(ls) \
+  (ls->current = (ls->n--) > 0 ? char2int(*ls->p++) : fillbuf(ls))
+#define save_and_next(ls)	(save(ls, ls->current), next(ls))
+#define currIsNewline(ls)	(ls->current == '\n' || ls->current == '\r')
+#define END_OF_STREAM		(-1)
+
+/* Refill the input buffer from the reader callback.
+** Returns the first character of the new chunk, or END_OF_STREAM when the
+** reader signals end of input (NULL buffer or zero size).
+*/
+static int fillbuf(LexState *ls)
+{
+  size_t sz;
+  const char *buf = ls->rfunc(ls->L, ls->rdata, &sz);
+  if (buf == NULL || sz == 0) return END_OF_STREAM;
+  ls->n = (MSize)sz - 1;  /* One char is consumed immediately below. */
+  ls->p = buf;
+  return char2int(*(ls->p++));
+}
+
+/* Append one character to the token buffer, doubling its size as needed.
+** Errors out if the buffer would exceed LJ_MAX_STR.
+*/
+static void save(LexState *ls, int c)
+{
+  if (ls->sb.n + 1 > ls->sb.sz) {
+    MSize newsize;
+    if (ls->sb.sz >= LJ_MAX_STR/2)
+      lj_lex_error(ls, 0, LJ_ERR_XELEM);
+    newsize = ls->sb.sz * 2;
+    lj_str_resizebuf(ls->L, &ls->sb, newsize);
+  }
+  ls->sb.buf[ls->sb.n++] = cast(char, c);
+}
+
+/* Consume and save the current character if it is a member of set.
+** Returns 1 on a match, 0 otherwise.
+** The '\0' guard is required: strchr(set, 0) matches the set's own string
+** terminator, so an embedded NUL byte in the source would otherwise be
+** treated as a member of every set (same fix as in Lua 5.2's lexer).
+** Note: END_OF_STREAM (-1) is harmless here -- none of the sets used
+** ("Ee", "+-", ".") contain that byte value.
+*/
+static int check_next(LexState *ls, const char *set)
+{
+  if (ls->current == '\0' || !strchr(set, ls->current))
+    return 0;
+  save_and_next(ls);
+  return 1;
+}
+
+/* Skip a line break ('\n', '\r', or the two-char combos '\n\r'/'\r\n')
+** and bump the line counter, erroring out past LJ_MAX_LINE.
+*/
+static void inclinenumber(LexState *ls)
+{
+  int old = ls->current;
+  lua_assert(currIsNewline(ls));
+  next(ls);  /* skip `\n' or `\r' */
+  if (currIsNewline(ls) && ls->current != old)
+    next(ls);  /* skip `\n\r' or `\r\n' */
+  if (++ls->linenumber >= LJ_MAX_LINE)
+    lj_lex_error(ls, ls->token, LJ_ERR_XLINES);
+}
+
+/* -- Scanner for terminals ----------------------------------------------- */
+
+/* Scan a numeric literal into *tv.
+** Greedily collects digits/dots, an optional exponent, and any trailing
+** identifier characters (e.g. hex digits or garbage like "1abc"); actual
+** validation is deferred to lj_str_numconv, which rejects malformed input.
+*/
+static void read_numeral(LexState *ls, TValue *tv)
+{
+  lua_assert(lj_ctype_isdigit(ls->current));
+  do {
+    save_and_next(ls);
+  } while (lj_ctype_isdigit(ls->current) || ls->current == '.');
+  if (check_next(ls, "Ee"))  /* `E'? */
+    check_next(ls, "+-");  /* optional exponent sign */
+  while (lj_ctype_isident(ls->current))
+    save_and_next(ls);
+  save(ls, '\0');
+  if (!lj_str_numconv(ls->sb.buf, tv))
+    lj_lex_error(ls, TK_number, LJ_ERR_XNUMBER);
+}
+
+/* Count the '=' separators of a long-bracket opener/closer.
+** Called with current on '[' or ']'. Returns the separator count (>= 0)
+** when the matching bracket follows, else -(count)-1 (so a plain single
+** '[' yields -1 and an invalid delimiter yields <= -2).
+*/
+static int skip_sep(LexState *ls)
+{
+  int count = 0;
+  int s = ls->current;
+  lua_assert(s == '[' || s == ']');
+  save_and_next(ls);
+  while (ls->current == '=') {
+    save_and_next(ls);
+    count++;
+  }
+  return (ls->current == s) ? count : (-count) - 1;
+}
+
+/* Scan a long string [[...]] (tv != NULL) or a long comment (tv == NULL)
+** with `sep' '=' separators. A newline directly after the opener is
+** skipped. For comments the buffer is periodically reset since the
+** content is discarded anyway.
+*/
+static void read_long_string(LexState *ls, TValue *tv, int sep)
+{
+  save_and_next(ls);  /* skip 2nd `[' */
+  if (currIsNewline(ls))  /* string starts with a newline? */
+    inclinenumber(ls);  /* skip it */
+  for (;;) {
+    switch (ls->current) {
+    case END_OF_STREAM:
+      lj_lex_error(ls, TK_eof, tv ? LJ_ERR_XLSTR : LJ_ERR_XLCOM);
+      break;
+    case ']':
+      if (skip_sep(ls) == sep) {
+	save_and_next(ls);  /* skip 2nd `]' */
+	goto endloop;
+      }
+      break;
+    case '\n':
+    case '\r':
+      save(ls, '\n');  /* Any line-end combo is normalized to '\n'. */
+      inclinenumber(ls);
+      if (!tv) lj_str_resetbuf(&ls->sb);  /* avoid wasting space */
+      break;
+    default:
+      if (tv) save_and_next(ls);
+      else next(ls);
+      break;
+    }
+  } endloop:
+  if (tv) {
+    /* Strip the delimiters: 2 bracket chars + sep '=' chars on each side. */
+    GCstr *str = lj_parse_keepstr(ls, ls->sb.buf + (2 + (MSize)sep),
+				      ls->sb.n - 2*(2 + (MSize)sep));
+    setstrV(ls->L, tv, str);
+  }
+}
+
+/* Scan a short string literal delimited by `delim' (single or double
+** quote) into *tv, processing backslash escapes. Unterminated strings
+** (EOF or raw newline) raise an error. Decimal escapes \ddd take up to
+** three digits and must fit in an unsigned char.
+*/
+static void read_string(LexState *ls, int delim, TValue *tv)
+{
+  save_and_next(ls);
+  while (ls->current != delim) {
+    switch (ls->current) {
+    case END_OF_STREAM:
+      lj_lex_error(ls, TK_eof, LJ_ERR_XSTR);
+      continue;
+    case '\n':
+    case '\r':
+      lj_lex_error(ls, TK_string, LJ_ERR_XSTR);
+      continue;
+    case '\\': {
+      int c;
+      next(ls);  /* do not save the `\' */
+      switch (ls->current) {
+      case 'a': c = '\a'; break;
+      case 'b': c = '\b'; break;
+      case 'f': c = '\f'; break;
+      case 'n': c = '\n'; break;
+      case 'r': c = '\r'; break;
+      case 't': c = '\t'; break;
+      case 'v': c = '\v'; break;
+      case '\n': case '\r': save(ls, '\n'); inclinenumber(ls); continue;
+      case END_OF_STREAM: continue;  /* will raise an error next loop */
+      default:
+	if (!lj_ctype_isdigit(ls->current)) {
+	  save_and_next(ls);  /* handles \\, \", \', and \? */
+	} else {  /* \xxx */
+	  int i = 0;
+	  c = 0;
+	  do {
+	    c = 10*c + (ls->current-'0');
+	    next(ls);
+	  } while (++i<3 && lj_ctype_isdigit(ls->current));
+	  if (c > UCHAR_MAX)
+	    lj_lex_error(ls, TK_string, LJ_ERR_XESC);
+	  save(ls, c);
+	}
+	continue;  /* Default cases save their own char; skip common save. */
+      }
+      save(ls, c);  /* Save the translated single-char escape. */
+      next(ls);
+      continue;
+      }
+    default:
+      save_and_next(ls);
+      break;
+    }
+  }
+  save_and_next(ls);  /* skip delimiter */
+  /* Strip the two quote characters before interning. */
+  setstrV(ls->L, tv, lj_parse_keepstr(ls, ls->sb.buf + 1, ls->sb.n - 2));
+}
+
+/* -- Main lexical scanner ------------------------------------------------ */
+
+/* Main lexical scanner: return the next token, storing any literal value
+** in *tv. Tokens are either TK_* codes or the character itself for
+** single-char tokens. Whitespace and comments are skipped here.
+*/
+static int llex(LexState *ls, TValue *tv)
+{
+  lj_str_resetbuf(&ls->sb);
+  for (;;) {
+    if (lj_ctype_isident(ls->current)) {
+      GCstr *s;
+      if (lj_ctype_isdigit(ls->current)) {  /* Numeric literal. */
+	read_numeral(ls, tv);
+	return TK_number;
+      }
+      /* Identifier or reserved word. */
+      do {
+	save_and_next(ls);
+      } while (lj_ctype_isident(ls->current));
+      s = lj_parse_keepstr(ls, ls->sb.buf, ls->sb.n);
+      if (s->reserved > 0)  /* Reserved word? */
+	return TK_OFS + s->reserved;
+      setstrV(ls->L, tv, s);
+      return TK_name;
+    }
+    switch (ls->current) {
+    case '\n':
+    case '\r':
+      inclinenumber(ls);
+      continue;
+    case ' ':
+    case '\t':
+    case '\v':
+    case '\f':
+      next(ls);
+      continue;
+    case '-':
+      next(ls);
+      if (ls->current != '-') return '-';
+      /* else is a comment */
+      next(ls);
+      if (ls->current == '[') {
+	int sep = skip_sep(ls);
+	lj_str_resetbuf(&ls->sb);  /* `skip_sep' may dirty the buffer */
+	if (sep >= 0) {
+	  read_long_string(ls, NULL, sep);  /* long comment */
+	  lj_str_resetbuf(&ls->sb);
+	  continue;
+	}
+      }
+      /* else short comment */
+      while (!currIsNewline(ls) && ls->current != END_OF_STREAM)
+	next(ls);
+      continue;
+    case '[': {
+      int sep = skip_sep(ls);
+      if (sep >= 0) {  /* Long string [=*[ ... ]=*]. */
+	read_long_string(ls, tv, sep);
+	return TK_string;
+      } else if (sep == -1) {  /* Plain single bracket. */
+	return '[';
+      } else {  /* Malformed delimiter, e.g. `[=='. */
+	lj_lex_error(ls, TK_string, LJ_ERR_XLDELIM);
+	continue;
+      }
+      }
+    case '=':
+      next(ls);
+      if (ls->current != '=') return '='; else { next(ls); return TK_eq; }
+    case '<':
+      next(ls);
+      if (ls->current != '=') return '<'; else { next(ls); return TK_le; }
+    case '>':
+      next(ls);
+      if (ls->current != '=') return '>'; else { next(ls); return TK_ge; }
+    case '~':
+      next(ls);
+      if (ls->current != '=') return '~'; else { next(ls); return TK_ne; }
+    case '"':
+    case '\'':
+      read_string(ls, ls->current, tv);
+      return TK_string;
+    case '.':
+      save_and_next(ls);
+      if (check_next(ls, ".")) {
+	if (check_next(ls, "."))
+	  return TK_dots;   /* ... */
+	else
+	  return TK_concat;   /* .. */
+      } else if (!lj_ctype_isdigit(ls->current)) {
+	return '.';
+      } else {  /* Number starting with a dot, e.g. `.5'. */
+	read_numeral(ls, tv);
+	return TK_number;
+      }
+    case END_OF_STREAM:
+      return TK_eof;
+    default: {
+      int c = ls->current;
+      next(ls);
+      return c;  /* Single-char tokens (+ - / ...). */
+    }
+    }
+  }
+}
+
+/* -- Lexer API ----------------------------------------------------------- */
+
+/* Set up the lexer state for a new chunk: initialize fields, allocate the
+** token buffer, skip a UTF-8 BOM and a POSIX #! line, reject precompiled
+** bytecode, and intern the chunk name.
+*/
+void lj_lex_start(lua_State *L, LexState *ls)
+{
+  ls->L = L;
+  ls->fs = NULL;
+  ls->n = 0;
+  ls->p = NULL;
+  ls->lookahead = TK_eof;  /* No look-ahead token. */
+  ls->linenumber = 1;
+  ls->lastline = 1;
+  lj_str_resizebuf(ls->L, &ls->sb, LJ_MIN_SBUF);
+  next(ls);  /* Read-ahead first char. */
+  if (ls->current == 0xef && ls->n >= 2 && char2int(ls->p[0]) == 0xbb &&
+      char2int(ls->p[1]) == 0xbf) {  /* Skip UTF-8 BOM (if buffered). */
+    ls->n -= 2;
+    ls->p += 2;
+    next(ls);
+  }
+  if (ls->current == '#') {  /* Skip POSIX #! header line. */
+    do {
+      next(ls);
+      if (ls->current == END_OF_STREAM) return;
+    } while (!currIsNewline(ls));
+    inclinenumber(ls);
+  }
+  if (ls->current == LUA_SIGNATURE[0]) {  /* Bytecode dump not supported. */
+    setstrV(L, L->top++, lj_err_str(L, LJ_ERR_XBCLOAD));
+    lj_err_throw(L, LUA_ERRSYNTAX);
+  }
+  /* This is an unanchored GCstr before it's stored in the prototype.
+  ** Do this last since next() calls the reader which may call the GC.
+  */
+  ls->chunkname = lj_str_newz(L, ls->chunkarg);
+}
+
+/* Advance to the next token, consuming a pending lookahead token first. */
+void lj_lex_next(LexState *ls)
+{
+  ls->lastline = ls->linenumber;
+  if (LJ_LIKELY(ls->lookahead == TK_eof)) {  /* No lookahead token? */
+    ls->token = llex(ls, &ls->tokenval);  /* Get next token. */
+  } else {  /* Otherwise return lookahead token. */
+    ls->token = ls->lookahead;
+    ls->lookahead = TK_eof;
+    ls->tokenval = ls->lookaheadval;
+  }
+}
+
+/* Peek at the next token without consuming the current one.
+** Only a single token of lookahead is supported (asserted).
+*/
+LexToken lj_lex_lookahead(LexState *ls)
+{
+  lua_assert(ls->lookahead == TK_eof);
+  ls->lookahead = llex(ls, &ls->lookaheadval);
+  return ls->lookahead;
+}
+
+/* Return a printable name for a token: the symbolic name for TK_* tokens,
+** or a formatted string (pushed on the Lua stack) for char tokens.
+*/
+const char *lj_lex_token2str(LexState *ls, LexToken token)
+{
+  if (token > TK_OFS)
+    return tokennames[token-TK_OFS-1];
+  else if (!lj_ctype_iscntrl(token))
+    return lj_str_pushf(ls->L, "%c", token);
+  else
+    return lj_str_pushf(ls->L, "char(%d)", token);
+}
+
+/* Raise a lexer error. token == 0 means no token context; for literal
+** tokens (name/string/number) the current buffer contents are shown,
+** otherwise the token's printable name.
+*/
+void lj_lex_error(LexState *ls, LexToken token, ErrMsg em, ...)
+{
+  const char *tok;
+  va_list argp;
+  if (token == 0) {
+    tok = NULL;
+  } else if (token == TK_name || token == TK_string || token == TK_number) {
+    save(ls, '\0');  /* Terminate buffer so it can be used as a C string. */
+    tok = ls->sb.buf;
+  } else {
+    tok = lj_lex_token2str(ls, token);
+  }
+  va_start(argp, em);
+  lj_err_lex(ls->L, strdata(ls->chunkname), tok, ls->linenumber, em, argp);
+  va_end(argp);
+}
+
+/* One-time lexer setup: intern all reserved words, pin them against GC
+** and tag them with their 1-based reserved index for fast lookup in llex.
+*/
+void lj_lex_init(lua_State *L)
+{
+  uint32_t i;
+  for (i = 0; i < TK_RESERVED; i++) {
+    GCstr *s = lj_str_newz(L, tokennames[i]);
+    fixstring(s);  /* Reserved words are never collected. */
+    s->reserved = cast_byte(i+1);
+  }
+}
+

+ 63 - 0
src/lj_lex.h

@@ -0,0 +1,63 @@
+/*
+** Lexical analyzer.
+** Major parts taken verbatim from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#ifndef _LJ_LEX_H
+#define _LJ_LEX_H
+
+#include <stdarg.h>
+
+#include "lj_obj.h"
+#include "lj_err.h"
+
+/* Lua lexer tokens. */
+#define TKDEF(_, __) \
+  _(and) _(break) _(do) _(else) _(elseif) _(end) _(false) \
+  _(for) _(function) _(if) _(in) _(local) _(nil) _(not) _(or) \
+  _(repeat) _(return) _(then) _(true) _(until) _(while) \
+  __(concat, ..) __(dots, ...) __(eq, ==) __(ge, >=) __(le, <=) __(ne, ~=) \
+  __(number, <number>) __(name, <name>) __(string, <string>) __(eof, <eof>)
+
+enum {
+  TK_OFS = 256,
+#define TKENUM1(name)		TK_##name,
+#define TKENUM2(name, sym)	TK_##name,
+TKDEF(TKENUM1, TKENUM2)
+#undef TKENUM1
+#undef TKENUM2
+  TK_RESERVED = TK_while - TK_OFS
+};
+
+typedef int LexToken;
+
+/* Lua lexer state. */
+typedef struct LexState {
+  struct FuncState *fs;	/* Current FuncState. Defined in lj_parse.c. */
+  struct lua_State *L;	/* Lua state. */
+  TValue tokenval;	/* Current token value. */
+  TValue lookaheadval;	/* Lookahead token value. */
+  int current;		/* Current character (charint). */
+  LexToken token;	/* Current token. */
+  LexToken lookahead;	/* Lookahead token. */
+  SBuf sb;		/* String buffer for tokens. */
+  const char *p;	/* Current position in input buffer. */
+  MSize n;		/* Bytes left in input buffer. */
+  lua_Reader rfunc;	/* Reader callback. */
+  void *rdata;		/* Reader callback data. */
+  BCLine linenumber;	/* Input line counter. */
+  BCLine lastline;	/* Line of last token. */
+  GCstr *chunkname;	/* Current chunk name (interned string). */
+  const char *chunkarg;	/* Chunk name argument. */
+  uint32_t level;	/* Syntactical nesting level. */
+} LexState;
+
+LJ_FUNC void lj_lex_start(lua_State *L, LexState *ls);
+LJ_FUNC void lj_lex_next(LexState *ls);
+LJ_FUNC LexToken lj_lex_lookahead(LexState *ls);
+LJ_FUNC const char *lj_lex_token2str(LexState *ls, LexToken token);
+LJ_FUNC_NORET void lj_lex_error(LexState *ls, LexToken token, ErrMsg em, ...);
+LJ_FUNC void lj_lex_init(lua_State *L);
+
+#endif

+ 216 - 0
src/lj_lib.c

@@ -0,0 +1,216 @@
+/*
+** Library function support.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_lib_c
+#define LUA_CORE
+
+#include "lauxlib.h"
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_func.h"
+#include "lj_vm.h"
+#include "lj_lib.h"
+
+/* -- Library initialization ---------------------------------------------- */
+
+/* Find or create the table for a library and leave it on the stack top.
+** With a libname, looks it up in registry._LOADED, creating and
+** registering it in the globals (via luaL_findtable) on a miss; an
+** existing non-table global of that name is an error. Without a libname,
+** just creates an anonymous table.
+*/
+static GCtab *lib_create_table(lua_State *L, const char *libname, int hsize)
+{
+  if (libname) {
+    luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 16);
+    lua_getfield(L, -1, libname);
+    if (!tvistab(L->top-1)) {
+      L->top--;
+      if (luaL_findtable(L, LUA_GLOBALSINDEX, libname, hsize) != NULL)
+	lj_err_callerv(L, LJ_ERR_BADMODN, libname);
+      settabV(L, L->top, tabV(L->top-1));
+      L->top++;
+      lua_setfield(L, -3, libname);  /* _LOADED[libname] = new table */
+    }
+    L->top--;
+    settabV(L, L->top-1, tabV(L->top));  /* Drop _LOADED, keep lib table. */
+  } else {
+    lua_createtable(L, 0, hsize);
+  }
+  return tabV(L->top-1);
+}
+
+/* Register a library from a buildvm-generated init byte stream `p' plus a
+** parallel table of C functions `cf'. The stream interleaves tagged
+** function records (name + optional ASM gate offset) with LIBINIT_STRING
+** records that push values, set upvalue environments, copy stack slots,
+** etc. Values pushed above `tpos' become upvalues of the next function.
+*/
+void lj_lib_register(lua_State *L, const char *libname,
+		     const uint8_t *p, const lua_CFunction *cf)
+{
+  GCtab *env = tabref(L->env);
+  GCfunc *ofn = NULL;
+  int ffid = *p++;  /* Stream header: first fast-function id, ... */
+  GCtab *tab = lib_create_table(L, libname, *p++);  /* ... then hash size. */
+  ptrdiff_t tpos = L->top - L->base;
+
+  /* Avoid barriers further down. */
+  if (isblack(obj2gco(tab))) lj_gc_barrierback(G(L), tab);
+  tab->nomm = 0;
+
+  for (;;) {
+    uint32_t tag = *p++;  /* Tag byte: tag bits + name length. */
+    MSize len = tag & LIBINIT_LENMASK;
+    tag &= LIBINIT_TAGMASK;
+    if (tag != LIBINIT_STRING) {  /* Function record. */
+      const char *name;
+      MSize nuv = (MSize)(L->top - L->base - tpos);
+      GCfunc *fn = lj_func_newC(L, nuv, env);
+      if (nuv) {  /* Consume pending stack values as upvalues. */
+	L->top = L->base + tpos;
+	memcpy(fn->c.upvalue, L->top, sizeof(TValue)*nuv);
+      }
+      fn->c.ffid = (uint8_t)(ffid++);
+      name = (const char *)p;
+      p += len;
+      if (tag != LIBINIT_CF) {  /* ASM gate offset follows (little-endian). */
+	fn->c.gate = makeasmfunc(p[0] + (p[1] << 8));
+	p += 2;
+      }
+      /* NOTE(review): LIBINIT_ASM_ assumes a preceding function record
+      ** (ofn != NULL) -- presumably guaranteed by buildvm's output order.
+      */
+      if (tag == LIBINIT_ASM_)
+	fn->c.f = ofn->c.f;  /* Copy handler from previous function. */
+      else
+	fn->c.f = *cf++;  /* Get cf or handler from C function table. */
+      if (len) {  /* Anonymous helpers (len == 0) aren't stored in tab. */
+	/* NOBARRIER: See above for common barrier. */
+	setfuncV(L, lj_tab_setstr(L, tab, lj_str_new(L, name, len)), fn);
+      }
+      ofn = fn;
+    } else {  /* String/value record, dispatched on combined tag|len. */
+      switch (tag | len) {
+      case LIBINIT_SET:
+	L->top -= 2;
+	if (tvisstr(L->top+1) && strV(L->top+1)->len == 0)
+	  env = tabV(L->top);  /* Empty string key: set upvalue env. */
+	else  /* NOBARRIER: See above for common barrier. */
+	  copyTV(L, lj_tab_set(L, tab, L->top+1), L->top);
+	break;
+      case LIBINIT_NUMBER:
+	memcpy(&L->top->n, p, sizeof(double));  /* Unaligned-safe copy. */
+	L->top++;
+	p += sizeof(double);
+	break;
+      case LIBINIT_COPY:
+	copyTV(L, L->top, L->top - *p++);
+	L->top++;
+	break;
+      case LIBINIT_LASTCL:
+	setfuncV(L, L->top++, ofn);  /* Push last created closure. */
+	break;
+      case LIBINIT_FFID:
+	ffid++;  /* Skip a fast-function id without creating a closure. */
+	break;
+      case LIBINIT_END:
+	return;
+      default:  /* Plain string of length `len': push it. */
+	setstrV(L, L->top++, lj_str_new(L, (const char *)p, len));
+	p += len;
+	break;
+      }
+    }
+  }
+}
+
+/* -- Type checks --------------------------------------------------------- */
+
+/* Check that argument narg exists (any type); error otherwise. */
+TValue *lj_lib_checkany(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  if (o >= L->top)
+    lj_err_arg(L, narg, LJ_ERR_NOVAL);
+  return o;
+}
+
+/* Check for a string argument, coercing a number argument to a string
+** in-place on the stack; error on any other type or missing argument.
+*/
+GCstr *lj_lib_checkstr(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  if (o < L->top) {
+    if (LJ_LIKELY(tvisstr(o))) {
+      return strV(o);
+    } else if (tvisnum(o)) {
+      GCstr *s = lj_str_fromnum(L, &o->n);
+      setstrV(L, o, s);  /* Coerce in-place so later reads see a string. */
+      return s;
+    }
+  }
+  lj_err_argt(L, narg, LUA_TSTRING);
+  return NULL;  /* unreachable */
+}
+
+/* Optional string argument: NULL if missing or nil, else like checkstr. */
+GCstr *lj_lib_optstr(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  return (o < L->top && !tvisnil(o)) ? lj_lib_checkstr(L, narg) : NULL;
+}
+
+/* Check for a number argument, coercing a convertible string argument to
+** a number in-place on the stack; error otherwise.
+*/
+lua_Number lj_lib_checknum(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  if (!(o < L->top &&
+	(tvisnum(o) || (tvisstr(o) && lj_str_numconv(strVdata(o), o)))))
+    lj_err_argt(L, narg, LUA_TNUMBER);
+  return numV(o);
+}
+
+/* Check for a number argument and convert it to int32 via lj_num2int. */
+int32_t lj_lib_checkint(lua_State *L, int narg)
+{
+  return lj_num2int(lj_lib_checknum(L, narg));
+}
+
+/* Optional int argument: `def' if missing or nil, else like checkint. */
+int32_t lj_lib_optint(lua_State *L, int narg, int32_t def)
+{
+  TValue *o = L->base + narg-1;
+  return (o < L->top && !tvisnil(o)) ? lj_lib_checkint(L, narg) : def;
+}
+
+/* Check for a function argument; error otherwise (no coercion). */
+GCfunc *lj_lib_checkfunc(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  if (!(o < L->top && tvisfunc(o)))
+    lj_err_argt(L, narg, LUA_TFUNCTION);
+  return funcV(o);
+}
+
+/* Check for a table argument; error otherwise (no coercion). */
+GCtab *lj_lib_checktab(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  if (!(o < L->top && tvistab(o)))
+    lj_err_argt(L, narg, LUA_TTABLE);
+  return tabV(o);
+}
+
+/* Check for a table argument, also accepting nil/missing (returns NULL);
+** any other type is an error.
+*/
+GCtab *lj_lib_checktabornil(lua_State *L, int narg)
+{
+  TValue *o = L->base + narg-1;
+  if (o < L->top) {
+    if (tvistab(o))
+      return tabV(o);
+    else if (tvisnil(o))
+      return NULL;
+  }
+  lj_err_arg(L, narg, LJ_ERR_NOTABN);
+  return NULL;  /* unreachable */
+}
+
+/* Match a string argument against an option list and return its index.
+** `lst' is a sequence of length-prefixed strings (one length byte, then
+** the chars), terminated by a zero length byte. With def >= 0 the
+** argument is optional and `def' is returned when it is missing/nil;
+** an unknown option always errors out.
+*/
+int lj_lib_checkopt(lua_State *L, int narg, int def, const char *lst)
+{
+  GCstr *s = def >= 0 ? lj_lib_optstr(L, narg) : lj_lib_checkstr(L, narg);
+  if (s) {
+    const char *opt = strdata(s);
+    MSize len = s->len;
+    int i;
+    for (i = 0; *(const uint8_t *)lst; i++) {
+      if (*(const uint8_t *)lst == len && memcmp(opt, lst+1, len) == 0)
+	return i;
+      lst += 1+*(const uint8_t *)lst;  /* Skip length byte + string. */
+    }
+    lj_err_argv(L, narg, LJ_ERR_INVOPTM, opt);
+  }
+  return def;
+}
+

+ 84 - 0
src/lj_lib.h

@@ -0,0 +1,84 @@
+/*
+** Library function support.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_LIB_H
+#define _LJ_LIB_H
+
+#include "lj_obj.h"
+
+/*
+** A fallback handler is called by the assembler VM if the fast path fails:
+**
+** - too few arguments:   unrecoverable.
+** - wrong argument type:   recoverable, if coercion succeeds.
+** - bad argument value:  unrecoverable.
+** - stack overflow:        recoverable, if stack reallocation succeeds.
+** - extra handling:        recoverable.
+**
+** The unrecoverable cases throw an error with lj_err_arg(), lj_err_argtype(),
+** lj_err_caller() or lj_err_callermsg().
+** The recoverable cases return 0 or the number of results + 1.
+** The assembler VM retries the fast path only if 0 is returned.
+** This time the fallback must not be called again or it gets stuck in a loop.
+*/
+
+/* Return values from fallback handler. */
+#define FFH_RETRY	0
+#define FFH_UNREACHABLE	FFH_RETRY
+#define FFH_RES(n)	((n)+1)
+
+LJ_FUNC TValue *lj_lib_checkany(lua_State *L, int narg);
+LJ_FUNC GCstr *lj_lib_checkstr(lua_State *L, int narg);
+LJ_FUNC GCstr *lj_lib_optstr(lua_State *L, int narg);
+LJ_FUNC lua_Number lj_lib_checknum(lua_State *L, int narg);
+LJ_FUNC int32_t lj_lib_checkint(lua_State *L, int narg);
+LJ_FUNC int32_t lj_lib_optint(lua_State *L, int narg, int32_t def);
+LJ_FUNC GCfunc *lj_lib_checkfunc(lua_State *L, int narg);
+LJ_FUNC GCtab *lj_lib_checktab(lua_State *L, int narg);
+LJ_FUNC GCtab *lj_lib_checktabornil(lua_State *L, int narg);
+LJ_FUNC int lj_lib_checkopt(lua_State *L, int narg, int def, const char *lst);
+
+#define lj_lib_opt(L, narg, gotarg, noarg) \
+  { TValue *_o = L->base + (narg)-1; \
+    if (_o < L->top && !tvisnil(_o)) { gotarg } else { noarg } }
+
+/* Avoid including lj_frame.h. */
+#define lj_lib_upvalue(L, n) \
+  (&gcref((L->base-1)->fr.func)->fn.c.upvalue[(n)-1])
+
+/* Library function declarations. Scanned by buildvm. */
+#define LJLIB_CF(name)		static int lj_cf_##name(lua_State *L)
+#define LJLIB_ASM(name)		static int lj_ffh_##name(lua_State *L)
+#define LJLIB_ASM_(name)
+#define LJLIB_SET(name)
+#define LJLIB_PUSH(arg)
+#define LJLIB_REC(handler)
+#define LJLIB_NOREGUV
+#define LJLIB_NOREG
+
+#define LJ_LIB_REG(L, name) \
+  lj_lib_register(L, #name, lj_lib_init_##name, lj_lib_cf_##name)
+#define LJ_LIB_REG_(L, regname, name) \
+  lj_lib_register(L, regname, lj_lib_init_##name, lj_lib_cf_##name)
+
+LJ_FUNC void lj_lib_register(lua_State *L, const char *libname,
+			     const uint8_t *init, const lua_CFunction *cf);
+
+/* Library init data tags.
+** A plain entry byte holds a tag in the upper 2 bits (LIBINIT_TAGMASK)
+** and a length in the lower 6 bits (LIBINIT_LENMASK). String lengths are
+** capped at LIBINIT_MAXSTR (0x39), so LIBINIT_STRING|len never reaches
+** the special marker values 0xfa..0xff below.
+*/
+#define LIBINIT_LENMASK	0x3f
+#define LIBINIT_TAGMASK	0xc0
+#define LIBINIT_CF	0x00
+#define LIBINIT_ASM	0x40
+#define LIBINIT_ASM_	0x80
+#define LIBINIT_STRING	0xc0
+#define LIBINIT_MAXSTR	0x39
+#define LIBINIT_SET	0xfa
+#define LIBINIT_NUMBER	0xfb
+#define LIBINIT_COPY	0xfc
+#define LIBINIT_LASTCL	0xfd
+#define LIBINIT_FFID	0xfe
+#define LIBINIT_END	0xff
+#endif

+ 260 - 0
src/lj_mcode.c

@@ -0,0 +1,260 @@
+/*
+** Machine code management.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_mcode_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_jit.h"
+#include "lj_mcode.h"
+#include "lj_trace.h"
+#include "lj_dispatch.h"
+
+/* -- OS-specific functions ----------------------------------------------- */
+
+#if defined(LUA_USE_WIN)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+#define MCPROT_RW	PAGE_READWRITE
+#define MCPROT_RX	PAGE_EXECUTE_READ
+#define MCPROT_RWX	PAGE_EXECUTE_READWRITE
+
+/* Allocate a memory area with the given page protection.
+** Throws LJ_TRERR_MCODEAL (via longjmp) on allocation failure.
+** MEM_TOP_DOWN biases the allocation towards high addresses.
+*/
+static LJ_AINLINE void *mcode_alloc(jit_State *J, size_t sz, DWORD prot)
+{
+  void *p = VirtualAlloc(NULL, sz, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN, prot);
+  if (!p)
+    lj_trace_err(J, LJ_TRERR_MCODEAL);
+  return p;
+}
+
+/* Release an area obtained from mcode_alloc. */
+static LJ_AINLINE void mcode_free(jit_State *J, void *p, size_t sz)
+{
+  UNUSED(J); UNUSED(sz);
+  VirtualFree(p, 0, MEM_RELEASE);
+}
+
+/* Change the page protection of an area. Old protection is discarded. */
+static LJ_AINLINE void mcode_setprot(void *p, size_t sz, DWORD prot)
+{
+  DWORD oprot;
+  VirtualProtect(p, sz, prot, &oprot);
+}
+
+#elif defined(LUA_USE_POSIX)
+
+#include <sys/mman.h>
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS	MAP_ANON
+#endif
+
+#define MCPROT_RW	(PROT_READ|PROT_WRITE)
+#define MCPROT_RX	(PROT_READ|PROT_EXEC)
+#define MCPROT_RWX	(PROT_READ|PROT_WRITE|PROT_EXEC)
+
+/* Allocate an anonymous private mapping with the given protection.
+** Throws LJ_TRERR_MCODEAL (via longjmp) on allocation failure.
+*/
+static LJ_AINLINE void *mcode_alloc(jit_State *J, size_t sz, int prot)
+{
+  void *p = mmap(NULL, sz, prot, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (p == MAP_FAILED)
+    lj_trace_err(J, LJ_TRERR_MCODEAL);
+  return p;
+}
+
+/* Release a mapping obtained from mcode_alloc. */
+static LJ_AINLINE void mcode_free(lua_State *J, void *p, size_t sz)
+{
+  UNUSED(J);
+  munmap(p, sz);
+}
+
+/* Change the page protection of a mapping.
+** NOTE(review): the mprotect return value is ignored here -- confirm
+** failure cannot leave the area in an unusable protection state.
+*/
+static LJ_AINLINE void mcode_setprot(void *p, size_t sz, int prot)
+{
+  mprotect(p, sz, prot);
+}
+
+#else
+
+/* Fallback allocator. This will fail if memory is not executable by default. */
+#define LUAJIT_UNPROTECT_MCODE
+#define MCPROT_RW	0
+#define MCPROT_RX	0
+#define MCPROT_RWX	0
+
+/* Fallback: plain GC-managed memory with no page protection control.
+** Only works on platforms where heap memory is executable by default.
+*/
+static LJ_AINLINE void *mcode_alloc(jit_State *J, size_t sz, int prot)
+{
+  UNUSED(prot);
+  return lj_mem_new(J->L, sz);
+}
+
+static LJ_AINLINE void mcode_free(jit_State *J, void *p, size_t sz)
+{
+  lj_mem_free(J2G(J), p, sz);
+}
+
+/* No protection changes possible in this mode. */
+#define mcode_setprot(p, sz, prot)	UNUSED(p)
+
+#endif
+
+/* -- MCode area management ----------------------------------------------- */
+
+/* Define this ONLY if the page protection twiddling becomes a bottleneck. */
+#ifdef LUAJIT_UNPROTECT_MCODE
+
+/* It's generally considered to be a potential security risk to have
+** pages with simultaneous write *and* execute access in a process.
+**
+** Do not even think about using this mode for server processes or
+** apps handling untrusted external data (such as a browser).
+**
+** The security risk is not in LuaJIT itself -- but if an adversary finds
+** any *other* flaw in your C application logic, then any RWX memory page
+** simplifies writing an exploit considerably.
+*/
+#define MCPROT_GEN	MCPROT_RWX
+#define MCPROT_RUN	MCPROT_RWX
+
+#else
+
+/* This is the default behaviour and much safer:
+**
+** Most of the time the memory pages holding machine code are executable,
+** but NONE of them is writable.
+**
+** The current memory area is marked read-write (but NOT executable) only
+** during the short time window while the assembler generates machine code.
+*/
+#define MCPROT_GEN	MCPROT_RW
+#define MCPROT_RUN	MCPROT_RX
+
+#endif
+
+/* Change protection of the current MCode area.
+** J->mcprot caches the current protection, so redundant syscalls are
+** avoided when the requested protection is already in effect.
+*/
+static void mcode_protect(jit_State *J, int prot)
+{
+#ifdef LUAJIT_UNPROTECT_MCODE
+  UNUSED(J); UNUSED(prot);  /* Pages stay RWX; nothing to do. */
+#else
+  if (J->mcprot != prot) {
+    mcode_setprot(J->mcarea, J->szmcarea, prot);
+    J->mcprot = prot;
+  }
+#endif
+}
+
+/* Linked list of MCode areas. The link lives at the start of each area. */
+typedef struct MCLink {
+  MCode *next;		/* Next area. */
+  size_t size;		/* Size of current area. */
+} MCLink;
+
+/* Allocate a new MCode area and make it the current one.
+** The requested size (JIT_P_sizemcode, in KB) is rounded up to the page
+** size. The new area's MCLink header chains to the previous current area.
+** Code is placed between mcbot (just after the header) and mctop.
+*/
+static void mcode_allocarea(jit_State *J)
+{
+  MCode *oldarea = J->mcarea;
+  size_t sz = (size_t)J->param[JIT_P_sizemcode] << 10;
+  sz = (sz + LJ_PAGESIZE-1) & ~(size_t)(LJ_PAGESIZE - 1);  /* Page-align. */
+  J->mcarea = (MCode *)mcode_alloc(J, sz, MCPROT_GEN);
+  J->szmcarea = sz;
+  J->mcprot = MCPROT_GEN;
+  J->mctop = (MCode *)((char *)J->mcarea + J->szmcarea);
+  J->mcbot = (MCode *)((char *)J->mcarea + sizeof(MCLink));
+  ((MCLink *)J->mcarea)->next = oldarea;
+  ((MCLink *)J->mcarea)->size = sz;
+  J->szallmcarea += sz;
+}
+
+/* Free the whole chain of MCode areas and reset the bookkeeping. */
+void lj_mcode_free(jit_State *J)
+{
+  MCode *area = J->mcarea;
+  J->mcarea = NULL;
+  J->szallmcarea = 0;
+  while (area != NULL) {
+    MCLink *link = (MCLink *)area;
+    MCode *nextarea = link->next;
+    mcode_free(J, area, link->size);
+    area = nextarea;
+  }
+}
+
+/* -- MCode transactions -------------------------------------------------- */
+
+/* Reserve the remainder of the current MCode area for code generation.
+** Returns the top of the area; *lim receives the lower bound (mcbot).
+*/
+MCode *lj_mcode_reserve(jit_State *J, MCode **lim)
+{
+  if (J->mcarea)
+    mcode_protect(J, MCPROT_GEN);  /* Make the area writable. */
+  else
+    mcode_allocarea(J);  /* First use: allocate the initial area. */
+  *lim = J->mcbot;
+  return J->mctop;
+}
+
+/* Commit the top part of the current MCode area.
+** top becomes the new start of generated code; the area is switched back
+** to run protection.
+*/
+void lj_mcode_commit(jit_State *J, MCode *top)
+{
+  J->mctop = top;
+  mcode_protect(J, MCPROT_RUN);
+}
+
+/* Abort the reservation: restore run protection without moving mctop. */
+void lj_mcode_abort(jit_State *J)
+{
+  mcode_protect(J, MCPROT_RUN);
+}
+
+/* Set/reset protection to allow patching of MCode areas.
+** finish=0: make the area containing ptr writable and return its base
+** (the caller is expected to pass that base back as ptr with finish=1).
+** finish=1: restore run protection on the area and return NULL.
+*/
+MCode *lj_mcode_patch(jit_State *J, MCode *ptr, int finish)
+{
+#ifdef LUAJIT_UNPROTECT_MCODE
+  UNUSED(J); UNUSED(ptr); UNUSED(finish);  /* Always RWX: nothing to do. */
+  return NULL;
+#else
+  if (finish) {
+    if (J->mcarea == ptr)
+      mcode_protect(J, MCPROT_RUN);  /* Current area: use protection cache. */
+    else
+      mcode_setprot(ptr, ((MCLink *)ptr)->size, MCPROT_RUN);
+    return NULL;
+  } else {
+    MCode *mc = J->mcarea;
+    /* Try current area first to use the protection cache. */
+    if (ptr >= mc && ptr < mc + J->szmcarea) {
+      mcode_protect(J, MCPROT_GEN);
+      return mc;
+    }
+    /* Otherwise search through the list of MCode areas. */
+    for (;;) {
+      mc = ((MCLink *)mc)->next;
+      lua_assert(mc != NULL);  /* ptr must lie inside some area. */
+      if (ptr >= mc && ptr < mc + ((MCLink *)mc)->size) {
+	mcode_setprot(mc, ((MCLink *)mc)->size, MCPROT_GEN);
+	return mc;
+      }
+    }
+  }
+#endif
+}
+
+/* Limit of MCode reservation reached. Never returns: every path throws.
+** - LJ_TRERR_MCODEOV if the request cannot fit into any single area.
+** - LJ_TRERR_MCODEAL if allocating another area would exceed maxmcode.
+** - Otherwise a fresh area is allocated and LJ_TRERR_MCODELM is thrown
+**   so the caller retries code generation in the new area.
+*/
+void lj_mcode_limiterr(jit_State *J, size_t need)
+{
+  size_t sizemcode, maxmcode;
+  lj_mcode_abort(J);  /* Restore run protection on the current area. */
+  sizemcode = (size_t)J->param[JIT_P_sizemcode] << 10;
+  sizemcode = (sizemcode + LJ_PAGESIZE-1) & ~(size_t)(LJ_PAGESIZE - 1);
+  maxmcode = (size_t)J->param[JIT_P_maxmcode] << 10;
+  if ((size_t)need > sizemcode)
+    lj_trace_err(J, LJ_TRERR_MCODEOV);  /* Too long for any area. */
+  if (J->szallmcarea + sizemcode > maxmcode)
+    lj_trace_err(J, LJ_TRERR_MCODEAL);
+  mcode_allocarea(J);
+  lj_trace_err(J, LJ_TRERR_MCODELM);  /* Retry with new area. */
+}
+
+#endif

+ 23 - 0
src/lj_mcode.h

@@ -0,0 +1,23 @@
+/*
+** Machine code management.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_MCODE_H
+#define _LJ_MCODE_H
+
+#include "lj_jit.h"
+
+#if LJ_HASJIT
+/* Free all MCode areas of a JIT state. */
+LJ_FUNC void lj_mcode_free(jit_State *J);
+/* Reserve/commit/abort cycle for code generation; see lj_mcode.c. */
+LJ_FUNC MCode *lj_mcode_reserve(jit_State *J, MCode **lim);
+LJ_FUNC void lj_mcode_commit(jit_State *J, MCode *m);
+LJ_FUNC void lj_mcode_abort(jit_State *J);
+/* Toggle write access on an area for patching existing code. */
+LJ_FUNC MCode *lj_mcode_patch(jit_State *J, MCode *ptr, int finish);
+LJ_FUNC_NORET void lj_mcode_limiterr(jit_State *J, size_t need);
+
+/* Advance the bottom-of-area pointer after bottom-up code generation. */
+#define lj_mcode_commitbot(J, m)	(J->mcbot = (m))
+
+#endif
+
+#endif

+ 358 - 0
src/lj_meta.c

@@ -0,0 +1,358 @@
+/*
+** Metamethod handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_meta_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_meta.h"
+#include "lj_bc.h"
+#include "lj_vm.h"
+
+/* -- Metamethod handling ------------------------------------------------- */
+
+/* String interning of metamethod names for fast indexing.
+** MMDEF(MMNAME) expands to one concatenated string literal
+** "__index__newindex..."; the scanner below splits it at each "__"
+** boundary. NOTE: this assumes no metamethod name itself contains '_'.
+*/
+void lj_meta_init(lua_State *L)
+{
+#define MMNAME(name)	"__" #name
+  const char *metanames = MMDEF(MMNAME);
+#undef MMNAME
+  global_State *g = G(L);
+  const char *p, *q;
+  uint32_t i;
+  for (i = 0, p = metanames; *p; i++, p = q) {
+    GCstr *s;
+    /* Skip the leading "__", then stop at the next name's '_' (or NUL). */
+    for (q = p+2; *q && *q != '_'; q++) ;
+    s = lj_str_new(L, p, (size_t)(q-p));
+    fixstring(s);  /* Never collect these names. */
+    /* NOBARRIER: g->mmname[] is a GC root. */
+    setgcref(g->mmname[i], obj2gco(s));
+  }
+}
+
+/* Negative caching of a few fast metamethods. See the lj_meta_fast() macro.
+** When the metamethod is absent the corresponding bit in mt->nomm is set,
+** so subsequent lj_meta_fast() lookups short-circuit without a table probe.
+*/
+cTValue *lj_meta_cache(GCtab *mt, MMS mm, GCstr *name)
+{
+  cTValue *mo = lj_tab_getstr(mt, name);
+  lua_assert(mm <= MM_FAST);  /* Only fast metamethods are cached. */
+  if (!mo || tvisnil(mo)) {  /* No metamethod? */
+    mt->nomm |= cast_byte(1u<<mm);  /* Set negative cache flag. */
+    return NULL;
+  }
+  return mo;
+}
+
+/* Lookup metamethod for object.
+** Tables and userdata carry their own metatable; all other types use the
+** per-type base metatable in G(L)->basemt. Returns niltv(L) -- not NULL --
+** when no metamethod is found.
+*/
+cTValue *lj_meta_lookup(lua_State *L, cTValue *o, MMS mm)
+{
+  GCtab *mt;
+  if (tvistab(o))
+    mt = tabref(tabV(o)->metatable);
+  else if (tvisudata(o))
+    mt = tabref(udataV(o)->metatable);
+  else
+    mt = tabref(G(L)->basemt[itypemap(o)]);
+  if (mt) {
+    cTValue *mo = lj_tab_getstr(mt, strref(G(L)->mmname[mm]));
+    if (mo)
+      return mo;
+  }
+  return niltv(L);
+}
+
+/* Setup call to metamethod to be run by Assembler VM.
+** Places continuation, metamethod and both arguments above the current
+** stack top and returns the new call base (top+2).
+*/
+static TValue *mmcall(lua_State *L, ASMFunction cont, cTValue *mo,
+		    cTValue *a, cTValue *b)
+{
+  /*
+  **           |-- framesize -> top       top+1       top+2 top+3
+  ** before:   [func slots ...]
+  ** mm setup: [func slots ...] [cont|?]  [mo|tmtype] [a]   [b]
+  ** in asm:   [func slots ...] [cont|PC] [mo|delta]  [a]   [b]
+  **           ^-- func base                          ^-- mm base
+  ** after mm: [func slots ...]           [result]
+  **                ^-- copy to base[PC_RA] --/     for lj_cont_ra
+  **                          istruecond + branch   for lj_cont_cond*
+  **                                       ignore   for lj_cont_nop
+  ** next PC:  [func slots ...]
+  */
+  TValue *top = L->top;
+  /* For Lua frames use curr_topL -- presumably to start above the frame's
+  ** fixed slots; confirm against the frame layout in lj_frame.h. */
+  if (curr_funcisL(L)) top = curr_topL(L);
+  setcont(top, cont);  /* Assembler VM stores PC in upper word. */
+  copyTV(L, top+1, mo);  /* Store metamethod and two arguments. */
+  copyTV(L, top+2, a);
+  copyTV(L, top+3, b);
+  return top+2;  /* Return new base. */
+}
+
+/* -- C helpers for some instructions, called from assembler VM ----------- */
+
+/* Helper for TGET*. __index chain and metamethod.
+** Returns the value slot on success, or NULL after arranging a metamethod
+** call (L->top is set up via mmcall and the VM performs the call).
+** A non-function __index restarts the lookup on that object, bounded by
+** LJ_MAX_IDXCHAIN to catch cyclic chains.
+*/
+cTValue *lj_meta_tget(lua_State *L, cTValue *o, cTValue *k)
+{
+  int loop;
+  for (loop = 0; loop < LJ_MAX_IDXCHAIN; loop++) {
+    cTValue *mo;
+    if (tvistab(o)) {
+      GCtab *t = tabV(o);
+      cTValue *tv = lj_tab_get(L, t, k);
+      /* Hit, or miss with no __index metamethod: return the slot as-is. */
+      if (!tvisnil(tv) ||
+	  !(mo = lj_meta_fast(L, tabref(t->metatable), MM_index)))
+	return tv;
+    } else if (tvisnil(mo = lj_meta_lookup(L, o, MM_index))) {
+      lj_err_optype(L, o, LJ_ERR_OPINDEX);
+      return NULL;  /* unreachable */
+    }
+    if (tvisfunc(mo)) {
+      L->top = mmcall(L, lj_cont_ra, mo, o, k);
+      return NULL;  /* Trigger metamethod call. */
+    }
+    o = mo;  /* Retry lookup on the __index object itself. */
+  }
+  lj_err_msg(L, LJ_ERR_GETLOOP);
+  return NULL;  /* unreachable */
+}
+
+/* Helper for TSET*. __newindex chain and metamethod.
+** Returns the writable value slot on success, or NULL after arranging a
+** metamethod call (the caller stores the value at L->top+2).
+** Bounded by LJ_MAX_IDXCHAIN to catch cyclic __newindex chains.
+*/
+TValue *lj_meta_tset(lua_State *L, cTValue *o, cTValue *k)
+{
+  TValue tmp;
+  int loop;
+  for (loop = 0; loop < LJ_MAX_IDXCHAIN; loop++) {
+    cTValue *mo;
+    if (tvistab(o)) {
+      GCtab *t = tabV(o);
+      TValue *tv = lj_tab_set(L, t, k);
+      if (!tvisnil(tv) ||
+	  !(mo = lj_meta_fast(L, tabref(t->metatable), MM_newindex))) {
+	/* Caller will store a GC value: back-barrier a black table. */
+	if (isblack(obj2gco(t))) lj_gc_barrierback(G(L), t);
+	return tv;
+      }
+    } else if (tvisnil(mo = lj_meta_lookup(L, o, MM_newindex))) {
+      lj_err_optype(L, o, LJ_ERR_OPINDEX);
+      return NULL;  /* unreachable */
+    }
+    if (tvisfunc(mo)) {
+      L->top = mmcall(L, lj_cont_nop, mo, o, k);
+      /* L->top+2 = v filled in by caller. */
+      return NULL;  /* Trigger metamethod call. */
+    }
+    /* Non-function __newindex: retry on a stack-local copy, since the
+    ** lookup chain must not alias the (possibly moving) original slot. */
+    copyTV(L, &tmp, mo);
+    o = &tmp;
+  }
+  lj_err_msg(L, LJ_ERR_SETLOOP);
+  return NULL;  /* unreachable */
+}
+
+/* Coerce o to a number: return o itself if it already is one, n if the
+** string converted into *n, or NULL if no coercion applies.
+*/
+static cTValue *str2num(cTValue *o, TValue *n)
+{
+  if (tvisnum(o))
+    return o;
+  if (tvisstr(o) && lj_str_numconv(strVdata(o), n))
+    return n;
+  return NULL;
+}
+
+/* Helper for arithmetic instructions. Coercion, metamethod.
+** If both operands coerce to numbers the operation is folded directly
+** (result stored in ra, NULL returned). Otherwise the metamethod is
+** looked up on rb first, then rc; if neither has one, the error blames
+** whichever operand failed to coerce.
+*/
+TValue *lj_meta_arith(lua_State *L, TValue *ra, cTValue *rb, cTValue *rc,
+		      BCReg op)
+{
+  MMS mm = bcmode_mm(op);
+  TValue tempb, tempc;
+  cTValue *b, *c;
+  if ((b = str2num(rb, &tempb)) != NULL &&
+      (c = str2num(rc, &tempc)) != NULL) {  /* Try coercion first. */
+    setnumV(ra, lj_vm_foldarith(numV(b), numV(c), (int)mm-MM_add));
+    return NULL;
+  } else {
+    cTValue *mo = lj_meta_lookup(L, rb, mm);
+    if (tvisnil(mo)) {
+      mo = lj_meta_lookup(L, rc, mm);
+      if (tvisnil(mo)) {
+	if (str2num(rb, &tempb) == NULL) rc = rb;  /* Blame bad operand. */
+	lj_err_optype(L, rc, LJ_ERR_OPARITH);
+	return NULL;  /* unreachable */
+      }
+    }
+    return mmcall(L, lj_cont_ra, mo, rb, rc);
+  }
+}
+
+/* In-place coercion of a number to a string.
+** Returns 1 if o is (or now holds) a string, 0 otherwise.
+*/
+static LJ_AINLINE int tostring(lua_State *L, TValue *o)
+{
+  if (tvisstr(o))
+    return 1;
+  if (!tvisnum(o))
+    return 0;
+  setstrV(L, o, lj_str_fromnum(L, &o->n));
+  return 1;
+}
+
+/* Helper for CAT. Coercion, iterative concat, __concat metamethod.
+** Processes the topmost CAT-stack elements right-to-left until 'left'
+** operands are consumed. Returns NULL when done, or the new call base
+** after setting up a __concat metamethod call.
+*/
+TValue *lj_meta_cat(lua_State *L, TValue *top, int left)
+{
+  do {
+    int n = 1;
+    if (!(tvisstr(top-1) || tvisnum(top-1)) || !tostring(L, top)) {
+      cTValue *mo = lj_meta_lookup(L, top-1, MM_concat);
+      if (tvisnil(mo)) {
+	mo = lj_meta_lookup(L, top, MM_concat);
+	if (tvisnil(mo)) {
+	  /* Neither operand has __concat: blame the non-coercible one. */
+	  if (tvisstr(top-1) || tvisnum(top-1)) top++;
+	  lj_err_optype(L, top-1, LJ_ERR_OPCAT);
+	  return NULL;  /* unreachable */
+	}
+      }
+      /* One of the top two elements is not a string, call __cat metamethod:
+      **
+      ** before:    [...][CAT stack .........................]
+      **                                 top-1     top         top+1 top+2
+      ** pick two:  [...][CAT stack ...] [o1]      [o2]
+      ** setup mm:  [...][CAT stack ...] [cont|?]  [mo|tmtype] [o1]  [o2]
+      ** in asm:    [...][CAT stack ...] [cont|PC] [mo|delta]  [o1]  [o2]
+      **            ^-- func base                              ^-- mm base
+      ** after mm:  [...][CAT stack ...] <--push-- [result]
+      ** next step: [...][CAT stack .............]
+      */
+      /* copyTV expands to a block, hence no trailing semicolons here. */
+      copyTV(L, top+2, top)  /* Careful with the order of stack copies! */
+      copyTV(L, top+1, top-1)
+      copyTV(L, top, mo)
+      setcont(top-1, lj_cont_cat);
+      return top+1;  /* Trigger metamethod call. */
+    } else if (strV(top)->len == 0) {  /* Shortcut. */
+      (void)tostring(L, top-1);
+    } else {
+      /* Pick as many strings as possible from the top and concatenate them:
+      **
+      ** before:    [...][CAT stack ...........................]
+      ** pick str:  [...][CAT stack ...] [...... strings ......]
+      ** concat:    [...][CAT stack ...] [result]
+      ** next step: [...][CAT stack ............]
+      */
+      MSize tlen = strV(top)->len;
+      char *buffer;
+      int i;
+      for (n = 1; n <= left && tostring(L, top-n); n++) {
+	MSize len = strV(top-n)->len;
+	if (len >= LJ_MAX_STR - tlen)
+	  lj_err_msg(L, LJ_ERR_STROV);
+	tlen += len;
+      }
+      buffer = lj_str_needbuf(L, &G(L)->tmpbuf, tlen);
+      n--;
+      tlen = 0;
+      /* Copy left-to-right: slot top-n is the leftmost picked operand. */
+      for (i = n; i >= 0; i--) {
+	MSize len = strV(top-i)->len;
+	memcpy(buffer + tlen, strVdata(top-i), len);
+	tlen += len;
+      }
+      setstrV(L, top-n, lj_str_new(L, buffer, tlen));
+    }
+    left -= n;
+    top -= n;
+  } while (left >= 1);
+  lj_gc_check_fixtop(L);
+  return NULL;
+}
+
+/* Helper for LEN. Sets up a __len metamethod call or throws. */
+TValue *lj_meta_len(lua_State *L, cTValue *o)
+{
+  cTValue *mo = lj_meta_lookup(L, o, MM_len);
+  if (!tvisnil(mo))
+    return mmcall(L, lj_cont_ra, mo, o, niltv(L));
+  lj_err_optype(L, o, LJ_ERR_OPLEN);
+  return NULL;  /* unreachable */
+}
+
+/* Helper for equality comparisons. __eq metamethod.
+** Returns the new call base after setting up a metamethod call, or the
+** integer 0/1 smuggled through the pointer return value -- presumably
+** decoded by the assembler VM as the final condition; confirm against
+** the VM's CALLT handling of this helper's result.
+*/
+TValue *lj_meta_equal(lua_State *L, GCobj *o1, GCobj *o2, int ne)
+{
+  /* Field metatable must be at same offset for GCtab and GCudata! */
+  cTValue *mo = lj_meta_fast(L, tabref(o1->gch.metatable), MM_eq);
+  if (mo) {
+    TValue *top;
+    int it;
+    /* Both operands must agree on the same __eq metamethod. */
+    if (tabref(o1->gch.metatable) != tabref(o2->gch.metatable)) {
+      cTValue *mo2 = lj_meta_fast(L, tabref(o2->gch.metatable), MM_eq);
+      if (mo2 == NULL || !lj_obj_equal(mo, mo2))
+	return cast(TValue *, (intptr_t)ne);
+    }
+    top = curr_top(L);
+    /* condt for ==, condf for ~=: the continuation flips the result. */
+    setcont(top, ne ? lj_cont_condf : lj_cont_condt);
+    copyTV(L, top+1, mo);
+    it = o1->gch.gct == ~LJ_TTAB ? LJ_TTAB : LJ_TUDATA;
+    setgcV(L, top+2, &o1->gch, it);
+    setgcV(L, top+3, &o2->gch, it);
+    return top+2;  /* Trigger metamethod call. */
+  }
+  return cast(TValue *, (intptr_t)ne);
+}
+
+/* Helper for ordered comparisons. String compare, __lt/__le metamethods.
+** op encoding: bit 0 negates the condition, bit 1 selects <= over <.
+** Returns the integer condition result smuggled through the pointer,
+** or the new call base after setting up a metamethod call.
+*/
+TValue *lj_meta_comp(lua_State *L, cTValue *o1, cTValue *o2, int op)
+{
+  if (itype(o1) == itype(o2)) {  /* Never called with two numbers. */
+    if (tvisstr(o1) && tvisstr(o2)) {
+      int32_t res = lj_str_cmp(strV(o1), strV(o2));
+      return cast(TValue *, (intptr_t)(((op&2) ? res <= 0 : res < 0) ^ (op&1)));
+    } else {
+    trymt:  /* NOTE: jumped into from the bool/bool case below (valid C). */
+      while (1) {
+	ASMFunction cont = (op & 1) ? lj_cont_condf : lj_cont_condt;
+	MMS mm = (op & 2) ? MM_le : MM_lt;
+	cTValue *mo = lj_meta_lookup(L, o1, mm);
+	cTValue *mo2 = lj_meta_lookup(L, o2, mm);
+	/* Both operands must provide the same metamethod. */
+	if (tvisnil(mo) || !lj_obj_equal(mo, mo2)) {
+	  if (op & 2) {  /* MM_le not found: retry with MM_lt. */
+	    cTValue *ot = o1; o1 = o2; o2 = ot;  /* Swap operands. */
+	    op ^= 3;  /* Use LT and flip condition: a<=b  ==  not(b<a). */
+	    continue;
+	  }
+	  goto err;
+	}
+	return mmcall(L, cont, mo, o1, o2);
+      }
+    }
+  } else if (tvisbool(o1) && tvisbool(o2)) {
+    goto trymt;  /* false/true have different itypes but both are bools. */
+  } else {
+  err:
+    lj_err_comp(L, o1, o2);
+    return NULL;
+  }
+}
+
+/* Helper for calls. __call metamethod.
+** Shifts all slots in [func+1, top] up by one -- copying downwards from
+** the top so nothing is overwritten -- then inserts the metamethod at the
+** func slot, making the original callee its first argument.
+*/
+void lj_meta_call(lua_State *L, TValue *func, TValue *top)
+{
+  cTValue *mo = lj_meta_lookup(L, func, MM_call);
+  TValue *p;
+  if (!tvisfunc(mo))
+    lj_err_optype_call(L, func);
+  for (p = top; p > func; p--) copyTV(L, p, p-1);
+  copyTV(L, func, mo);
+}
+
+/* Helper for FORI. Coercion.
+** Coerces the three numeric-for control slots (start, limit, step) to
+** numbers in place, with a distinct error message per slot.
+*/
+void lj_meta_for(lua_State *L, TValue *base)
+{
+  if (!str2num(base, base)) lj_err_msg(L, LJ_ERR_FORINIT);
+  if (!str2num(base+1, base+1)) lj_err_msg(L, LJ_ERR_FORLIM);
+  if (!str2num(base+2, base+2)) lj_err_msg(L, LJ_ERR_FORSTEP);
+}
+

+ 33 - 0
src/lj_meta.h

@@ -0,0 +1,33 @@
+/*
+** Metamethod handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_META_H
+#define _LJ_META_H
+
+#include "lj_obj.h"
+
+/* Metamethod handling */
+LJ_FUNC void lj_meta_init(lua_State *L);
+LJ_FUNC cTValue *lj_meta_cache(GCtab *mt, MMS mm, GCstr *name);
+LJ_FUNC cTValue *lj_meta_lookup(lua_State *L, cTValue *o, MMS mm);
+
+/* Fast-path metamethod lookup: NULL for no metatable or when the
+** negative-cache bit in mt->nomm says the metamethod is absent. */
+#define lj_meta_fastg(g, mt, mm) \
+  ((mt) == NULL ? NULL : ((mt)->nomm & (1u<<(mm))) ? NULL : \
+   lj_meta_cache(mt, mm, strref((g)->mmname[mm])))
+#define lj_meta_fast(L, mt, mm)	lj_meta_fastg(G(L), mt, mm)
+
+/* C helpers for some instructions, called from assembler VM. */
+LJ_FUNCA cTValue *lj_meta_tget(lua_State *L, cTValue *o, cTValue *k);
+LJ_FUNCA TValue *lj_meta_tset(lua_State *L, cTValue *o, cTValue *k);
+LJ_FUNCA TValue *lj_meta_arith(lua_State *L, TValue *ra, cTValue *rb,
+			       cTValue *rc, BCReg op);
+LJ_FUNCA TValue *lj_meta_cat(lua_State *L, TValue *top, int left);
+LJ_FUNCA TValue *lj_meta_len(lua_State *L, cTValue *o);
+LJ_FUNCA TValue *lj_meta_equal(lua_State *L, GCobj *o1, GCobj *o2, int ne);
+LJ_FUNCA TValue *lj_meta_comp(lua_State *L, cTValue *o1, cTValue *o2, int op);
+LJ_FUNCA void lj_meta_call(lua_State *L, TValue *func, TValue *top);
+LJ_FUNCA void lj_meta_for(lua_State *L, TValue *base);
+
+#endif

+ 41 - 0
src/lj_obj.c

@@ -0,0 +1,41 @@
+/*
+** Miscellaneous object handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_obj_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+/* Object type names.
+** lj_obj_typename is indexed by external Lua type id -- presumably with a
+** +1 offset so LUA_TNONE maps to "no value"; confirm against its users.
+*/
+LJ_DATADEF const char *const lj_obj_typename[] = {  /* ORDER LUA_T */
+  "no value", "nil", "boolean", "userdata", "number", "string",
+  "table", "function", "userdata", "thread", "proto", "upval"
+};
+
+/* Indexed by ~itype: LJ_TNIL (-1) -> 0 "nil", ... LJ_TNUMX (-13) -> 12. */
+LJ_DATADEF const char *const lj_obj_itypename[] = {  /* ORDER LJ_T */
+  "nil", "boolean", "boolean", "userdata", "string", "upval", "thread",
+  "proto", "function", "deadkey", "table", "userdata", "number"
+};
+
+/* Compare two objects without calling metamethods.
+** Same tag: primitives are equal by tag alone, GC objects by reference
+** (64 bit light userdata by the full 64 bit pattern). Different tags are
+** unequal unless both are numbers, which compare by value.
+*/
+int lj_obj_equal(cTValue *o1, cTValue *o2)
+{
+  if (itype(o1) == itype(o2)) {
+    if (tvispri(o1))
+      return 1;  /* nil/false/true: tag equality suffices. */
+    if (!tvisnum(o1)) {
+#if LJ_64
+      if (tvislightud(o1))
+	return o1->u64 == o2->u64;  /* Pointer is embedded in low 48 bits. */
+      else
+#endif
+	return gcrefeq(o1->gcr, o2->gcr);  /* Reference equality. */
+    }
+  } else if (!tvisnum(o1) || !tvisnum(o2)) {
+    return 0;
+  }
+  return numV(o1) == numV(o2);
+}
+

+ 676 - 0
src/lj_obj.h

@@ -0,0 +1,676 @@
+/*
+** LuaJIT VM tags, values and objects.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#ifndef _LJ_OBJ_H
+#define _LJ_OBJ_H
+
+#include "lua.h"
+#include "lj_def.h"
+#include "lj_arch.h"
+
+/* -- Memory references (32 bit address space) ---------------------------- */
+
+/* Memory size. */
+typedef uint32_t MSize;
+
+/* Memory reference */
+typedef struct MRef {
+  uint32_t ptr32;	/* Pseudo 32 bit pointer. */
+} MRef;
+
+#define mref(r, t)	((t *)(void *)(uintptr_t)(r).ptr32)
+
+#define setmref(r, p)	((r).ptr32 = (uint32_t)(uintptr_t)(void *)(p))
+#define setmrefr(r, v)	((r).ptr32 = (v).ptr32)
+
+/* -- GC object references (32 bit address space) ------------------------- */
+
+/* GCobj reference */
+typedef struct GCRef {
+  uint32_t gcptr32;	/* Pseudo 32 bit pointer. */
+} GCRef;
+
+/* Common GC header for all collectable objects. */
+#define GCHeader	GCRef nextgc; uint8_t marked; uint8_t gct
+/* This occupies 6 bytes, so use the next 2 bytes for non-32 bit fields. */
+
+#define gcref(r)	((GCobj *)(uintptr_t)(r).gcptr32)
+#define gcrefp(r, t)	((t *)(void *)(uintptr_t)(r).gcptr32)
+#define gcrefu(r)	((r).gcptr32)
+#define gcrefi(r)	((int32_t)(r).gcptr32)
+#define gcrefeq(r1, r2)	((r1).gcptr32 == (r2).gcptr32)
+#define gcnext(gc)	(gcref((gc)->gch.nextgc))
+
+#define setgcref(r, gc)	((r).gcptr32 = (uint32_t)(uintptr_t)&(gc)->gch)
+#define setgcrefi(r, i)	((r).gcptr32 = (uint32_t)(i))
+#define setgcrefp(r, p)	((r).gcptr32 = (uint32_t)(uintptr_t)(p))
+#define setgcrefnull(r)	((r).gcptr32 = 0)
+#define setgcrefr(r, v)	((r).gcptr32 = (v).gcptr32)
+
+/* IMPORTANT NOTE:
+**
+** All uses of the setgcref* macros MUST be accompanied with a write barrier.
+**
+** This is to ensure the integrity of the incremental GC. The invariant
+** to preserve is that a black object never points to a white object.
+** I.e. never store a white object into a field of a black object.
+**
+** It's ok to LEAVE OUT the write barrier ONLY in the following cases:
+** - The source is not a GC object (NULL).
+** - The target is a GC root. I.e. everything in global_State.
+** - The target is a lua_State field (threads are never black).
+** - The target is a stack slot, see setgcV et al.
+** - The target is an open upvalue, i.e. pointing to a stack slot.
+** - The target is a newly created object (i.e. marked white). But make
+**   sure nothing invokes the GC inbetween.
+** - The target and the source are the same object (self-reference).
+** - The target already contains the object (e.g. moving elements around).
+**
+** The most common case is a store to a stack slot. All other cases where
+** a barrier has been omitted are annotated with a NOBARRIER comment.
+**
+** The same logic applies for stores to table slots (array part or hash
+** part). ALL uses of lj_tab_set* require a barrier for the stored *value*
+** (if it's a GC object). The barrier for the *key* is already handled
+** internally by lj_tab_newkey.
+*/
+
+/* -- Common type definitions --------------------------------------------- */
+
+/* Types for handling bytecodes. Need this here, details in lj_bc.h. */
+typedef uint32_t BCIns;  /* Bytecode instruction. */
+typedef uint32_t BCPos;  /* Bytecode position. */
+typedef uint32_t BCReg;  /* Bytecode register. */
+typedef int32_t BCLine;  /* Bytecode line number. */
+
+/* Internal assembler functions. Never call these directly from C. */
+typedef void (*ASMFunction)(void);
+
+/* Resizable string buffer. Need this here, details in lj_str.h. */
+typedef struct SBuf {
+  char *buf;		/* String buffer base. */
+  MSize n;		/* String buffer length. */
+  MSize sz;		/* String buffer size. */
+} SBuf;
+
+/* -- Tags and values ----------------------------------------------------- */
+
+/* Frame link. */
+typedef union {
+  int32_t ftsz;		/* Frame type and size of previous frame. */
+  MRef pcr;		/* Overlaps PC for Lua frames. */
+} FrameLink;
+
+/* Tagged value. */
+typedef LJ_ALIGN(8) union TValue {
+  uint64_t u64;		/* 64 bit pattern overlaps number. */
+  lua_Number n;		/* Number object overlaps split tag/value object. */
+  struct {
+    LJ_ENDIAN_LOHI(
+      GCRef gcr;	/* GCobj reference (if any). */
+    , int32_t it;	/* Internal object tag. Must overlap MSW of number. */
+    )
+  };
+  struct {
+    LJ_ENDIAN_LOHI(
+      GCRef func;	/* Function for next frame (or dummy L). */
+    , FrameLink tp;	/* Link to previous frame. */
+    )
+  } fr;
+  struct {
+    LJ_ENDIAN_LOHI(
+      uint32_t lo;	/* Lower 32 bits of number. */
+    , uint32_t hi;	/* Upper 32 bits of number. */
+    )
+  } u32;
+} TValue;
+
+typedef const TValue cTValue;
+
+#define tvref(r)	(mref(r, TValue))
+
+/* More external and GCobj tags for internal objects. */
+#define LAST_TT		LUA_TTHREAD
+
+#define LUA_TPROTO	(LAST_TT+1)
+#define LUA_TUPVAL	(LAST_TT+2)
+#define LUA_TDEADKEY	(LAST_TT+3)
+
+/* Internal object tags.
+**
+** Internal tags overlap the MSW of a number object (must be a double).
+** Interpreted as a double these are special NaNs. The FPU only generates
+** one type of NaN (0xfff8_0000_0000_0000). So MSWs > 0xfff80000 are available
+** for use as internal tags. Small negative numbers are used to shorten the
+** encoding of type comparisons (reg/mem against sign-ext. 8 bit immediate).
+**
+**                  ---MSW---.---LSW---
+** primitive types |  itype  |         |
+** lightuserdata   |  itype  |  void * |  (32 bit platforms)
+** lightuserdata   |fffc|    void *    |  (64 bit platforms, 48 bit pointers)
+** GC objects      |  itype  |  GCRef  |
+** number           -------double------
+**
+** ORDER LJ_T
+** Primitive types nil/false/true must be first, lightuserdata next.
+** GC objects are at the end, table/userdata must be lowest.
+** Also check lj_ir.h for similar ordering constraints.
+*/
+#define LJ_TNIL			(-1)
+#define LJ_TFALSE		(-2)
+#define LJ_TTRUE		(-3)
+#define LJ_TLIGHTUD		(-4)
+#define LJ_TSTR			(-5)
+#define LJ_TUPVAL		(-6)
+#define LJ_TTHREAD		(-7)
+#define LJ_TPROTO		(-8)
+#define LJ_TFUNC		(-9)
+#define LJ_TDEADKEY		(-10)
+#define LJ_TTAB			(-11)
+#define LJ_TUDATA		(-12)
+/* This is just the canonical number type used in some places. */
+#define LJ_TNUMX		(-13)
+
+#if LJ_64
+#define LJ_TISNUM		((uint32_t)0xfff80000)
+#else
+#define LJ_TISNUM		((uint32_t)LJ_TNUMX)
+#endif
+#define LJ_TISTRUECOND		((uint32_t)LJ_TFALSE)
+#define LJ_TISPRI		((uint32_t)LJ_TTRUE)
+#define LJ_TISGCV		((uint32_t)(LJ_TSTR+1))
+#define LJ_TISTABUD		((uint32_t)LJ_TTAB)
+
+/* -- TValue getters/setters ---------------------------------------------- */
+
+/* Macros to test types. */
+#define itype(o)	((o)->it)
+#define uitype(o)	((uint32_t)itype(o))
+#define tvisnil(o)	(itype(o) == LJ_TNIL)
+#define tvisfalse(o)	(itype(o) == LJ_TFALSE)
+#define tvistrue(o)	(itype(o) == LJ_TTRUE)
+#define tvisbool(o)	(tvisfalse(o) || tvistrue(o))
+#if LJ_64
+/* 64 bit: the lightuserdata tag occupies the upper 16 bits of the word. */
+#define tvislightud(o)	((itype(o) >> 16) == LJ_TLIGHTUD)
+#else
+#define tvislightud(o)	(itype(o) == LJ_TLIGHTUD)
+#endif
+#define tvisstr(o)	(itype(o) == LJ_TSTR)
+#define tvisfunc(o)	(itype(o) == LJ_TFUNC)
+#define tvisthread(o)	(itype(o) == LJ_TTHREAD)
+#define tvisproto(o)	(itype(o) == LJ_TPROTO)
+#define tvistab(o)	(itype(o) == LJ_TTAB)
+#define tvisudata(o)	(itype(o) == LJ_TUDATA)
+/* Tags are negative, i.e. large unsigned values; number payloads are not. */
+#define tvisnum(o)	(uitype(o) <= LJ_TISNUM)
+
+#define tvistruecond(o)	(uitype(o) < LJ_TISTRUECOND)
+#define tvispri(o)	(uitype(o) >= LJ_TISPRI)
+#define tvistabud(o)	(uitype(o) <= LJ_TISTABUD)  /* && !tvisnum() */
+/* True if the tag is in the GC-object range LJ_TSTR..LJ_TUDATA. */
+#define tvisgcv(o) \
+  ((uitype(o) - LJ_TISGCV) > ((uint32_t)LJ_TNUMX - LJ_TISGCV))
+
+/* Special macros to test numbers for NaN, +0, -0, +1 and raw equality. */
+#define tvisnan(o)	((o)->n != (o)->n)
+#define tvispzero(o)	((o)->u64 == 0)
+#define tvismzero(o)	((o)->u64 == U64x(80000000,00000000))
+#define tvispone(o)	((o)->u64 == U64x(3ff00000,00000000))
+#define rawnumequal(o1, o2)	((o1)->u64 == (o2)->u64)
+
+/* Macros to convert type ids: ~tag yields a small non-negative index. */
+#if LJ_64
+#define itypemap(o) \
+  (tvisnum(o) ? ~LJ_TNUMX : tvislightud(o) ? ~LJ_TLIGHTUD : ~itype(o))
+#else
+#define itypemap(o)	(tvisnum(o) ? ~LJ_TNUMX : ~itype(o))
+#endif
+#endif
+
+/* Macros to get tagged values. */
+#define gcval(o)	(gcref((o)->gcr))
+/* boolV yields 0 for false, 1 for true (LJ_TFALSE - LJ_TTRUE == 1). */
+#define boolV(o)	check_exp(tvisbool(o), (LJ_TFALSE - (o)->it))
+#if LJ_64
+/* 64 bit: the pointer payload is the low 48 bits of the TValue. */
+#define lightudV(o)	check_exp(tvislightud(o), \
+			  (void *)((o)->u64 & U64x(0000ffff,ffffffff)))
+#else
+#define lightudV(o)	check_exp(tvislightud(o), gcrefp((o)->gcr, void))
+#endif
+#define gcV(o)		check_exp(tvisgcv(o), gcval(o))
+#define strV(o)		check_exp(tvisstr(o), &gcval(o)->str)
+#define funcV(o)	check_exp(tvisfunc(o), &gcval(o)->fn)
+#define threadV(o)	check_exp(tvisthread(o), &gcval(o)->th)
+#define protoV(o)	check_exp(tvisproto(o), &gcval(o)->pt)
+#define tabV(o)		check_exp(tvistab(o), &gcval(o)->tab)
+#define udataV(o)	check_exp(tvisudata(o), &gcval(o)->ud)
+#define numV(o)		check_exp(tvisnum(o), (o)->n)
+
+/* Macros to set tagged values. */
+#define setitype(o, i)		((o)->it = (i))
+#define setnilV(o)		((o)->it = LJ_TNIL)
+#define setboolV(o, x)		((o)->it = LJ_TFALSE-(x))
+
+#if LJ_64
+/* 64 bit: lightuserdata pointers must fit into 48 bits. */
+#define checklightudptr(L, p) \
+  (((uint64_t)(p) >> 48) ? (lj_err_msg(L, LJ_ERR_BADLU), NULL) : (p))
+#define setlightudV(o, x) \
+  ((o)->u64 = (uint64_t)(x) | (((uint64_t)LJ_TLIGHTUD) << 48))
+/* Continuations are stored as an offset relative to lj_vm_asm_begin. */
+#define setcont(o, x) \
+  ((o)->u64 = (uint64_t)(x) - (uint64_t)lj_vm_asm_begin)
+#else
+#define checklightudptr(L, p)	(p)
+#define setlightudV(o, x) \
+  { TValue *i_o = (o); \
+    setgcrefp(i_o->gcr, (x)); i_o->it = LJ_TLIGHTUD; }
+#define setcont(o, x) \
+  { TValue *i_o = (o); \
+    setgcrefp(i_o->gcr, (x)); i_o->it = LJ_TLIGHTUD; }
+#endif
+
+/* Assert that a stored GC value matches its object type and is not dead. */
+#define tvchecklive(g, o) \
+  lua_assert(!tvisgcv(o) || \
+  ((~itype(o) == gcval(o)->gch.gct) && !isdead(g, gcval(o))))
+
+#define setgcV(L, o, x, itype) \
+  { TValue *i_o = (o); \
+    setgcrefp(i_o->gcr, &(x)->nextgc); i_o->it = itype; \
+    tvchecklive(G(L), i_o); }
+#define setstrV(L, o, x)	setgcV(L, o, x, LJ_TSTR)
+#define setthreadV(L, o, x)	setgcV(L, o, x, LJ_TTHREAD)
+#define setprotoV(L, o, x)	setgcV(L, o, x, LJ_TPROTO)
+#define setfuncV(L, o, x)	setgcV(L, o, &(x)->l, LJ_TFUNC)
+#define settabV(L, o, x)	setgcV(L, o, x, LJ_TTAB)
+#define setudataV(L, o, x)	setgcV(L, o, x, LJ_TUDATA)
+
+#define setnumV(o, x)		((o)->n = (x))
+#define setnanV(o)		((o)->u64 = U64x(fff80000,00000000))
+/* Integers are stored as plain doubles (there is no integer tag). */
+#define setintV(o, i)		((o)->n = cast_num((int32_t)(i)))
+
+/* Copy tagged values. */
+#define copyTV(L, o1, o2) \
+  { cTValue *i_o2 = (o2); TValue *i_o1 = (o1); \
+    *i_o1 = *i_o2; tvchecklive(G(L), i_o1); }
+
+/* -- String object ------------------------------------------------------- */
+
+/* String object header. String payload follows. */
+typedef struct GCstr {
+  GCHeader;
+  uint8_t reserved;	/* Used by lexer for fast lookup of reserved words. */
+  uint8_t unused;
+  MSize hash;		/* Hash of string. */
+  MSize len;		/* Size of string. */
+} GCstr;
+
+#define strref(r)	(&gcref((r))->str)
+/* The character data immediately follows the GCstr header. */
+#define strdata(s)	((const char *)((s)+1))
+#define strdatawr(s)	((char *)((s)+1))
+#define strVdata(o)	strdata(strV(o))
+/* Total object size: header + payload + trailing NUL. */
+#define sizestring(s)	(sizeof(struct GCstr)+(s)->len+1)
+
+/* -- Userdata object ----------------------------------------------------- */
+
+/* Userdata object. Payload follows. */
+typedef struct GCudata {
+  GCHeader;
+  uint8_t unused1;
+  uint8_t unused2;
+  GCRef env;		/* Should be at same offset in GCfunc. */
+  MSize len;		/* Size of payload. */
+  GCRef metatable;	/* Must be at same offset in GCtab. */
+  uint32_t align1;	/* To force 8 byte alignment of the payload. */
+} GCudata;
+
+/* The user payload immediately follows the GCudata header. */
+#define uddata(u)	((void *)((u)+1))
+#define sizeudata(u)	(sizeof(struct GCudata)+(u)->len)
+
+/* -- Prototype object ---------------------------------------------------- */
+
+/* Split constant array. Collectables are below, numbers above pointer. */
+typedef union ProtoK {
+  lua_Number *n;	/* Numbers. */
+  GCRef *gc;		/* Collectable objects (strings/table/proto). */
+} ProtoK;
+
+/* How many GCRef slots one lua_Number occupies (for rounding below). */
+#define SCALE_NUM_GCO	((int32_t)sizeof(lua_Number)/sizeof(GCRef))
+#define round_nkgc(n)	(((n) + SCALE_NUM_GCO-1) & ~(SCALE_NUM_GCO-1))
+
+/* Debug info for one local variable (used for tracebacks only). */
+typedef struct VarInfo {
+  GCstr *name;		/* Local variable name. */
+  BCPos startpc;	/* First point where the local variable is active. */
+  BCPos endpc;		/* First point where the local variable is dead. */
+} VarInfo;
+
+/* Function prototype: bytecode, constants and debug info for a Lua chunk. */
+typedef struct GCproto {
+  GCHeader;
+  uint8_t numparams;	/* Number of parameters. */
+  uint8_t framesize;	/* Fixed frame size. */
+  MSize sizebc;		/* Number of bytecode instructions. */
+  GCRef gclist;
+  ProtoK k;		/* Split constant array (points to the middle). */
+  BCIns *bc;		/* Array of bytecode instructions. */
+  int16_t *uv;		/* Upvalue list. local >= 0. parent uv < 0. */
+  MSize sizekgc;	/* Number of collectable constants. */
+  MSize sizekn;		/* Number of lua_Number constants. */
+  uint8_t sizeuv;	/* Number of upvalues. */
+  uint8_t flags;	/* Miscellaneous flags (see below). */
+  uint16_t trace;	/* Anchor for chain of root traces. */
+  /* ------ The following fields are for debugging/tracebacks only ------ */
+  MSize sizelineinfo;	/* Size of lineinfo array (may be 0). */
+  MSize sizevarinfo;	/* Size of local var info array (may be 0). */
+  MSize sizeuvname;	/* Size of upvalue names array (may be 0). */
+  BCLine linedefined;	/* First line of the function definition. */
+  BCLine lastlinedefined;  /* Last line of the function definition. */
+  BCLine *lineinfo;	/* Map from bytecode instructions to source lines. */
+  struct VarInfo *varinfo;  /* Names and extents of local variables. */
+  GCstr **uvname;	/* Upvalue names. */
+  GCstr *chunkname;	/* Name of the chunk this function was defined in. */
+} GCproto;
+
+/* Flags for GCproto.flags. */
+#define PROTO_IS_VARARG		0x01
+#define PROTO_HAS_FNEW		0x02
+#define PROTO_HAS_RETURN	0x04
+#define PROTO_FIXUP_RETURN	0x08
+#define PROTO_NO_JIT		0x10
+#define PROTO_HAS_ILOOP		0x20
+
+/* -- Upvalue object ------------------------------------------------------ */
+
+/* Upvalue: points into the stack while open, holds the value when closed. */
+typedef struct GCupval {
+  GCHeader;
+  uint8_t closed;	/* Set if closed (i.e. uv->v == &uv->u.value). */
+  uint8_t unused;
+  union {
+    TValue tv;		/* If closed: the value itself. */
+    struct {		/* If open: double linked list, anchored at thread. */
+      GCRef prev;
+      GCRef next;
+    };
+  };
+  TValue *v;		/* Points to stack slot (open) or above (closed). */
+#if LJ_32
+  int32_t unusedv;	/* For consistent alignment (32 bit only). */
+#endif
+} GCupval;
+
+#define uvprev(uv_)	(&gcref((uv_)->prev)->uv)
+#define uvnext(uv_)	(&gcref((uv_)->next)->uv)
+
+/* -- Function object (closures) ------------------------------------------ */
+
+/* Common header for functions. env should be at same offset in GCudata. */
+#define GCfuncHeader \
+  GCHeader; uint8_t ffid; uint8_t nupvalues; \
+  GCRef env; GCRef gclist; ASMFunction gate
+
+/* C closure: upvalues are stored by value (TValue array). */
+typedef struct GCfuncC {
+  GCfuncHeader;
+  lua_CFunction f;	/* C function to be called. */
+  TValue upvalue[1];	/* Array of upvalues (TValue). */
+} GCfuncC;
+
+/* Lua closure: upvalues are stored as pointers to GCupval objects. */
+typedef struct GCfuncL {
+  GCfuncHeader;
+  GCRef pt;		/* Link to prototype this function is based on. */
+  GCRef uvptr[1];	/* Array of _pointers_ to upvalue objects (GCupval). */
+} GCfuncL;
+
+typedef union GCfunc {
+  GCfuncC c;
+  GCfuncL l;
+} GCfunc;
+
+/* Fast function ids: 0 = Lua, 1 = plain C, > 1 = built-in fast function. */
+#define FF_LUA		0
+#define FF_C		1
+#define isluafunc(fn)	((fn)->c.ffid == FF_LUA)
+#define iscfunc(fn)	((fn)->c.ffid == FF_C)
+#define isffunc(fn)	((fn)->c.ffid > FF_C)
+#define funcproto(fn)	check_exp(isluafunc(fn), &gcref((fn)->l.pt)->pt)
+#define sizeCfunc(n)	(sizeof(GCfuncC) + sizeof(TValue)*((n)-1))
+#define sizeLfunc(n)	(sizeof(GCfuncL) + sizeof(TValue *)*((n)-1))
+
+/* -- Table object -------------------------------------------------------- */
+
+/* Hash node. */
+typedef struct Node {
+  TValue val;		/* Value object. Must be first field. */
+  TValue key;		/* Key object. */
+  MRef next;		/* Hash chain. */
+  int32_t unused;	/* For consistent alignment. */
+} Node;
+
+LJ_STATIC_ASSERT(offsetof(Node, val) == 0);
+
+/* Table object: separate array part and hash part. */
+typedef struct GCtab {
+  GCHeader;
+  uint8_t nomm;		/* Negative cache for fast metamethods. */
+  int8_t colo;		/* Array colocation. */
+  MRef array;		/* Array part. */
+  GCRef gclist;
+  GCRef metatable;	/* Must be at same offset in GCudata. */
+  MRef node;		/* Hash part. */
+  uint32_t asize;	/* Size of array part (keys [0, asize-1]). */
+  uint32_t hmask;	/* Hash part mask (size of hash part - 1). */
+  MRef lastfree;	/* Any free position is before this position. */
+} GCtab;
+
+/* Size of a table with a colocated array part of n elements. */
+#define sizetabcolo(n)	((n)*sizeof(TValue) + sizeof(GCtab))
+#define tabref(r)	(&gcref((r))->tab)
+#define noderef(r)	(mref((r), Node))
+#define nextnode(n)	(mref((n)->next, Node))
+
+/* -- State objects ------------------------------------------------------- */
+
+/* VM states. */
+enum {
+  LJ_VMST_INTERP,	/* Interpreter. */
+  LJ_VMST_C,		/* C function. */
+  LJ_VMST_GC,		/* Garbage collector. */
+  LJ_VMST_EXIT,		/* Trace exit handler. */
+  LJ_VMST_RECORD,	/* Trace recorder. */
+  LJ_VMST_OPT,		/* Optimizer. */
+  LJ_VMST_ASM,		/* Assembler. */
+  LJ_VMST__MAX
+};
+
+/* Stored complemented (~), see global_State.vmstate which also holds
+** non-negative trace numbers.
+*/
+#define setvmstate(g, st)	((g)->vmstate = ~LJ_VMST_##st)
+
+/* Metamethods. */
+#define MMDEF(_) \
+  _(index) _(newindex) _(gc) _(mode) _(eq) \
+  /* Only the above (fast) metamethods are negative cached (max. 8). */ \
+  _(len) _(lt) _(le) _(concat) _(call) \
+  /* The following must be in ORDER ARITH. */ \
+  _(add) _(sub) _(mul) _(div) _(mod) _(pow) _(unm) \
+  /* The following are used in the standard libraries. */ \
+  _(metatable) _(tostring)
+
+typedef enum {
+#define MMENUM(name)	MM_##name,
+MMDEF(MMENUM)
+#undef MMENUM
+  MM_MAX,
+  MM____ = MM_MAX,
+  MM_FAST = MM_eq
+} MMS;
+
+/* Number of base metatables == number of internal tags (~LJ_TNUMX == 12). */
+#define BASEMT_MAX	((~LJ_TNUMX)+1)
+
+/* Garbage collector state, embedded in global_State. */
+typedef struct GCState {
+  MSize total;		/* Memory currently allocated. */
+  MSize threshold;	/* Memory threshold. */
+  uint8_t currentwhite;	/* Current white color. */
+  uint8_t state;	/* GC state. */
+  uint8_t unused1;
+  uint8_t unused2;
+  MSize sweepstr;	/* Sweep position in string table. */
+  GCRef root;		/* List of all collectable objects. */
+  GCRef *sweep;		/* Sweep position in root list. */
+  GCRef gray;		/* List of gray objects. */
+  GCRef grayagain;	/* List of objects for atomic traversal. */
+  GCRef weak;		/* List of weak tables (to be cleared). */
+  GCRef mmudata;	/* List of userdata (to be finalized). */
+  MSize stepmul;	/* Incremental GC step granularity. */
+  MSize debt;		/* Debt (how much GC is behind schedule). */
+  MSize estimate;	/* Estimate of memory actually in use. */
+  MSize pause;		/* Pause between successive GC cycles. */
+} GCState;
+
+/* Global state, shared by all threads of a Lua universe. */
+typedef struct global_State {
+  GCRef *strhash;	/* String hash table (hash chain anchors). */
+  MSize strmask;	/* String hash mask (size of hash table - 1). */
+  MSize strnum;		/* Number of strings in hash table. */
+  lua_Alloc allocf;	/* Memory allocator. */
+  void *allocd;		/* Memory allocator data. */
+  GCState gc;		/* Garbage collector. */
+  SBuf tmpbuf;		/* Temporary buffer for string concatenation. */
+  Node nilnode;		/* Fallback 1-element hash part (nil key and value). */
+  uint8_t hookmask;	/* Hook mask. */
+  uint8_t dispatchmode;	/* Dispatch mode. */
+  uint8_t vmevmask;	/* VM event mask. */
+  uint8_t unused1;
+  GCRef mainthref;	/* Link to main thread. */
+  TValue registrytv;	/* Anchor for registry. */
+  TValue tmptv;		/* Temporary TValue. */
+  GCupval uvhead;	/* Head of double-linked list of all open upvalues. */
+  int32_t hookcount;	/* Instruction hook countdown. */
+  int32_t hookcstart;	/* Start count for instruction hook counter. */
+  lua_Hook hookf;	/* Hook function. */
+  lua_CFunction panic;	/* Called as a last resort for errors. */
+  volatile int32_t vmstate;  /* VM state or current JIT code trace number. */
+  GCRef jit_L;		/* Current JIT code lua_State or NULL. */
+  MRef jit_base;	/* Current JIT code L->base. */
+  GCRef basemt[BASEMT_MAX];  /* Metatables for base types. */
+  GCRef mmname[MM_MAX];	/* Array holding metamethod names. */
+} global_State;
+
+#define mainthread(g)	(&gcref(g->mainthref)->th)
+/* Canonical nil value: the (asserted-nil) value slot of the fallback node. */
+#define niltv(L) \
+  check_exp(tvisnil(&G(L)->nilnode.val), &G(L)->nilnode.val)
+#define niltvg(g) \
+  check_exp(tvisnil(&(g)->nilnode.val), &(g)->nilnode.val)
+
+/* Hook management. Hook event masks are defined in lua.h. */
+#define HOOK_EVENTMASK		0x0f
+#define HOOK_ACTIVE		0x10
+#define HOOK_VMEVENT		0x20
+#define HOOK_GC			0x40
+#define hook_active(g)		((g)->hookmask & HOOK_ACTIVE)
+#define hook_enter(g)		((g)->hookmask |= HOOK_ACTIVE)
+#define hook_entergc(g)		((g)->hookmask |= (HOOK_ACTIVE|HOOK_GC))
+#define hook_vmevent(g)		((g)->hookmask |= (HOOK_ACTIVE|HOOK_VMEVENT))
+#define hook_leave(g)		((g)->hookmask &= ~HOOK_ACTIVE)
+/* Save/restore only the non-event bits of the hook mask. */
+#define hook_save(g)		((g)->hookmask & ~HOOK_EVENTMASK)
+#define hook_restore(g, h) \
+  ((g)->hookmask = ((g)->hookmask & HOOK_EVENTMASK) | (h))
+
+/* Per-thread state object. */
+struct lua_State {
+  GCHeader;
+  uint8_t dummy_ffid;	/* Fake FF_C for curr_funcisL() on dummy frames. */
+  uint8_t status;	/* Thread status. */
+  MRef glref;		/* Link to global state. */
+  GCRef gclist;		/* GC chain. */
+  TValue *base;		/* Base of currently executing function. */
+  TValue *top;		/* First free slot in the stack. */
+  TValue *maxstack;	/* Last free slot in the stack. */
+  TValue *stack;	/* Stack base. */
+  GCRef openupval;	/* List of open upvalues in the stack. */
+  GCRef env;		/* Thread environment (table of globals). */
+  void *cframe;		/* End of C stack frame chain. */
+  MSize stacksize;	/* True stack size (incl. LJ_STACK_EXTRA). */
+};
+
+#define G(L)			(mref(L->glref, global_State))
+#define registry(L)		(&G(L)->registrytv)
+
+/* Macros to access the currently executing (Lua) function.
+** The function object reference sits in the frame slot below base.
+*/
+#define curr_func(L)		(&gcref((L->base-1)->fr.func)->fn)
+#define curr_funcisL(L)		(isluafunc(curr_func(L)))
+#define curr_proto(L)		(funcproto(curr_func(L)))
+#define curr_topL(L)		(L->base + curr_proto(L)->framesize)
+#define curr_top(L)		(curr_funcisL(L) ? curr_topL(L) : L->top)
+
+/* -- GC object definition and conversions -------------------------------- */
+
+/* GC header for generic access to common fields of GC objects. */
+typedef struct GChead {
+  GCHeader;
+  uint8_t unused1;
+  uint8_t unused2;
+  GCRef env;
+  GCRef gclist;
+  GCRef metatable;
+} GChead;
+
+/* The env field SHOULD be at the same offset for all GC objects. */
+LJ_STATIC_ASSERT(offsetof(GChead, env) == offsetof(GCfuncL, env));
+LJ_STATIC_ASSERT(offsetof(GChead, env) == offsetof(GCudata, env));
+
+/* The metatable field MUST be at the same offset for all GC objects. */
+LJ_STATIC_ASSERT(offsetof(GChead, metatable) == offsetof(GCtab, metatable));
+LJ_STATIC_ASSERT(offsetof(GChead, metatable) == offsetof(GCudata, metatable));
+
+/* The gclist field MUST be at the same offset for all GC objects. */
+LJ_STATIC_ASSERT(offsetof(GChead, gclist) == offsetof(lua_State, gclist));
+LJ_STATIC_ASSERT(offsetof(GChead, gclist) == offsetof(GCproto, gclist));
+LJ_STATIC_ASSERT(offsetof(GChead, gclist) == offsetof(GCfuncL, gclist));
+LJ_STATIC_ASSERT(offsetof(GChead, gclist) == offsetof(GCtab, gclist));
+
+typedef union GCobj {
+  GChead gch;
+  GCstr str;
+  GCupval uv;
+  lua_State th;
+  GCproto pt;
+  GCfunc fn;
+  GCtab tab;
+  GCudata ud;
+} GCobj;
+
+/* Macros to convert a GCobj pointer into a specific value.
+** The gct field holds the complemented tag (~LJ_T*), asserted here.
+*/
+#define gco2str(o)	check_exp((o)->gch.gct == ~LJ_TSTR, &(o)->str)
+#define gco2uv(o)	check_exp((o)->gch.gct == ~LJ_TUPVAL, &(o)->uv)
+#define gco2th(o)	check_exp((o)->gch.gct == ~LJ_TTHREAD, &(o)->th)
+#define gco2pt(o)	check_exp((o)->gch.gct == ~LJ_TPROTO, &(o)->pt)
+#define gco2func(o)	check_exp((o)->gch.gct == ~LJ_TFUNC, &(o)->fn)
+#define gco2tab(o)	check_exp((o)->gch.gct == ~LJ_TTAB, &(o)->tab)
+#define gco2ud(o)	check_exp((o)->gch.gct == ~LJ_TUDATA, &(o)->ud)
+
+/* Macro to convert any collectable object into a GCobj pointer. */
+#define obj2gco(v)	(cast(GCobj *, (v)))
+
+/* -- Number to integer conversion ---------------------------------------- */
+
+/* Convert a number to a 32 bit integer via the double-bias trick:
+** adding 2^52+2^51 shifts the integer part into the low mantissa word.
+** NOTE(review): presumably relies on the default round-to-nearest FP mode.
+*/
+static LJ_AINLINE int32_t lj_num2bit(lua_Number n)
+{
+  TValue o;
+  o.n = n + 6755399441055744.0;  /* 2^52 + 2^51 */
+  return (int32_t)o.u32.lo;
+}
+
+/* On x87 (no SSE2) a plain cast is slow/inexact, so use the bias trick. */
+#if (defined(__i386__) || defined(_M_IX86)) && !defined(__SSE2__)
+#define lj_num2int(n)   lj_num2bit((n))
+#else
+#define lj_num2int(n)   ((int32_t)(n))
+#endif
+
+/* -- Miscellaneous object handling --------------------------------------- */
+
+/* Names and maps for internal and external object tags.
+** lj_obj_typename is indexed by external (lua.h) tags,
+** lj_obj_itypename by complemented internal tags (itypemap).
+*/
+LJ_DATA const char *const lj_obj_typename[1+LUA_TUPVAL+1];
+LJ_DATA const char *const lj_obj_itypename[~LJ_TNUMX+1];
+
+#define typename(o)	(lj_obj_itypename[itypemap(o)])
+
+/* Compare two objects without calling metamethods. */
+LJ_FUNC int lj_obj_equal(cTValue *o1, cTValue *o2);
+
+#ifdef LUA_USE_ASSERT
+#include "lj_gc.h"
+#endif
+
+#endif

+ 79 - 0
src/lj_opt_dce.c

@@ -0,0 +1,79 @@
+/*
+** DCE: Dead Code Elimination. Pre-LOOP only -- ASM already performs DCE.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_dce_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+
+/* Scan through all snapshots and mark all referenced instructions.
+** Snapshot slots are roots for liveness: anything a snapshot may restore
+** on a trace exit must not be eliminated.
+*/
+static void dce_marksnap(jit_State *J)
+{
+  SnapNo i, nsnap = J->cur.nsnap;
+  for (i = 0; i < nsnap; i++) {
+    SnapShot *snap = &J->cur.snap[i];
+    IRRef2 *map = &J->cur.snapmap[snap->mapofs];
+    BCReg s, nslots = snap->nslots;
+    for (s = 0; s < nslots; s++) {
+      IRRef ref = snap_ref(map[s]);
+      if (!irref_isk(ref))  /* Constants need no liveness mark. */
+	irt_setmark(IR(ref)->t);
+    }
+  }
+}
+
+/* Backwards propagate marks. Replace unused instructions with NOPs. */
+static void dce_propagate(jit_State *J)
+{
+  IRRef1 *pchain[IR__MAX];  /* Tail pointers of the per-opcode chains. */
+  IRRef ins;
+  uint32_t i;
+  for (i = 0; i < IR__MAX; i++) pchain[i] = &J->chain[i];
+  /* Walk backwards so every use is visited before its definition. */
+  for (ins = J->cur.nins-1; ins >= REF_FIRST; ins--) {
+    IRIns *ir = IR(ins);
+    if (irt_ismarked(ir->t)) {
+      irt_clearmark(ir->t);
+      pchain[ir->o] = &ir->prev;  /* Keep: stays on its opcode chain. */
+    } else if (!(irt_isguard(ir->t) || irm_sideeff(lj_ir_mode[ir->o]))) {
+      /* Unmarked, not a guard and side-effect free: eliminate it. */
+      *pchain[ir->o] = ir->prev;  /* Reroute original instruction chain. */
+      *pchain[IR_NOP] = (IRRef1)ins;
+      ir->t.irt = IRT_NIL;
+      ir->o = IR_NOP;  /* Replace instruction with NOP. */
+      ir->op1 = ir->op2 = 0;
+      pchain[IR_NOP] = &ir->prev;
+      continue;  /* Dead: do not propagate marks to its operands. */
+    }
+    /* Live (kept) instruction: its non-constant operands are live, too. */
+    if (!irref_isk(ir->op1)) irt_setmark(IR(ir->op1)->t);
+    if (!irref_isk(ir->op2)) irt_setmark(IR(ir->op2)->t);
+  }
+  *pchain[IR_NOP] = 0;  /* Terminate NOP chain. */
+}
+
+/* Dead Code Elimination.
+**
+** First backpropagate marks for all used instructions. Then replace
+** the unused ones with a NOP. Note that compressing the IR to eliminate
+** the NOPs does not pay off.
+*/
+void lj_opt_dce(jit_State *J)
+{
+  if (!(J->flags & JIT_F_OPT_DCE))
+    return;  /* DCE disabled for this trace: leave the IR untouched. */
+  dce_marksnap(J);   /* Pass 1: mark all snapshot-referenced instructions. */
+  dce_propagate(J);  /* Pass 2: propagate marks backwards, NOP the rest. */
+}
+
+#undef IR
+
+#endif

+ 1415 - 0
src/lj_opt_fold.c

@@ -0,0 +1,1415 @@
+/*
+** FOLD: Constant Folding, Algebraic Simplifications and Reassociation.
+** CSE: Common-Subexpression Elimination.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_fold_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_str.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+#include "lj_vm.h"
+
+/* Here's a short description how the FOLD engine processes instructions:
+**
+** The FOLD engine receives a single instruction stored in fins (J->fold.ins).
+** The instruction and its operands are used to select matching fold rules.
+** These are applied iteratively until a fixed point is reached.
+**
+** The 8 bit opcode of the instruction itself plus the opcodes of the
+** two instructions referenced by its operands form a 24 bit key
+** 'ins left right' (unused operands -> 0, literals -> lowest 8 bits).
+**
+** This key is used for partial matching against the fold rules. The
+** left/right operand fields of the key are successively masked with
+** the 'any' wildcard, from most specific to least specific:
+**
+**   ins left right
+**   ins any  right
+**   ins left any
+**   ins any  any
+**
+** The masked key is used to lookup a matching fold rule in a semi-perfect
+** hash table. If a matching rule is found, the related fold function is run.
+** Multiple rules can share the same fold function. A fold rule may return
+** one of several special values:
+**
+** - NEXTFOLD means no folding was applied, because an additional test
+**   inside the fold function failed. Matching continues against less
+**   specific fold rules. Finally the instruction is passed on to CSE.
+**
+** - RETRYFOLD means the instruction was modified in-place. Folding is
+**   retried as if this instruction had just been received.
+**
+** All other return values are terminal actions -- no further folding is
+** applied:
+**
+** - INTFOLD(i) returns a reference to the integer constant i.
+**
+** - LEFTFOLD and RIGHTFOLD return the left/right operand reference
+**   without emitting an instruction.
+**
+** - CSEFOLD and EMITFOLD pass the instruction directly to CSE or emit
+**   it without passing through any further optimizations.
+**
+** - FAILFOLD, DROPFOLD and CONDFOLD only apply to instructions which have
+**   no result (e.g. guarded assertions): FAILFOLD means the guard would
+**   always fail, i.e. the current trace is pointless. DROPFOLD means
+**   the guard is always true and has been eliminated. CONDFOLD is a
+**   shortcut for FAILFOLD + cond (i.e. drop if true, otherwise fail).
+**
+** - Any other return value is interpreted as an IRRef or TRef. This
+**   can be a reference to an existing or a newly created instruction.
+**   Only the least-significant 16 bits (IRRef1) are used to form a TRef
+**   which is finally returned to the caller.
+**
+** The FOLD engine receives instructions both from the trace recorder and
+** substituted instructions from LOOP unrolling. This means all types
+** of instructions may end up here, even though the recorder bypasses
+** FOLD in some cases. Thus all loads, stores and allocations must have
+** an any/any rule to avoid being passed on to CSE.
+**
+** Carefully read the following requirements before adding or modifying
+** any fold rules:
+**
+** Requirement #1: All fold rules must preserve their destination type.
+**
+** Consistently use INTFOLD() (KINT result) or lj_ir_knum() (KNUM result).
+** Never use lj_ir_knumint() which can have either a KINT or KNUM result.
+**
+** Requirement #2: Fold rules should not create *new* instructions which
+** reference operands *across* PHIs.
+**
+** E.g. a RETRYFOLD with 'fins->op1 = fleft->op1' is invalid if the
+** left operand is a PHI. Then fleft->op1 would point across the PHI
+** frontier to an invariant instruction. Adding a PHI for this instruction
+** would be counterproductive. The solution is to add a barrier which
+** prevents folding across PHIs, i.e. 'PHIBARRIER(fleft)' in this case.
+** The only exception is for recurrences with high latencies like
+** repeated int->num->int conversions.
+**
+** One could relax this condition a bit if the referenced instruction is
+** a PHI, too. But this often leads to worse code due to excessive
+** register shuffling.
+**
+** Note: returning *existing* instructions (e.g. LEFTFOLD) is ok, though.
+** Even returning fleft->op1 would be ok, because a new PHI will added,
+** if needed. But again, this leads to excessive register shuffling and
+** should be avoided.
+**
+** Requirement #3: The set of all fold rules must be monotonic to guarantee
+** termination.
+**
+** The goal is optimization, so one primarily wants to add strength-reducing
+** rules. This means eliminating an instruction or replacing an instruction
+** with one or more simpler instructions. Don't add fold rules which point
+** into the other direction.
+**
+** Some rules (like commutativity) do not directly reduce the strength of
+** an instruction, but enable other fold rules (e.g. by moving constants
+** to the right operand). These rules must be made unidirectional to avoid
+** cycles.
+**
+** Rule of thumb: the trace recorder expands the IR and FOLD shrinks it.
+*/
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+#define fins		(&J->fold.ins)
+#define fleft		(&J->fold.left)
+#define fright		(&J->fold.right)
+#define knumleft	(ir_knum(fleft)->n)
+#define knumright	(ir_knum(fright)->n)
+
+/* Pass IR on to next optimization in chain (FOLD). */
+#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
+
+/* Fold function type. Fastcall on x86 significantly reduces their size. */
+typedef IRRef (LJ_FASTCALL *FoldFunc)(jit_State *J);
+
+/* Macros for the fold specs, so buildvm can recognize them. */
+#define LJFOLD(x)
+#define LJFOLDX(x)
+#define LJFOLDF(name)	static TRef LJ_FASTCALL name(jit_State *J)
+/* Note: They must be at the start of a line or buildvm ignores them! */
+
+/* Barrier to prevent using operands across PHIs. */
+#define PHIBARRIER(ir)	if (irt_isphi((ir)->t)) return NEXTFOLD
+
+/* Barrier to prevent folding across a GC step.
+** GC steps can only happen at the head of a trace and at LOOP.
+** And the GC is only driven forward if there is at least one allocation.
+*/
+#define gcstep_barrier(J, ref) \
+  ((ref) < J->chain[IR_LOOP] && \
+   (J->chain[IR_TNEW] || J->chain[IR_TDUP] || \
+    J->chain[IR_SNEW] || J->chain[IR_TOSTR]))
+
+/* -- Constant folding ---------------------------------------------------- */
+
+LJFOLD(ADD KNUM KNUM)
+LJFOLD(SUB KNUM KNUM)
+LJFOLD(MUL KNUM KNUM)
+LJFOLD(DIV KNUM KNUM)
+LJFOLD(NEG KNUM KNUM)
+LJFOLD(ABS KNUM KNUM)
+LJFOLD(ATAN2 KNUM KNUM)
+LJFOLD(LDEXP KNUM KNUM)
+LJFOLD(MIN KNUM KNUM)
+LJFOLD(MAX KNUM KNUM)
+LJFOLDF(kfold_numarith)
+{
+  lua_Number a = knumleft;
+  lua_Number b = knumright;
+  /* Dispatch by opcode offset; NOTE(review): assumes these arith IR
+  ** opcodes are consecutive starting at IR_ADD (see lj_ir.h).
+  */
+  lua_Number y = lj_vm_foldarith(a, b, fins->o - IR_ADD);
+  return lj_ir_knum(J, y);
+}
+
+LJFOLD(FPMATH KNUM any)
+LJFOLDF(kfold_fpmath)
+{
+  lua_Number a = knumleft;
+  /* op2 is the IRFPMathOp literal selecting the math function. */
+  lua_Number y = lj_vm_foldfpm(a, fins->op2);
+  return lj_ir_knum(J, y);
+}
+
+LJFOLD(POWI KNUM KINT)
+LJFOLDF(kfold_powi)
+{
+  lua_Number a = knumleft;
+  lua_Number b = cast_num(fright->i);
+  lua_Number y = lj_vm_foldarith(a, b, IR_POWI - IR_ADD);
+  return lj_ir_knum(J, y);
+}
+
+/* Fold a binary integer operation on two constants.
+** Shift counts are masked with 31, so counts >= 32 cannot invoke UB.
+** NOTE(review): signed <<, >> and wraparound of +=/-= rely on the
+** two's-complement behavior of the target (impl.-defined/UB in ISO C);
+** this is intentional and must match the VM's runtime semantics.
+*/
+static int32_t kfold_intop(int32_t k1, int32_t k2, IROp op)
+{
+  switch (op) {
+  case IR_ADD: k1 += k2; break;
+  case IR_SUB: k1 -= k2; break;
+  case IR_BAND: k1 &= k2; break;
+  case IR_BOR: k1 |= k2; break;
+  case IR_BXOR: k1 ^= k2; break;
+  case IR_BSHL: k1 <<= (k2 & 31); break;
+  case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 31)); break;
+  case IR_BSAR: k1 >>= (k2 & 31); break;
+  case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 31)); break;
+  case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 31)); break;
+  default: lua_assert(0); break;
+  }
+  return k1;
+}
+
+LJFOLD(ADD KINT KINT)
+LJFOLD(SUB KINT KINT)
+LJFOLD(BAND KINT KINT)
+LJFOLD(BOR KINT KINT)
+LJFOLD(BXOR KINT KINT)
+LJFOLD(BSHL KINT KINT)
+LJFOLD(BSHR KINT KINT)
+LJFOLD(BSAR KINT KINT)
+LJFOLD(BROL KINT KINT)
+LJFOLD(BROR KINT KINT)
+LJFOLDF(kfold_intarith)
+{
+  return INTFOLD(kfold_intop(fleft->i, fright->i, (IROp)fins->o));
+}
+
+LJFOLD(BNOT KINT)
+LJFOLDF(kfold_bnot)
+{
+  return INTFOLD(~fleft->i);
+}
+
+LJFOLD(BSWAP KINT)
+LJFOLDF(kfold_bswap)
+{
+  return INTFOLD((int32_t)lj_bswap((uint32_t)fleft->i));
+}
+
+LJFOLD(TONUM KINT)
+LJFOLDF(kfold_tonum)
+{
+  return lj_ir_knum(J, cast_num(fleft->i));
+}
+
+LJFOLD(TOBIT KNUM KNUM)
+LJFOLDF(kfold_tobit)
+{
+  TValue tv;
+  /* NOTE(review): op2 is presumably the 2^52+2^51 bias constant, so the
+  ** sum leaves the 32 bit result in the low word — confirm vs. recorder.
+  */
+  tv.n = knumleft + knumright;
+  return INTFOLD((int32_t)tv.u32.lo);
+}
+
+LJFOLD(TOINT KNUM any)
+LJFOLDF(kfold_toint)
+{
+  lua_Number n = knumleft;
+  int32_t k = lj_num2int(n);
+  if (irt_isguard(fins->t) && n != cast_num(k)) {
+    /* We're about to create a guard which always fails, like TOINT +1.5.
+    ** Some pathological loops cause this during LICM, e.g.:
+    **   local x,k,t = 0,1.5,{1,[1.5]=2}
+    **   for i=1,200 do x = x+ t[k]; k = k == 1 and 1.5 or 1 end
+    **   assert(x == 300)
+    */
+    return FAILFOLD;
+  }
+  return INTFOLD(k);
+}
+
+LJFOLD(TOSTR KNUM)
+LJFOLDF(kfold_tostr_knum)
+{
+  return lj_ir_kstr(J, lj_str_fromnum(J->L, &knumleft));
+}
+
+LJFOLD(TOSTR KINT)
+LJFOLDF(kfold_tostr_kint)
+{
+  return lj_ir_kstr(J, lj_str_fromint(J->L, fleft->i));
+}
+
+LJFOLD(STRTO KGC)
+LJFOLDF(kfold_strto)
+{
+  TValue n;
+  if (lj_str_numconv(strdata(ir_kstr(fleft)), &n))
+    return lj_ir_knum(J, numV(&n));
+  return FAILFOLD;  /* Constant string never converts: guard always fails. */
+}
+
+LJFOLD(SNEW STRREF KINT)
+LJFOLDF(kfold_snew)
+{
+  if (fright->i == 0)  /* Zero length: fold to the interned empty string. */
+    return lj_ir_kstr(J, lj_str_new(J->L, "", 0));
+  PHIBARRIER(fleft);
+  /* Constant base string and offset: intern the substring right away. */
+  if (irref_isk(fleft->op1) && irref_isk(fleft->op2)) {
+    const char *s = strdata(ir_kstr(IR(fleft->op1)));
+    int32_t ofs = IR(fleft->op2)->i;
+    return lj_ir_kstr(J, lj_str_new(J->L, s+ofs, (size_t)fright->i));
+  }
+  return NEXTFOLD;
+}
+
+/* Must not use kfold_kref for numbers (could be NaN). */
+LJFOLD(EQ KNUM KNUM)
+LJFOLD(NE KNUM KNUM)
+LJFOLD(LT KNUM KNUM)
+LJFOLD(GE KNUM KNUM)
+LJFOLD(LE KNUM KNUM)
+LJFOLD(GT KNUM KNUM)
+LJFOLD(ULT KNUM KNUM)
+LJFOLD(UGE KNUM KNUM)
+LJFOLD(ULE KNUM KNUM)
+LJFOLD(UGT KNUM KNUM)
+LJFOLDF(kfold_numcomp)
+{
+  return CONDFOLD(lj_ir_numcmp(knumleft, knumright, (IROp)fins->o));
+}
+
+LJFOLD(LT KINT KINT)
+LJFOLD(GE KINT KINT)
+LJFOLD(LE KINT KINT)
+LJFOLD(GT KINT KINT)
+LJFOLD(ULT KINT KINT)
+LJFOLD(UGE KINT KINT)
+LJFOLD(ULE KINT KINT)
+LJFOLD(UGT KINT KINT)
+LJFOLD(ABC KINT KINT)
+LJFOLDF(kfold_intcomp)
+{
+  int32_t a = fleft->i, b = fright->i;
+  switch ((IROp)fins->o) {
+  case IR_LT: return CONDFOLD(a < b);
+  case IR_GE: return CONDFOLD(a >= b);
+  case IR_LE: return CONDFOLD(a <= b);
+  case IR_GT: return CONDFOLD(a > b);
+  case IR_ULT: return CONDFOLD((uint32_t)a < (uint32_t)b);
+  case IR_UGE: return CONDFOLD((uint32_t)a >= (uint32_t)b);
+  case IR_ULE: return CONDFOLD((uint32_t)a <= (uint32_t)b);
+  case IR_ABC:  /* Array bounds check folds like an unsigned compare. */
+  case IR_UGT: return CONDFOLD((uint32_t)a > (uint32_t)b);
+  default: lua_assert(0); return FAILFOLD;
+  }
+}
+
+LJFOLD(LT KGC KGC)
+LJFOLD(GE KGC KGC)
+LJFOLD(LE KGC KGC)
+LJFOLD(GT KGC KGC)
+LJFOLDF(kfold_strcomp)
+{
+  /* Only fold string/string compares; other KGC kinds fall through. */
+  if (irt_isstr(fins->t)) {
+    GCstr *a = ir_kstr(fleft);
+    GCstr *b = ir_kstr(fright);
+    return CONDFOLD(lj_ir_strcmp(a, b, (IROp)fins->o));
+  }
+  return NEXTFOLD;
+}
+
+/* Don't constant-fold away FLOAD checks against KNULL. */
+LJFOLD(EQ FLOAD KNULL)
+LJFOLD(NE FLOAD KNULL)
+LJFOLDX(lj_opt_cse)
+
+/* But fold all other KNULL compares, since only KNULL is equal to KNULL. */
+LJFOLD(EQ any KNULL)
+LJFOLD(NE any KNULL)
+LJFOLD(EQ KNULL any)
+LJFOLD(NE KNULL any)
+LJFOLD(EQ KINT KINT)  /* Constants are unique, so same refs <==> same value. */
+LJFOLD(NE KINT KINT)
+LJFOLD(EQ KGC KGC)
+LJFOLD(NE KGC KGC)
+LJFOLDF(kfold_kref)
+{
+  /* Reference equality decides; the xor inverts the result for NE. */
+  return CONDFOLD((fins->op1 == fins->op2) ^ (fins->o == IR_NE));
+}
+
+/* -- Algebraic shortcuts ------------------------------------------------- */
+
+LJFOLD(FPMATH FPMATH IRFPM_FLOOR)
+LJFOLD(FPMATH FPMATH IRFPM_CEIL)
+LJFOLD(FPMATH FPMATH IRFPM_TRUNC)
+LJFOLDF(shortcut_round)
+{
+  IRFPMathOp op = (IRFPMathOp)fleft->op2;
+  if (op == IRFPM_FLOOR || op == IRFPM_CEIL || op == IRFPM_TRUNC)
+    return LEFTFOLD;  /* round(round_left(x)) = round_left(x) */
+  return NEXTFOLD;
+}
+
+LJFOLD(FPMATH TONUM IRFPM_FLOOR)
+LJFOLD(FPMATH TONUM IRFPM_CEIL)
+LJFOLD(FPMATH TONUM IRFPM_TRUNC)
+LJFOLD(ABS ABS KNUM)
+LJFOLDF(shortcut_left)
+{
+  /* Result of g(x) is already in canonical form for f: drop f. */
+  return LEFTFOLD;  /* f(g(x)) ==> g(x) */
+}
+
+LJFOLD(ABS NEG KNUM)
+LJFOLDF(shortcut_dropleft)
+{
+  PHIBARRIER(fleft);
+  fins->op1 = fleft->op1;  /* abs(neg(x)) ==> abs(x) */
+  return RETRYFOLD;
+}
+
+/* Note: no safe shortcuts with STRTO and TOSTR ("1e2" ==> +100 ==> "100"). */
+LJFOLD(NEG NEG KNUM)
+LJFOLD(BNOT BNOT)
+LJFOLD(BSWAP BSWAP)
+LJFOLDF(shortcut_leftleft)
+{
+  PHIBARRIER(fleft);  /* See above. Fold would be ok, but not beneficial. */
+  return fleft->op1;  /* f(g(x)) ==> x */
+}
+
+LJFOLD(TONUM TOINT)
+LJFOLDF(shortcut_leftleft_toint)
+{
+  PHIBARRIER(fleft);
+  if (irt_isguard(fleft->t))  /* Only safe with a guarded TOINT. */
+    return fleft->op1;  /* f(g(x)) ==> x */
+  return NEXTFOLD;
+}
+
+LJFOLD(TOINT TONUM any)
+LJFOLD(TOBIT TONUM KNUM)  /* The inverse must NOT be shortcut! */
+LJFOLDF(shortcut_leftleft_across_phi)
+{
+  /* Fold even across PHI to avoid expensive int->num->int conversions. */
+  return fleft->op1;  /* f(g(x)) ==> x */
+}
+
+/* -- FP algebraic simplifications ---------------------------------------- */
+
+/* FP arithmetic is tricky -- there's not much to simplify.
+** Please note the following common pitfalls before sending "improvements":
+**   x+0 ==> x  is INVALID for x=-0
+**   0-x ==> -x is INVALID for x=+0
+**   x*0 ==> 0  is INVALID for x=-0, x=+-Inf or x=NaN
+*/
+
+LJFOLD(ADD NEG any)
+LJFOLDF(simplify_numadd_negx)
+{
+  PHIBARRIER(fleft);
+  fins->o = IR_SUB;  /* (-a) + b ==> b - a */
+  fins->op1 = fins->op2;
+  fins->op2 = fleft->op1;
+  return RETRYFOLD;
+}
+
+LJFOLD(ADD any NEG)
+LJFOLDF(simplify_numadd_xneg)
+{
+  PHIBARRIER(fright);
+  fins->o = IR_SUB;  /* a + (-b) ==> a - b */
+  fins->op2 = fright->op1;
+  return RETRYFOLD;
+}
+
+LJFOLD(SUB any KNUM)
+LJFOLDF(simplify_numsub_k)
+{
+  /* Unlike x+0, x-0 is safe: it preserves the sign of zero. */
+  lua_Number n = knumright;
+  if (n == 0.0)  /* x - (+-0) ==> x */
+    return LEFTFOLD;
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB NEG KNUM)
+LJFOLDF(simplify_numsub_negk)
+{
+  PHIBARRIER(fleft);
+  fins->op2 = fleft->op1;  /* (-x) - k ==> (-k) - x */
+  fins->op1 = (IRRef1)lj_ir_knum(J, -knumright);
+  return RETRYFOLD;
+}
+
+LJFOLD(SUB any NEG)
+LJFOLDF(simplify_numsub_xneg)
+{
+  PHIBARRIER(fright);
+  fins->o = IR_ADD;  /* a - (-b) ==> a + b */
+  fins->op2 = fright->op1;
+  return RETRYFOLD;
+}
+
+LJFOLD(MUL any KNUM)
+LJFOLD(DIV any KNUM)
+LJFOLDF(simplify_nummuldiv_k)
+{
+  lua_Number n = knumright;
+  if (n == 1.0) {  /* x o 1 ==> x */
+    return LEFTFOLD;
+  } else if (n == -1.0) {  /* x o -1 ==> -x */
+    fins->o = IR_NEG;
+    fins->op2 = (IRRef1)lj_ir_knum_neg(J);
+    return RETRYFOLD;
+  } else if (fins->o == IR_MUL && n == 2.0) {  /* x * 2 ==> x + x */
+    fins->o = IR_ADD;
+    fins->op2 = fins->op1;
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(MUL NEG KNUM)
+LJFOLD(DIV NEG KNUM)
+LJFOLDF(simplify_nummuldiv_negk)
+{
+  PHIBARRIER(fleft);
+  fins->op1 = fleft->op1;  /* (-a) o k ==> a o (-k) */
+  fins->op2 = (IRRef1)lj_ir_knum(J, -knumright);
+  return RETRYFOLD;
+}
+
+LJFOLD(MUL NEG NEG)
+LJFOLD(DIV NEG NEG)
+LJFOLDF(simplify_nummuldiv_negneg)
+{
+  PHIBARRIER(fleft);
+  PHIBARRIER(fright);
+  fins->op1 = fleft->op1;  /* (-a) o (-b) ==> a o b */
+  fins->op2 = fright->op1;
+  return RETRYFOLD;
+}
+
+LJFOLD(POWI any KINT)
+LJFOLDF(simplify_powi_xk)
+{
+  /* Expand x^k into a multiplication chain via square-and-multiply. */
+  int32_t k = fright->i;
+  TRef ref = fins->op1;
+  if (k == 0)  /* x ^ 0 ==> 1 */
+    return lj_ir_knum_one(J);  /* Result must be a number, not an int. */
+  if (k == 1)  /* x ^ 1 ==> x */
+    return LEFTFOLD;
+  if ((uint32_t)(k+65536) > 2*65536u)  /* Limit code explosion. */
+    return NEXTFOLD;
+  if (k < 0) {  /* x ^ (-k) ==> (1/x) ^ k. */
+    ref = emitir(IRTN(IR_DIV), lj_ir_knum_one(J), ref);
+    k = -k;
+  }
+  /* Unroll x^k for 1 <= k <= 65536. */
+  for (; (k & 1) == 0; k >>= 1)  /* Handle leading zeros. */
+    ref = emitir(IRTN(IR_MUL), ref, ref);
+  if ((k >>= 1) != 0) {  /* Handle trailing bits. */
+    TRef tmp = emitir(IRTN(IR_MUL), ref, ref);
+    for (; k != 1; k >>= 1) {
+      if (k & 1)
+	ref = emitir(IRTN(IR_MUL), ref, tmp);
+      tmp = emitir(IRTN(IR_MUL), tmp, tmp);
+    }
+    ref = emitir(IRTN(IR_MUL), ref, tmp);
+  }
+  return ref;
+}
+
+LJFOLD(POWI KNUM any)
+LJFOLDF(simplify_powi_kx)
+{
+  lua_Number n = knumleft;
+  if (n == 2.0) {  /* 2.0 ^ i ==> ldexp(1.0, tonum(i)) */
+    /* Temporarily rewrite fins to a TONUM of the exponent and fold that
+    ** recursively, then rebuild fins as the LDEXP using its result.
+    */
+    fins->o = IR_TONUM;
+    fins->op1 = fins->op2;
+    fins->op2 = 0;
+    fins->op2 = (IRRef1)lj_opt_fold(J);
+    fins->op1 = (IRRef1)lj_ir_knum_one(J);
+    fins->o = IR_LDEXP;
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+/* -- FP conversion narrowing --------------------------------------------- */
+
+LJFOLD(TOINT ADD any)
+LJFOLD(TOINT SUB any)
+LJFOLD(TOBIT ADD KNUM)
+LJFOLD(TOBIT SUB KNUM)
+LJFOLDF(narrow_convert)
+{
+  PHIBARRIER(fleft);
+  /* Narrowing ignores PHIs and repeating it inside the loop is not useful. */
+  if (J->chain[IR_LOOP])
+    return NEXTFOLD;
+  return lj_opt_narrow_convert(J);
+}
+
+/* Relaxed CSE rule for TOINT allows commoning with stronger checks, too. */
+LJFOLD(TOINT any any)
+LJFOLDF(cse_toint)
+{
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
+    IRRef ref, op1 = fins->op1;
+    uint8_t guard = irt_isguard(fins->t);
+    /* A guarded TOINT of the same operand subsumes an unguarded one. */
+    for (ref = J->chain[IR_TOINT]; ref > op1; ref = IR(ref)->prev)
+      if (IR(ref)->op1 == op1 && irt_isguard(IR(ref)->t) >= guard)
+	return ref;
+  }
+  return EMITFOLD;  /* No fallthrough to regular CSE. */
+}
+
+/* -- Integer algebraic simplifications ----------------------------------- */
+
+LJFOLD(ADD any KINT)
+LJFOLD(ADDOV any KINT)
+LJFOLD(SUBOV any KINT)
+LJFOLDF(simplify_intadd_k)
+{
+  if (fright->i == 0)  /* i o 0 ==> i */
+    return LEFTFOLD;
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB any KINT)
+LJFOLDF(simplify_intsub_k)
+{
+  if (fright->i == 0)  /* i - 0 ==> i */
+    return LEFTFOLD;
+  /* Canonicalize to ADD so the commutation/reassociation rules apply. */
+  fins->o = IR_ADD;  /* i - k ==> i + (-k) */
+  fins->op2 = (IRRef1)lj_ir_kint(J, -fright->i);  /* Overflow for -2^31 ok. */
+  return RETRYFOLD;
+}
+
+LJFOLD(SUB any any)
+LJFOLD(SUBOV any any)
+LJFOLDF(simplify_intsub)
+{
+  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))  /* i - i ==> 0 */
+    return INTFOLD(0);
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB ADD any)
+LJFOLDF(simplify_intsubadd_leftcancel)
+{
+  if (!irt_isnum(fins->t)) {
+    PHIBARRIER(fleft);
+    if (fins->op2 == fleft->op1)  /* (i + j) - i ==> j */
+      return fleft->op2;
+    if (fins->op2 == fleft->op2)  /* (i + j) - j ==> i */
+      return fleft->op1;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB SUB any)
+LJFOLDF(simplify_intsubsub_leftcancel)
+{
+  if (!irt_isnum(fins->t)) {
+    PHIBARRIER(fleft);
+    if (fins->op1 == fleft->op1) {  /* (i - j) - i ==> 0 - j */
+      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
+      fins->op2 = fleft->op2;
+      return RETRYFOLD;
+    }
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB any SUB)
+LJFOLDF(simplify_intsubsub_rightcancel)
+{
+  if (!irt_isnum(fins->t)) {
+    PHIBARRIER(fright);
+    if (fins->op1 == fright->op1)  /* i - (i - j) ==> j */
+      return fright->op2;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB any ADD)
+LJFOLDF(simplify_intsubadd_rightcancel)
+{
+  if (!irt_isnum(fins->t)) {
+    PHIBARRIER(fright);
+    if (fins->op1 == fright->op1) {  /* i - (i + j) ==> 0 - j */
+      fins->op2 = fright->op2;
+      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
+      return RETRYFOLD;
+    }
+    if (fins->op1 == fright->op2) {  /* i - (j + i) ==> 0 - j */
+      fins->op2 = fright->op1;
+      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
+      return RETRYFOLD;
+    }
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(SUB ADD ADD)
+LJFOLDF(simplify_intsubaddadd_cancel)
+{
+  /* Cancel a common addend across both sides, all four positions. */
+  if (!irt_isnum(fins->t)) {
+    PHIBARRIER(fleft);
+    PHIBARRIER(fright);
+    if (fleft->op1 == fright->op1) {  /* (i + j1) - (i + j2) ==> j1 - j2 */
+      fins->op1 = fleft->op2;
+      fins->op2 = fright->op2;
+      return RETRYFOLD;
+    }
+    if (fleft->op1 == fright->op2) {  /* (i + j1) - (j2 + i) ==> j1 - j2 */
+      fins->op1 = fleft->op2;
+      fins->op2 = fright->op1;
+      return RETRYFOLD;
+    }
+    if (fleft->op2 == fright->op1) {  /* (j1 + i) - (i + j2) ==> j1 - j2 */
+      fins->op1 = fleft->op1;
+      fins->op2 = fright->op2;
+      return RETRYFOLD;
+    }
+    if (fleft->op2 == fright->op2) {  /* (j1 + i) - (j2 + i) ==> j1 - j2 */
+      fins->op1 = fleft->op1;
+      fins->op2 = fright->op1;
+      return RETRYFOLD;
+    }
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(BAND any KINT)
+LJFOLDF(simplify_band_k)
+{
+  if (fright->i == 0)  /* i & 0 ==> 0 */
+    return RIGHTFOLD;
+  if (fright->i == -1)  /* i & -1 ==> i */
+    return LEFTFOLD;
+  return NEXTFOLD;
+}
+
+LJFOLD(BOR any KINT)
+LJFOLDF(simplify_bor_k)
+{
+  if (fright->i == 0)  /* i | 0 ==> i */
+    return LEFTFOLD;
+  if (fright->i == -1)  /* i | -1 ==> -1 */
+    return RIGHTFOLD;
+  return NEXTFOLD;
+}
+
+LJFOLD(BXOR any KINT)
+LJFOLDF(simplify_bxor_k)
+{
+  if (fright->i == 0)  /* i xor 0 ==> i */
+    return LEFTFOLD;
+  if (fright->i == -1) {  /* i xor -1 ==> ~i */
+    fins->o = IR_BNOT;
+    fins->op2 = 0;
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(BSHL any KINT)
+LJFOLD(BSHR any KINT)
+LJFOLD(BSAR any KINT)
+LJFOLD(BROL any KINT)
+LJFOLD(BROR any KINT)
+LJFOLDF(simplify_shift_ik)
+{
+  /* Shift counts are taken modulo 32. */
+  int32_t k = (fright->i & 31);
+  if (k == 0)  /* i o 0 ==> i */
+    return LEFTFOLD;
+  if (k != fright->i) {  /* i o k ==> i o (k & 31) */
+    fins->op2 = (IRRef1)lj_ir_kint(J, k);
+    return RETRYFOLD;
+  }
+  if (fins->o == IR_BROR) {  /* bror(i, k) ==> brol(i, (-k)&31) */
+    fins->o = IR_BROL;
+    fins->op2 = (IRRef1)lj_ir_kint(J, (-k)&31);
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(BSHL any BAND)
+LJFOLD(BSHR any BAND)
+LJFOLD(BSAR any BAND)
+LJFOLD(BROL any BAND)
+LJFOLD(BROR any BAND)
+LJFOLDF(simplify_shift_andk)
+{
+#if LJ_TARGET_MASKEDSHIFT
+  /* Target hardware masks shift counts itself, so drop a redundant & 31. */
+  IRIns *irk = IR(fright->op2);
+  PHIBARRIER(fright);
+  if (irk->o == IR_KINT) {  /* i o (j & 31) ==> i o j */
+    int32_t k = irk->i & 31;
+    if (k == 31) {
+      fins->op2 = fright->op1;
+      return RETRYFOLD;
+    }
+  }
+#endif
+  return NEXTFOLD;
+}
+
+LJFOLD(BSHL KINT any)
+LJFOLD(BSHR KINT any)
+LJFOLDF(simplify_shift1_ki)
+{
+  if (fleft->i == 0)  /* 0 o i ==> 0 */
+    return LEFTFOLD;
+  return NEXTFOLD;
+}
+
+LJFOLD(BSAR KINT any)
+LJFOLD(BROL KINT any)
+LJFOLD(BROR KINT any)
+LJFOLDF(simplify_shift2_ki)
+{
+  /* 0 and -1 are fixed points of arithmetic shifts and rotates. */
+  if (fleft->i == 0 || fleft->i == -1)  /* 0 o i ==> 0; -1 o i ==> -1 */
+    return LEFTFOLD;
+  return NEXTFOLD;
+}
+
+/* -- Reassociation ------------------------------------------------------- */
+
+LJFOLD(ADD ADD KINT)
+LJFOLD(BAND BAND KINT)
+LJFOLD(BOR BOR KINT)
+LJFOLD(BXOR BXOR KINT)
+LJFOLDF(reassoc_intarith_k)
+{
+  IRIns *irk = IR(fleft->op2);
+  if (irk->o == IR_KINT) {
+    int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
+    if (k == irk->i)  /* (i o k1) o k2 ==> i o k1, if (k1 o k2) == k1. */
+      return LEFTFOLD;
+    PHIBARRIER(fleft);
+    fins->op1 = fleft->op1;
+    fins->op2 = (IRRef1)lj_ir_kint(J, k);
+    return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(MIN MIN any)
+LJFOLD(MAX MAX any)
+LJFOLD(BAND BAND any)
+LJFOLD(BOR BOR any)
+LJFOLDF(reassoc_dup)
+{
+  /* These ops are idempotent, so a repeated operand can be dropped. */
+  PHIBARRIER(fleft);
+  if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
+    return LEFTFOLD;  /* (a o b) o a ==> a o b; (a o b) o b ==> a o b */
+  return NEXTFOLD;
+}
+
+LJFOLD(BXOR BXOR any)
+LJFOLDF(reassoc_bxor)
+{
+  /* xor is self-inverse, so a repeated operand cancels out. */
+  PHIBARRIER(fleft);
+  if (fins->op2 == fleft->op1)  /* (a xor b) xor a ==> b */
+    return fleft->op2;
+  if (fins->op2 == fleft->op2)  /* (a xor b) xor b ==> a */
+    return fleft->op1;
+  return NEXTFOLD;
+}
+
+LJFOLD(BSHL BSHL KINT)
+LJFOLD(BSHR BSHR KINT)
+LJFOLD(BSAR BSAR KINT)
+LJFOLD(BROL BROL KINT)
+LJFOLD(BROR BROR KINT)
+LJFOLDF(reassoc_shift)
+{
+  IRIns *irk = IR(fleft->op2);
+  PHIBARRIER(fleft);  /* The (shift any KINT) rule covers k2 == 0 and more. */
+  if (irk->o == IR_KINT) {  /* (i o k1) o k2 ==> i o (k1 + k2) */
+    int32_t k = (irk->i & 31) + (fright->i & 31);
+    if (k > 31) {  /* Combined shift too wide? */
+      if (fins->o == IR_BSHL || fins->o == IR_BSHR)
+	return INTFOLD(0);
+      else if (fins->o == IR_BSAR)
+	k = 31;  /* Saturates: result is all sign bits. */
+      else
+	k &= 31;  /* Rotates wrap around. */
+    }
+    fins->op1 = fleft->op1;
+    fins->op2 = (IRRef1)lj_ir_kint(J, k);
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(MIN MIN KNUM)
+LJFOLD(MAX MAX KNUM)
+LJFOLDF(reassoc_minmax_k)
+{
+  IRIns *irk = IR(fleft->op2);
+  if (irk->o == IR_KNUM) {
+    lua_Number a = ir_knum(irk)->n;
+    lua_Number b = knumright;
+    lua_Number y = lj_vm_foldarith(a, b, fins->o - IR_ADD);
+    if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
+      return LEFTFOLD;
+    PHIBARRIER(fleft);
+    fins->op1 = fleft->op1;
+    fins->op2 = (IRRef1)lj_ir_knum(J, y);
+    return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(MIN MAX any)
+LJFOLD(MAX MIN any)
+LJFOLDF(reassoc_minmax_left)
+{
+  if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
+    return RIGHTFOLD;  /* (b o1 a) o2 b ==> b; (a o1 b) o2 b ==> b */
+  return NEXTFOLD;
+}
+
+LJFOLD(MIN any MAX)
+LJFOLD(MAX any MIN)
+LJFOLDF(reassoc_minmax_right)
+{
+  if (fins->op1 == fright->op1 || fins->op1 == fright->op2)
+    return LEFTFOLD;  /* a o2 (a o1 b) ==> a; a o2 (b o1 a) ==> a */
+  return NEXTFOLD;
+}
+
+/* Eliminate ABC across PHIs to handle t[i-1] forwarding case.
+** ABC(asize, (i+k)+(-k)) ==> ABC(asize, i), but only if it already exists.
+** Could be generalized to (i+k1)+k2 ==> i+(k1+k2), but needs better disambig.
+*/
+LJFOLD(ABC any ADD)
+LJFOLDF(reassoc_abc)
+{
+  if (irref_isk(fright->op2)) {
+    IRIns *add2 = IR(fright->op1);
+    if (add2->o == IR_ADD && irref_isk(add2->op2) &&
+	IR(fright->op2)->i == -IR(add2->op2)->i) {
+      /* Search the ABC chain for an existing check of the inner index. */
+      IRRef ref = J->chain[IR_ABC];
+      IRRef lim = add2->op1;
+      if (fins->op1 > lim) lim = fins->op1;
+      while (ref > lim) {
+	IRIns *ir = IR(ref);
+	if (ir->op1 == fins->op1 && ir->op2 == add2->op1)
+	  return DROPFOLD;
+	ref = ir->prev;
+      }
+    }
+  }
+  return NEXTFOLD;
+}
+
+/* -- Commutativity ------------------------------------------------------- */
+
+/* The refs of commutative ops are canonicalized. Lower refs go to the right.
+** Rationale behind this:
+** - It (also) moves constants to the right.
+** - It reduces the number of FOLD rules (e.g. (BOR any KINT) suffices).
+** - It helps CSE to find more matches.
+** - The assembler generates better code with constants at the right.
+*/
+
+LJFOLD(ADD any any)
+LJFOLD(MUL any any)
+LJFOLD(ADDOV any any)
+LJFOLDF(comm_swap)
+{
+  if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
+    IRRef1 tmp = fins->op1;
+    fins->op1 = fins->op2;
+    fins->op2 = tmp;
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(EQ any any)
+LJFOLD(NE any any)
+LJFOLDF(comm_equal)
+{
+  /* For non-numbers only: x == x ==> drop; x ~= x ==> fail */
+  /* Numbers are excluded because of NaN (NaN ~= NaN). */
+  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
+    return CONDFOLD(fins->o == IR_EQ);
+  return comm_swap(J);
+}
+
+LJFOLD(LT any any)
+LJFOLD(GE any any)
+LJFOLD(LE any any)
+LJFOLD(GT any any)
+LJFOLD(ULT any any)
+LJFOLD(UGE any any)
+LJFOLD(ULE any any)
+LJFOLD(UGT any any)
+LJFOLDF(comm_comp)
+{
+  /* For non-numbers only: x <=> x ==> drop; x <> x ==> fail */
+  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
+    return CONDFOLD(fins->o & 1);
+  if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
+    IRRef1 tmp = fins->op1;
+    fins->op1 = fins->op2;
+    fins->op2 = tmp;
+    fins->o ^= 3; /* GT <-> LT, GE <-> LE, does not affect U */
+    return RETRYFOLD;
+  }
+  return NEXTFOLD;
+}
+
+LJFOLD(BAND any any)
+LJFOLD(BOR any any)
+LJFOLD(MIN any any)
+LJFOLD(MAX any any)
+LJFOLDF(comm_dup)
+{
+  if (fins->op1 == fins->op2)  /* x o x ==> x */
+    return LEFTFOLD;
+  return comm_swap(J);
+}
+
+LJFOLD(BXOR any any)
+LJFOLDF(comm_bxor)
+{
+  if (fins->op1 == fins->op2)  /* i xor i ==> 0 */
+    return INTFOLD(0);
+  return comm_swap(J);
+}
+
+/* -- Simplification of compound expressions ------------------------------ */
+
+/* Load a constant integer of the width/signedness given by ir->t from p.
+** Note: p may be unaligned, hence the target restriction below.
+*/
+static int32_t kfold_xload(IRIns *ir, const void *p)
+{
+#if !LJ_TARGET_X86ORX64
+#error "Missing support for unaligned loads"
+#endif
+  switch (irt_type(ir->t)) {
+  case IRT_I8: return (int32_t)*(int8_t *)p;
+  case IRT_U8: return (int32_t)*(uint8_t *)p;
+  case IRT_I16: return (int32_t)*(int16_t *)p;
+  case IRT_U16: return (int32_t)*(uint16_t *)p;
+  default: lua_assert(irt_isint(ir->t)); return (int32_t)*(int32_t *)p;
+  }
+}
+
+/* Turn: string.sub(str, a, b) == kstr
+** into: string.byte(str, a) == string.byte(kstr, 1) etc.
+** Note: this creates unaligned XLOADs!
+*/
+LJFOLD(EQ SNEW KGC)
+LJFOLD(NE SNEW KGC)
+LJFOLDF(merge_eqne_snew_kgc)
+{
+  GCstr *kstr = ir_kstr(fright);
+  int32_t len = (int32_t)kstr->len;
+  lua_assert(irt_isstr(fins->t));
+  if (len <= 4) {  /* Handle string lengths 0, 1, 2, 3, 4. */
+    IROp op = (IROp)fins->o;
+    IRRef strref = fleft->op1;
+    lua_assert(IR(strref)->o == IR_STRREF);
+    if (op == IR_EQ) {
+      /* Guard on matching length first, then compare the bytes below. */
+      emitir(IRTGI(IR_EQ), fleft->op2, lj_ir_kint(J, len));
+      /* Caveat: fins/fleft/fright is no longer valid after emitir. */
+    } else {
+      /* NE is not expanded since this would need an OR of two conds. */
+      if (!irref_isk(fleft->op2))  /* Only handle the constant length case. */
+	return NEXTFOLD;
+      if (IR(fleft->op2)->i != len)
+	return DROPFOLD;
+    }
+    if (len > 0) {
+      /* A 4 byte load for length 3 is ok -- all strings have an extra NUL. */
+      uint16_t ot = (uint16_t)(len == 1 ? IRT(IR_XLOAD, IRT_I8) :
+			       len == 2 ? IRT(IR_XLOAD, IRT_U16) :
+			       IRTI(IR_XLOAD));
+      TRef tmp = emitir(ot, strref, len > 1 ? IRXLOAD_UNALIGNED : 0);
+      TRef val = lj_ir_kint(J, kfold_xload(IR(tref_ref(tmp)), strdata(kstr)));
+      if (len == 3)
+	tmp = emitir(IRTI(IR_BAND), tmp,
+		     lj_ir_kint(J, LJ_ENDIAN_SELECT(0x00ffffff, 0xffffff00)));
+      fins->op1 = (IRRef1)tmp;
+      fins->op2 = (IRRef1)val;
+      fins->ot = (IROpT)IRTGI(op);
+      return RETRYFOLD;
+    } else {
+      return DROPFOLD;  /* Lengths matched and both strings are empty. */
+    }
+  }
+  return NEXTFOLD;
+}
+
+/* -- Loads --------------------------------------------------------------- */
+
+/* Loads cannot be folded or passed on to CSE in general.
+** Alias analysis is needed to check for forwarding opportunities.
+**
+** Caveat: *all* loads must be listed here or they end up at CSE!
+*/
+
+LJFOLD(ALOAD any)
+LJFOLDX(lj_opt_fwd_aload)
+
+LJFOLD(HLOAD any)
+LJFOLDX(lj_opt_fwd_hload)
+
+LJFOLD(ULOAD any)
+LJFOLDX(lj_opt_fwd_uload)
+
+LJFOLD(TLEN any)
+LJFOLDX(lj_opt_fwd_tlen)
+
+/* Upvalue refs are really loads, but there are no corresponding stores.
+** So CSE is ok for them, except for UREFO across a GC step (see below).
+** If the referenced function is const, its upvalue addresses are const, too.
+** This can be used to improve CSE by looking for the same address,
+** even if the upvalues originate from a different function.
+*/
+LJFOLD(UREFO KGC any)
+LJFOLD(UREFC KGC any)
+LJFOLDF(cse_uref)
+{
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
+    IRRef ref = J->chain[fins->o];
+    GCfunc *fn = ir_kfunc(fleft);
+    GCupval *uv = gco2uv(gcref(fn->l.uvptr[fins->op2]));
+    /* Walk the chain for a ref of the *same upvalue object*, which may
+    ** have been reached through a different constant function.
+    */
+    while (ref > 0) {
+      IRIns *ir = IR(ref);
+      if (irref_isk(ir->op1)) {
+	GCfunc *fn2 = ir_kfunc(IR(ir->op1));
+	if (gco2uv(gcref(fn2->l.uvptr[ir->op2])) == uv) {
+	  if (fins->o == IR_UREFO && gcstep_barrier(J, ref))
+	    break;
+	  return ref;
+	}
+      }
+      ref = ir->prev;
+    }
+  }
+  return EMITFOLD;
+}
+
+/* We can safely FOLD/CSE array/hash refs and field loads, since there
+** are no corresponding stores. But NEWREF may invalidate all of them.
+** Lacking better disambiguation for table references, these optimizations
+** are simply disabled across any NEWREF.
+** Only HREF needs the NEWREF check -- AREF and HREFK already depend on
+** FLOADs. And NEWREF itself is treated like a store (see below).
+*/
+LJFOLD(HREF any any)
+LJFOLDF(cse_href)
+{
+  /* Only accept a CSE match emitted after the last NEWREF. */
+  TRef tr = lj_opt_cse(J);
+  return tref_ref(tr) < J->chain[IR_NEWREF] ? EMITFOLD : tr;
+}
+
+LJFOLD(FLOAD TNEW IRFL_TAB_ASIZE)
+LJFOLDF(fload_tab_tnew_asize)
+{
+  /* TNEW carries the array size in op1. */
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fins->op1 > J->chain[IR_NEWREF])
+    return INTFOLD(fleft->op1);
+  return NEXTFOLD;
+}
+
+LJFOLD(FLOAD TNEW IRFL_TAB_HMASK)
+LJFOLDF(fload_tab_tnew_hmask)
+{
+  /* TNEW carries log2 of the hash size in op2. */
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fins->op1 > J->chain[IR_NEWREF])
+    return INTFOLD((1 << fleft->op2)-1);
+  return NEXTFOLD;
+}
+
+LJFOLD(FLOAD TDUP IRFL_TAB_ASIZE)
+LJFOLDF(fload_tab_tdup_asize)
+{
+  /* TDUP copies a template table constant: read the size from it. */
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fins->op1 > J->chain[IR_NEWREF])
+    return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->asize);
+  return NEXTFOLD;
+}
+
+LJFOLD(FLOAD TDUP IRFL_TAB_HMASK)
+LJFOLDF(fload_tab_tdup_hmask)
+{
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fins->op1 > J->chain[IR_NEWREF])
+    return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->hmask);
+  return NEXTFOLD;
+}
+
+LJFOLD(FLOAD any IRFL_TAB_ARRAY)
+LJFOLD(FLOAD any IRFL_TAB_NODE)
+LJFOLD(FLOAD any IRFL_TAB_ASIZE)
+LJFOLD(FLOAD any IRFL_TAB_HMASK)
+LJFOLDF(fload_tab_ah)
+{
+  /* Same NEWREF restriction as cse_href above. */
+  TRef tr = lj_opt_cse(J);
+  return tref_ref(tr) < J->chain[IR_NEWREF] ? EMITFOLD : tr;
+}
+
+/* Strings are immutable, so we can safely FOLD/CSE the related FLOAD. */
+LJFOLD(FLOAD KGC IRFL_STR_LEN)
+LJFOLDF(fload_str_len)
+{
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
+    return INTFOLD((int32_t)ir_kstr(fleft)->len);
+  return NEXTFOLD;
+}
+
+LJFOLD(FLOAD any IRFL_STR_LEN)
+LJFOLDX(lj_opt_cse)
+
+/* All other field loads need alias analysis. */
+LJFOLD(FLOAD any any)
+LJFOLDX(lj_opt_fwd_fload)
+
+/* This is for LOOP only. Recording handles SLOADs internally. */
+LJFOLD(SLOAD any any)
+LJFOLDF(fwd_sload)
+{
+  /* Substitute the ref recorded for this stack slot. */
+  lua_assert(J->slot[fins->op1] != 0);
+  return J->slot[fins->op1];
+}
+
+/* Strings are immutable, so we can safely FOLD/CSE an XLOAD of a string. */
+LJFOLD(XLOAD STRREF any)
+LJFOLDF(xload_str)
+{
+  if (irref_isk(fleft->op1) && irref_isk(fleft->op2)) {
+    /* Constant string and constant offset: read the bytes right away. */
+    GCstr *str = ir_kstr(IR(fleft->op1));
+    int32_t ofs = IR(fleft->op2)->i;
+    lua_assert((MSize)ofs < str->len);
+    lua_assert((MSize)(ofs + (1<<((fins->op2>>8)&3))) <= str->len);
+    return INTFOLD(kfold_xload(fins, strdata(str)+ofs));
+  }
+  return CSEFOLD;
+}
+/* No XLOAD of non-strings (yet), so we don't need a (XLOAD any any) rule. */
+
+/* -- Write barriers ------------------------------------------------------ */
+
+/* Write barriers are amenable to CSE, but not across any incremental
+** GC steps.
+**
+** The same logic applies to open upvalue references, because the stack
+** may be resized during a GC step.
+*/
+LJFOLD(TBAR any)
+LJFOLD(OBAR any any)
+LJFOLD(UREFO any any)
+LJFOLDF(barrier_tab)
+{
+  TRef tr = lj_opt_cse(J);
+  if (gcstep_barrier(J, tref_ref(tr)))  /* CSE across GC step? */
+    return EMITFOLD;  /* Raw emit. Assumes fins is left intact by CSE. */
+  return tr;
+}
+
+LJFOLD(TBAR TNEW)
+LJFOLD(TBAR TDUP)
+LJFOLDF(barrier_tnew_tdup)
+{
+  /* New tables are always white and never need a barrier. */
+  if (fins->op1 < J->chain[IR_LOOP])  /* Except across a GC step. */
+    return NEXTFOLD;
+  return DROPFOLD;
+}
+
+/* -- Stores and allocations ---------------------------------------------- */
+
+/* Stores and allocations cannot be folded or passed on to CSE in general.
+** But some stores can be eliminated with dead-store elimination (DSE).
+**
+** Caveat: *all* stores and allocs must be listed here or they end up at CSE!
+*/
+
+LJFOLD(ASTORE any any)
+LJFOLD(HSTORE any any)
+LJFOLDX(lj_opt_dse_ahstore)
+
+LJFOLD(USTORE any any)
+LJFOLDX(lj_opt_dse_ustore)
+
+LJFOLD(FSTORE any any)
+LJFOLDX(lj_opt_dse_fstore)
+
+LJFOLD(NEWREF any any)  /* Treated like a store. */
+LJFOLD(TNEW any any)
+LJFOLD(TDUP any)
+LJFOLDF(store_raw)
+{
+  /* Always emit; never folded, never subject to CSE. */
+  return EMITFOLD;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Every entry in the generated hash table is a 32 bit pattern:
+**
+** xxxxxxxx iiiiiiii llllllll rrrrrrrr
+**
+** xxxxxxxx = 8 bit index into fold function table
+** iiiiiiii = 8 bit folded instruction opcode
+** llllllll = 8 bit left instruction opcode
+** rrrrrrrr = 8 bit right instruction opcode or 8 bits from literal field
+*/
+
+#include "lj_folddef.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* Fold IR instruction.
+** Entry point of the fold engine: dispatches the pending instruction in
+** fins through the generated semi-perfect hash table of fold rules and
+** processes the special return codes (NEXTFOLD/RETRYFOLD/etc.).
+*/
+TRef LJ_FASTCALL lj_opt_fold(jit_State *J)
+{
+  uint32_t key, any;
+  IRRef ref;
+
+  if (LJ_UNLIKELY((J->flags & JIT_F_OPT_MASK) != JIT_F_OPT_DEFAULT)) {
+    lua_assert(((JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE|JIT_F_OPT_DSE) |
+		JIT_F_OPT_DEFAULT) == JIT_F_OPT_DEFAULT);
+    /* Folding disabled? Chain to CSE, but not for loads/stores/allocs. */
+    if (!(J->flags & JIT_F_OPT_FOLD) && irm_kind(lj_ir_mode[fins->o]) == IRM_N)
+      return lj_opt_cse(J);
+
+    /* Forwarding or CSE disabled? Emit raw IR for loads, except for SLOAD. */
+    if ((J->flags & (JIT_F_OPT_FWD|JIT_F_OPT_CSE)) !=
+		    (JIT_F_OPT_FWD|JIT_F_OPT_CSE) &&
+	irm_kind(lj_ir_mode[fins->o]) == IRM_L && fins->o != IR_SLOAD)
+      return lj_ir_emit(J);
+
+    /* DSE disabled? Emit raw IR for stores. */
+    if (!(J->flags & JIT_F_OPT_DSE) && irm_kind(lj_ir_mode[fins->o]) == IRM_S)
+      return lj_ir_emit(J);
+  }
+
+  /* Fold engine start/retry point. */
+retry:
+  /* Construct key from opcode and operand opcodes (unless literal/none). */
+  key = ((uint32_t)fins->o << 16);
+  if (fins->op1 >= J->cur.nk) {
+    key += (uint32_t)IR(fins->op1)->o << 8;
+    *fleft = *IR(fins->op1);  /* Snapshot operand so rules can inspect it. */
+  }
+  if (fins->op2 >= J->cur.nk) {
+    key += (uint32_t)IR(fins->op2)->o;
+    *fright = *IR(fins->op2);
+  } else {
+    key += (fins->op2 & 0xffu);  /* For IRFPM_* and IRFL_*. */
+  }
+
+  /* Check for a match in order from most specific to least specific. */
+  /* 'any' successively wildcards the left, right, then both operand bytes. */
+  any = 0;
+  for (;;) {
+    uint32_t k = key | any;
+    uint32_t h = fold_hashkey(k);
+    uint32_t fh = fold_hash[h];  /* Lookup key in semi-perfect hash table. */
+    if ((fh & 0xffffff) == k || (fh = fold_hash[h+1], (fh & 0xffffff) == k)) {
+      ref = (IRRef)tref_ref(fold_func[fh >> 24](J));
+      if (ref != NEXTFOLD)
+	break;
+    }
+    if (any == 0xffff)  /* Exhausted folding. Pass on to CSE. */
+      return lj_opt_cse(J);
+    any = (any | (any >> 8)) ^ 0xff00;
+  }
+
+  /* Return value processing, ordered by frequency. */
+  if (LJ_LIKELY(ref >= MAX_FOLD))
+    return TREF(ref, irt_t(IR(ref)->t));  /* A real ref: the fold result. */
+  if (ref == RETRYFOLD)
+    goto retry;  /* Rule rewrote fins in place; fold it again. */
+  if (ref == KINTFOLD)
+    return lj_ir_kint(J, fins->i);
+  if (ref == FAILFOLD)
+    lj_trace_err(J, LJ_TRERR_GFAIL);  /* Guard is known to always fail. */
+  lua_assert(ref == DROPFOLD);
+  return REF_DROP;
+}
+
+/* -- Common-Subexpression Elimination ------------------------------------ */
+
+/* CSE an IR instruction. This is very fast due to the skip-list chains.
+** Searches the per-opcode chain for an instruction with identical operands
+** and returns it; otherwise emits the instruction and links it in.
+*/
+TRef LJ_FASTCALL lj_opt_cse(jit_State *J)
+{
+  /* Avoid narrow to wide store-to-load forwarding stall */
+  IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
+  IROp op = fins->o;
+  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
+    /* Limited search for same operands in per-opcode chain. */
+    /* A match cannot precede its own operands, hence the lower limit. */
+    IRRef ref = J->chain[op];
+    IRRef lim = fins->op1;
+    if (fins->op2 > lim) lim = fins->op2;  /* Relies on lit < REF_BIAS. */
+    while (ref > lim) {
+      if (IR(ref)->op12 == op12)
+	return TREF(ref, irt_t(IR(ref)->t));  /* Common subexpression found. */
+      ref = IR(ref)->prev;
+    }
+  }
+  /* Otherwise emit IR (inlined for speed). */
+  {
+    IRRef ref = lj_ir_nextins(J);
+    IRIns *ir = IR(ref);
+    ir->prev = J->chain[op];  /* Link new instruction into the chain. */
+    ir->op12 = op12;
+    J->chain[op] = (IRRef1)ref;
+    ir->o = fins->o;
+    J->guardemit.irt |= fins->t.irt;
+    return TREF(ref, irt_t((ir->t = fins->t)));
+  }
+}
+
+/* ------------------------------------------------------------------------ */
+
+#undef IR
+#undef fins
+#undef fleft
+#undef fright
+#undef knumleft
+#undef knumright
+#undef emitir
+
+#endif

+ 358 - 0
src/lj_opt_loop.c

@@ -0,0 +1,358 @@
+/*
+** LOOP: Loop Optimizations.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_loop_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+#include "lj_snap.h"
+#include "lj_vm.h"
+
+/* Loop optimization:
+**
+** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
+** of a loop into invariant and variant instructions. The invariant
+** instructions are hoisted out of the loop and only the variant
+** instructions remain inside the loop body.
+**
+** Unfortunately LICM is mostly useless for compiling dynamic languages.
+** The IR has many guards and most of the subsequent instructions are
+** control-dependent on them. The first non-hoistable guard would
+** effectively prevent hoisting of all subsequent instructions.
+**
+** That's why we use a special form of unrolling using copy-substitution,
+** combined with redundancy elimination:
+**
+** The recorded instruction stream is re-emitted to the compiler pipeline
+** with substituted operands. The substitution table is filled with the
+** refs returned by re-emitting each instruction. This can be done
+** on-the-fly, because the IR is in strict SSA form, where every ref is
+** defined before its use.
+**
+** This approach generates two code sections, separated by the LOOP
+** instruction:
+**
+** 1. The recorded instructions form a kind of pre-roll for the loop. It
+** contains a mix of invariant and variant instructions and performs
+** exactly one loop iteration (but not necessarily the 1st iteration).
+**
+** 2. The loop body contains only the variant instructions and performs
+** all remaining loop iterations.
+**
+** On first sight that looks like a waste of space, because the variant
+** instructions are present twice. But the key insight is that the
+** pre-roll honors the control-dependencies for *both* the pre-roll itself
+** *and* the loop body!
+**
+** It also means one doesn't have to explicitly model control-dependencies
+** (which, BTW, wouldn't help LICM much). And it's much easier to
+** integrate sparse snapshotting with this approach.
+**
+** One of the nicest aspects of this approach is that all of the
+** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
+** reused with only minor restrictions (e.g. one should not fold
+** instructions across loop-carried dependencies).
+**
+** But in general all optimizations can be applied which only need to look
+** backwards into the generated instruction stream. At any point in time
+** during the copy-substitution process this contains both a static loop
+** iteration (the pre-roll) and a dynamic one (from the to-be-copied
+** instruction up to the end of the partial loop body).
+**
+** Since control-dependencies are implicitly kept, CSE also applies to all
+** kinds of guards. The major advantage is that all invariant guards can
+** be hoisted, too.
+**
+** Load/store forwarding works across loop iterations, too. This is
+** important if loop-carried dependencies are kept in upvalues or tables.
+** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
+** become a forwarded loop-recurrence after inlining.
+**
+** Since the IR is in SSA form, loop-carried dependencies have to be
+** modeled with PHI instructions. The potential candidates for PHIs are
+** collected on-the-fly during copy-substitution. After eliminating the
+** redundant ones, PHI instructions are emitted *below* the loop body.
+**
+** Note that this departure from traditional SSA form doesn't change the
+** semantics of the PHI instructions themselves. But it greatly simplifies
+** on-the-fly generation of the IR and the machine code.
+*/
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+
+/* Pass IR on to next optimization in chain (FOLD). */
+#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
+
+/* Emit raw IR without passing through optimizations. */
+#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
+
+/* -- PHI elimination ----------------------------------------------------- */
+
+/* Emit or eliminate collected PHIs.
+** subst: copy-substitution table mapping pre-roll refs to loop-body refs.
+** phi/nphi: candidate PHI refs collected during copy-substitution.
+*/
+static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi)
+{
+  int pass2 = 0;
+  IRRef i, nslots;
+  IRRef invar = J->chain[IR_LOOP];  /* LOOP separates pre-roll and body. */
+  /* Pass #1: mark redundant and potentially redundant PHIs. */
+  for (i = 0; i < nphi; i++) {
+    IRRef lref = phi[i];
+    IRRef rref = subst[lref];
+    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
+      irt_setmark(IR(lref)->t);
+    } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
+      /* Quick check for simple recurrences failed, need pass2. */
+      irt_setmark(IR(lref)->t);
+      pass2 = 1;
+    }
+  }
+  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
+  if (pass2) {
+    for (i = J->cur.nins-1; i > invar; i--) {
+      IRIns *ir = IR(i);
+      /* Any use inside the loop body keeps the PHI alive. */
+      if (!irref_isk(ir->op1)) irt_clearmark(IR(ir->op1)->t);
+      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
+    }
+  }
+  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
+  nslots = J->baseslot+J->maxslot;
+  for (i = 1; i < nslots; i++) {
+    IRRef ref = tref_ref(J->slot[i]);
+    if (!irref_isk(ref) && ref != subst[ref]) {
+      IRIns *ir = IR(ref);
+      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
+      if (!irt_isphi(ir->t) && !irt_ispri(ir->t)) {
+	irt_setphi(ir->t);
+	if (nphi >= LJ_MAX_PHI)
+	  lj_trace_err(J, LJ_TRERR_PHIOV);
+	phi[nphi++] = (IRRef1)ref;
+      }
+    }
+  }
+  /* Pass #4: emit PHI instructions or eliminate PHIs. */
+  for (i = 0; i < nphi; i++) {
+    IRRef lref = phi[i];
+    IRIns *ir = IR(lref);
+    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
+      IRRef rref = subst[lref];
+      if (rref > invar)
+	irt_setphi(IR(rref)->t);
+      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
+    } else {  /* Otherwise eliminate PHI. */
+      irt_clearmark(ir->t);
+      irt_clearphi(ir->t);
+    }
+  }
+}
+
+/* -- Loop unrolling using copy-substitution ------------------------------ */
+
+/* Unroll the recorded loop once via copy-substitution: re-emit every
+** recorded instruction and snapshot with substituted operands through the
+** FOLD/CSE pipeline, collecting PHI candidates for loop-carried refs.
+*/
+static void loop_unroll(jit_State *J)
+{
+  IRRef1 phi[LJ_MAX_PHI];
+  uint32_t nphi = 0;
+  IRRef1 *subst;
+  SnapShot *osnap, *snap;
+  IRRef2 *loopmap;
+  BCReg loopslots;
+  MSize nsnap, nsnapmap;
+  IRRef ins, invar, osnapref;
+
+  /* Use temp buffer for substitution table.
+  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
+  ** Note: don't call into the VM or run the GC or the buffer may be gone.
+  */
+  invar = J->cur.nins;
+  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
+				   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
+  subst[REF_BASE] = REF_BASE;  /* BASE substitutes to itself. */
+
+  /* LOOP separates the pre-roll from the loop body. */
+  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);
+
+  /* Ensure size for copy-substituted snapshots (minus #0 and loop snapshot). */
+  nsnap = J->cur.nsnap;
+  if (LJ_UNLIKELY(2*nsnap-2 > J->sizesnap)) {
+    MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
+    if (2*nsnap-2 > maxsnap)
+      lj_trace_err(J, LJ_TRERR_SNAPOV);
+    lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
+    J->cur.snap = J->snapbuf;
+  }
+  nsnapmap = J->cur.nsnapmap;  /* Use temp. copy to avoid undo. */
+  if (LJ_UNLIKELY(nsnapmap*2 > J->sizesnapmap)) {
+    J->snapmapbuf = (IRRef2 *)lj_mem_realloc(J->L, J->snapmapbuf,
+					     J->sizesnapmap*sizeof(IRRef2),
+					     2*J->sizesnapmap*sizeof(IRRef2));
+    J->cur.snapmap = J->snapmapbuf;
+    J->sizesnapmap *= 2;
+  }
+
+  /* The loop snapshot is used for fallback substitutions. */
+  snap = &J->cur.snap[nsnap-1];
+  loopmap = &J->cur.snapmap[snap->mapofs];
+  loopslots = snap->nslots;
+  /* The PC of snapshot #0 and the loop snapshot must match. */
+  lua_assert(loopmap[loopslots] == J->cur.snapmap[J->cur.snap[0].nslots]);
+
+  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
+  osnap = &J->cur.snap[1];
+  osnapref = osnap->ref;
+
+  /* Copy and substitute all recorded instructions and snapshots. */
+  for (ins = REF_FIRST; ins < invar; ins++) {
+    IRIns *ir;
+    IRRef op1, op2;
+
+    /* Copy-substitute snapshot. */
+    if (ins >= osnapref) {
+      IRRef2 *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
+      BCReg s, nslots;
+      uint32_t nmapofs, nframelinks;
+      if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
+	nmapofs = nsnapmap;
+	snap++;  /* Add new snapshot. */
+      } else {
+	nmapofs = snap->mapofs;  /* Overwrite previous snapshot. */
+      }
+      J->guardemit.irt = 0;  /* Restart guard tracking for next snapshot. */
+      nslots = osnap->nslots;
+      nframelinks = osnap->nframelinks;
+      snap->mapofs = (uint16_t)nmapofs;
+      snap->ref = (IRRef1)J->cur.nins;
+      snap->nslots = (uint8_t)nslots;
+      snap->nframelinks = (uint8_t)nframelinks;
+      snap->count = 0;
+      osnap++;
+      osnapref = osnap->ref;
+      nsnapmap = nmapofs + nslots + nframelinks;
+      nmap = &J->cur.snapmap[nmapofs];
+      /* Substitute snapshot slots. */
+      for (s = 0; s < nslots; s++) {
+	IRRef ref = snap_ref(omap[s]);
+	if (ref) {
+	  if (!irref_isk(ref))
+	    ref = subst[ref];
+	} else if (s < loopslots) {
+	  ref = loopmap[s];  /* Fallback to loop snapshot slot. */
+	}
+	nmap[s] = ref;
+      }
+      /* Copy frame links. */
+      nmap += nslots;
+      omap += nslots;
+      for (s = 0; s < nframelinks; s++)
+	nmap[s] = omap[s];
+    }
+
+    /* Substitute instruction operands. */
+    ir = IR(ins);
+    op1 = ir->op1;
+    if (!irref_isk(op1)) op1 = subst[op1];
+    op2 = ir->op2;
+    if (!irref_isk(op2)) op2 = subst[op2];
+    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
+	op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
+      subst[ins] = (IRRef1)ins;  /* Shortcut. */
+    } else {
+      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
+      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
+      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
+      subst[ins] = (IRRef1)ref;
+      if (ref != ins && ref < invar) {  /* Loop-carried dependency? */
+	IRIns *irr = IR(ref);
+	/* Potential PHI? */
+	if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
+	  irt_setphi(irr->t);
+	  if (nphi >= LJ_MAX_PHI)
+	    lj_trace_err(J, LJ_TRERR_PHIOV);
+	  phi[nphi++] = (IRRef1)ref;
+	}
+	/* Check all loop-carried dependencies for type instability. */
+	if (!irt_sametype(t, irr->t)) {
+	  if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num case. */
+	    subst[ins] = tref_ref(emitir(IRTN(IR_TONUM), ref, 0));
+	  else
+	    lj_trace_err(J, LJ_TRERR_TYPEINS);
+	}
+      }
+    }
+  }
+  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
+    J->cur.nsnapmap = (uint16_t)nsnapmap;
+    snap++;
+  } else {
+    J->cur.nsnapmap = (uint16_t)snap->mapofs;  /* Last snapshot is redundant. */
+  }
+  J->cur.nsnap = (uint16_t)(snap - J->cur.snap);
+  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
+
+  loop_emit_phi(J, subst, phi, nphi);
+}
+
+/* Undo any partial changes made by the loop optimization.
+** Rolls the IR back to length 'ins' and clears the PHI/mark flags that
+** were set speculatively on the surviving pre-roll instructions.
+*/
+static void loop_undo(jit_State *J, IRRef ins)
+{
+  lj_ir_rollback(J, ins);
+  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
+    IRIns *ir = IR(ins);
+    irt_clearphi(ir->t);
+    irt_clearmark(ir->t);
+  }
+}
+
+/* Protected callback for loop optimization.
+** Runs loop_unroll() under lj_vm_cpcall so trace errors unwind cleanly.
+*/
+static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
+{
+  UNUSED(L); UNUSED(dummy);
+  loop_unroll((jit_State *)ud);
+  return NULL;  /* No result; errors are propagated via the cpcall frame. */
+}
+
+/* Loop optimization entry point.
+** Returns 0 if the loop optimization succeeded and 1 if it failed in a
+** recoverable way (caller should continue recording to unroll further).
+** All other errors are rethrown.
+*/
+int lj_opt_loop(jit_State *J)
+{
+  IRRef nins = J->cur.nins;  /* Remember IR length for a possible undo. */
+  int errcode = lj_vm_cpcall(J->L, cploop_opt, NULL, J);
+  if (LJ_UNLIKELY(errcode)) {
+    lua_State *L = J->L;
+    if (errcode == LUA_ERRRUN && tvisnum(L->top-1)) {  /* Trace error? */
+      int32_t e = lj_num2int(numV(L->top-1));
+      switch ((TraceError)e) {
+      case LJ_TRERR_TYPEINS:  /* Type instability. */
+      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
+	/* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
+	if (--J->instunroll < 0)  /* But do not unroll forever. */
+	  break;
+	L->top--;  /* Remove error object. */
+	J->guardemit.irt = 0;
+	loop_undo(J, nins);
+	return 1;  /* Loop optimization failed, continue recording. */
+      default:
+	break;
+      }
+    }
+    lj_err_throw(L, errcode);  /* Propagate all other errors. */
+  }
+  return 0;  /* Loop optimization is ok. */
+}
+
+#undef IR
+#undef emitir
+#undef emitir_raw
+
+#endif

+ 550 - 0
src/lj_opt_mem.c

@@ -0,0 +1,550 @@
+/*
+** Memory access optimizations.
+** AA: Alias Analysis using high-level semantic disambiguation.
+** FWD: Load Forwarding (L2L) + Store Forwarding (S2L).
+** DSE: Dead-Store Elimination.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_mem_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_tab.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+#define fins		(&J->fold.ins)
+
+/*
+** Caveat #1: return value is not always a TRef -- only use with tref_ref().
+** Caveat #2: FWD relies on active CSE for xREF operands -- see lj_opt_fold().
+*/
+
+/* Return values from alias analysis. */
+typedef enum {
+  ALIAS_NO,	/* The two refs CANNOT alias (exact). */
+  ALIAS_MAY,	/* The two refs MAY alias (inexact). */
+  ALIAS_MUST	/* The two refs MUST alias (exact). */
+} AliasRet;
+
+/* -- ALOAD/HLOAD forwarding and ASTORE/HSTORE elimination ---------------- */
+
+/* Alias analysis for array and hash access using key-based disambiguation.
+** refa/refb: the two xREF instructions (AREF/HREF/HREFK/NEWREF) to compare.
+** Returns ALIAS_NO/ALIAS_MAY/ALIAS_MUST (see AliasRet above).
+*/
+static AliasRet aa_ahref(jit_State *J, IRIns *refa, IRIns *refb)
+{
+  IRRef ka = refa->op2;
+  IRRef kb = refb->op2;
+  IRIns *keya, *keyb;
+  if (refa == refb)
+    return ALIAS_MUST;  /* Shortcut for same refs. */
+  /* Strip KSLOT wrappers to get at the underlying keys. */
+  keya = IR(ka);
+  if (keya->o == IR_KSLOT) { ka = keya->op1; keya = IR(ka); }
+  keyb = IR(kb);
+  if (keyb->o == IR_KSLOT) { kb = keyb->op1; keyb = IR(kb); }
+  if (ka == kb) {
+    /* Same key. Check for same table with different ref (NEWREF vs. HREF). */
+    IRIns *ta = refa;
+    IRIns *tb = refb;
+    if (ta->o == IR_HREFK || ta->o == IR_AREF) ta = IR(ta->op1);
+    if (tb->o == IR_HREFK || tb->o == IR_AREF) tb = IR(tb->op1);
+    if (ta->op1 == tb->op1)
+      return ALIAS_MUST;  /* Same key, same table. */
+    else
+      return ALIAS_MAY;  /* Same key, possibly different table. */
+  }
+  if (irref_isk(ka) && irref_isk(kb))
+    return ALIAS_NO;  /* Different constant keys. */
+  if (refa->o == IR_AREF) {
+    /* Disambiguate array references based on index arithmetic. */
+    lua_assert(refb->o == IR_AREF);
+    if (refa->op1 == refb->op1) {
+      /* Same table, different non-const array keys. */
+      int32_t ofsa = 0, ofsb = 0;
+      IRRef basea = ka, baseb = kb;
+      /* Gather base and offset from t[base] or t[base+-ofs]. */
+      if (keya->o == IR_ADD && irref_isk(keya->op2)) {
+	basea = keya->op1;
+	ofsa = IR(keya->op2)->i;
+	if (basea == kb && ofsa != 0)
+	  return ALIAS_NO;  /* t[base+-ofs] vs. t[base]. */
+      }
+      if (keyb->o == IR_ADD && irref_isk(keyb->op2)) {
+	baseb = keyb->op1;
+	ofsb = IR(keyb->op2)->i;
+	if (ka == baseb && ofsb != 0)
+	  return ALIAS_NO;  /* t[base] vs. t[base+-ofs]. */
+      }
+      if (basea == baseb && ofsa != ofsb)
+	return ALIAS_NO;  /* t[base+-o1] vs. t[base+-o2] and o1 != o2. */
+    }
+  } else {
+    /* Disambiguate hash references based on the type of their keys. */
+    lua_assert((refa->o==IR_HREF || refa->o==IR_HREFK || refa->o==IR_NEWREF) &&
+	       (refb->o==IR_HREF || refb->o==IR_HREFK || refb->o==IR_NEWREF));
+    if (!irt_sametype(keya->t, keyb->t))
+      return ALIAS_NO;  /* Different key types. */
+  }
+  return ALIAS_MAY;  /* Anything else: we just don't know. */
+}
+
+/* Array and hash load forwarding.
+** xref: the xREF (AREF/HREF/HREFK) the pending load reads through.
+** Returns a forwarded ref, a constant, or 0 on conflict/no match.
+** Caveat: return value is not always a TRef -- only use with tref_ref().
+*/
+static TRef fwd_ahload(jit_State *J, IRRef xref)
+{
+  IRIns *xr = IR(xref);
+  IRRef lim = xref;  /* Search limit. */
+  IRRef ref;
+
+  /* Search for conflicting stores. */
+  ref = J->chain[fins->o+IRDELTA_L2S];  /* Matching store chain for load op. */
+  while (ref > xref) {
+    IRIns *store = IR(ref);
+    switch (aa_ahref(J, xr, IR(store->op1))) {
+    case ALIAS_NO:   break;  /* Continue searching. */
+    case ALIAS_MAY:  lim = ref; goto conflict;  /* Limit search for load. */
+    case ALIAS_MUST: return store->op2;  /* Store forwarding. */
+    }
+    ref = store->prev;
+  }
+
+  /* No conflicting store (yet): const-fold loads from allocations. */
+  {
+    IRIns *ir = (xr->o == IR_HREFK || xr->o == IR_AREF) ? IR(xr->op1) : xr;
+    IRRef tab = ir->op1;
+    ir = IR(tab);
+    if (ir->o == IR_TNEW || (ir->o == IR_TDUP && irref_isk(xr->op2))) {
+      /* A NEWREF with a number key may end up pointing to the array part.
+      ** But it's referenced from HSTORE and not found in the ASTORE chain.
+      ** For now simply consider this a conflict without forwarding anything.
+      */
+      if (xr->o == IR_AREF) {
+	IRRef ref2 = J->chain[IR_NEWREF];
+	while (ref2 > tab) {
+	  IRIns *newref = IR(ref2);
+	  if (irt_isnum(IR(newref->op2)->t))
+	    goto conflict;
+	  ref2 = newref->prev;
+	}
+      }
+      /* NEWREF inhibits CSE for HREF, and dependent FLOADs from HREFK/AREF.
+      ** But the above search for conflicting stores was limited by xref.
+      ** So continue searching, limited by the TNEW/TDUP. Store forwarding
+      ** is ok, too. A conflict does NOT limit the search for a matching load.
+      */
+      while (ref > tab) {
+	IRIns *store = IR(ref);
+	switch (aa_ahref(J, xr, IR(store->op1))) {
+	case ALIAS_NO:   break;  /* Continue searching. */
+	case ALIAS_MAY:  goto conflict;  /* Conflicting store. */
+	case ALIAS_MUST: return store->op2;  /* Store forwarding. */
+	}
+	ref = store->prev;
+      }
+      lua_assert(ir->o != IR_TNEW || irt_isnil(fins->t));
+      if (irt_ispri(fins->t)) {
+	return TREF_PRI(irt_type(fins->t));
+      } else if (irt_isnum(fins->t) || irt_isstr(fins->t)) {
+	/* Load from a duplicated template table: look the value up now. */
+	TValue keyv;
+	cTValue *tv;
+	IRIns *key = IR(xr->op2);
+	if (key->o == IR_KSLOT) key = IR(key->op1);
+	lj_ir_kvalue(J->L, &keyv, key);
+	tv = lj_tab_get(J->L, ir_ktab(IR(ir->op1)), &keyv);
+	lua_assert(itype2irt(tv) == irt_type(fins->t));
+	if (irt_isnum(fins->t))
+	  return lj_ir_knum_nn(J, tv->u64);
+	else
+	  return lj_ir_kstr(J, strV(tv));
+      }
+      /* Otherwise: don't intern as a constant. */
+    }
+  }
+
+conflict:
+  /* Try to find a matching load. Below the conflicting store, if any. */
+  ref = J->chain[fins->o];
+  while (ref > lim) {
+    IRIns *load = IR(ref);
+    if (load->op1 == xref)
+      return ref;  /* Load forwarding. */
+    ref = load->prev;
+  }
+  return 0;  /* Conflict or no match. */
+}
+
+/* Reassociate ALOAD across PHIs to handle t[i-1] forwarding case.
+** Recognizes keys of the form (base+k)+(-k), which cancel to 'base', and
+** retries forwarding from an existing AREF of the same array with key
+** 'base'. Returns 0 if the pattern doesn't match.
+*/
+static TRef fwd_aload_reassoc(jit_State *J)
+{
+  IRIns *irx = IR(fins->op1);
+  IRIns *key = IR(irx->op2);
+  if (key->o == IR_ADD && irref_isk(key->op2)) {
+    IRIns *add2 = IR(key->op1);
+    if (add2->o == IR_ADD && irref_isk(add2->op2) &&
+	IR(key->op2)->i == -IR(add2->op2)->i) {  /* Offsets cancel out. */
+      IRRef ref = J->chain[IR_AREF];
+      IRRef lim = add2->op1;
+      if (irx->op1 > lim) lim = irx->op1;
+      while (ref > lim) {
+	IRIns *ir = IR(ref);
+	if (ir->op1 == irx->op1 && ir->op2 == add2->op1)
+	  return fwd_ahload(J, ref);  /* Forward from the equivalent AREF. */
+	ref = ir->prev;
+      }
+    }
+  }
+  return 0;
+}
+
+/* ALOAD forwarding.
+** Tries plain forwarding first, then the reassociated t[i-1] case.
+** Falls back to EMITFOLD (emit the load) if nothing can be forwarded.
+*/
+TRef LJ_FASTCALL lj_opt_fwd_aload(jit_State *J)
+{
+  IRRef ref;
+  if ((ref = fwd_ahload(J, fins->op1)) ||
+      (ref = fwd_aload_reassoc(J)))
+    return ref;
+  return EMITFOLD;
+}
+
+/* HLOAD forwarding.
+** Falls back to EMITFOLD (emit the load) if nothing can be forwarded.
+*/
+TRef LJ_FASTCALL lj_opt_fwd_hload(jit_State *J)
+{
+  IRRef ref = fwd_ahload(J, fins->op1);
+  if (ref)
+    return ref;
+  return EMITFOLD;
+}
+
+/* ASTORE/HSTORE elimination (dead-store elimination). */
+TRef LJ_FASTCALL lj_opt_dse_ahstore(jit_State *J)
+{
+  IRRef xref = fins->op1;  /* xREF reference. */
+  IRRef val = fins->op2;  /* Stored value reference. */
+  IRIns *xr = IR(xref);
+  IRRef1 *refp = &J->chain[fins->o];  /* Link to patch when unchaining. */
+  IRRef ref = *refp;
+  while (ref > xref) {  /* Search for redundant or conflicting stores. */
+    IRIns *store = IR(ref);
+    switch (aa_ahref(J, xr, IR(store->op1))) {
+    case ALIAS_NO:
+      break;  /* Continue searching. */
+    case ALIAS_MAY:	/* Store to MAYBE the same location. */
+      if (store->op2 != val)  /* Conflict if the value is different. */
+	goto doemit;
+      break;  /* Otherwise continue searching. */
+    case ALIAS_MUST:	/* Store to the same location. */
+      if (store->op2 == val)  /* Same value: drop the new store. */
+	return DROPFOLD;
+      /* Different value: try to eliminate the redundant store. */
+      if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
+	IRIns *ir;
+	/* Check for any intervening guards (includes conflicting loads). */
+	for (ir = IR(J->cur.nins-1); ir > store; ir--)
+	  if (irt_isguard(ir->t))
+	    goto doemit;  /* No elimination possible. */
+	/* Remove redundant store from chain and replace with NOP. */
+	*refp = store->prev;
+	store->o = IR_NOP;  /* Unchained NOP -- does anybody care? */
+	store->t.irt = IRT_NIL;
+	store->op1 = store->op2 = 0;
+	store->prev = 0;
+	/* Now emit the new store instead. */
+      }
+      goto doemit;
+    }
+    ref = *(refp = &store->prev);
+  }
+doemit:
+  return EMITFOLD;  /* Otherwise we have a conflict or simply no match. */
+}
+
+/* -- ULOAD forwarding ---------------------------------------------------- */
+
+/* The current alias analysis for upvalues is very simplistic. It only
+** disambiguates between the unique upvalues of the same function.
+** This is good enough for now, since most upvalues are read-only.
+**
+** A more precise analysis would be feasible with the help of the parser:
+** generate a unique key for every upvalue, even across all prototypes.
+** Lacking a realistic use-case, it's unclear whether this is beneficial.
+*/
+/* Alias analysis for two UREFx instructions (see rationale above). */
+static AliasRet aa_uref(IRIns *refa, IRIns *refb)
+{
+  if (refa->o != refb->o)
+    return ALIAS_NO;  /* Different UREFx type. */
+  if (refa->op1 != refb->op1)
+    return ALIAS_MAY;  /* Different function. */
+  else if (refa->op2 == refb->op2)
+    return ALIAS_MUST;  /* Same function, same upvalue idx. */
+  else
+    return ALIAS_NO;  /* Same function, different upvalue idx. */
+}
+
+/* ULOAD forwarding.
+** Forwards a stored/loaded upvalue value or falls back to EMITFOLD.
+*/
+TRef LJ_FASTCALL lj_opt_fwd_uload(jit_State *J)
+{
+  IRRef uref = fins->op1;
+  IRRef lim = uref;  /* Search limit. */
+  IRIns *xr = IR(uref);
+  IRRef ref;
+
+  /* Search for conflicting stores. */
+  ref = J->chain[IR_USTORE];
+  while (ref > uref) {
+    IRIns *store = IR(ref);
+    switch (aa_uref(xr, IR(store->op1))) {
+    case ALIAS_NO:   break;  /* Continue searching. */
+    case ALIAS_MAY:  lim = ref; goto conflict;  /* Limit search for load. */
+    case ALIAS_MUST: return store->op2;  /* Store forwarding. */
+    }
+    ref = store->prev;
+  }
+
+conflict:
+  /* Try to find a matching load. Below the conflicting store, if any. */
+  ref = J->chain[IR_ULOAD];
+  while (ref > lim) {
+    IRIns *load = IR(ref);
+    if (load->op1 == uref)
+      return ref;  /* Load forwarding. */
+    ref = load->prev;
+  }
+  return EMITFOLD;  /* Conflict or no match. */
+}
+
+/* USTORE elimination (dead-store elimination for upvalue stores). */
+TRef LJ_FASTCALL lj_opt_dse_ustore(jit_State *J)
+{
+  IRRef xref = fins->op1;  /* xREF reference. */
+  IRRef val = fins->op2;  /* Stored value reference. */
+  IRIns *xr = IR(xref);
+  IRRef1 *refp = &J->chain[IR_USTORE];  /* Link to patch when unchaining. */
+  IRRef ref = *refp;
+  while (ref > xref) {  /* Search for redundant or conflicting stores. */
+    IRIns *store = IR(ref);
+    switch (aa_uref(xr, IR(store->op1))) {
+    case ALIAS_NO:
+      break;  /* Continue searching. */
+    case ALIAS_MAY:	/* Store to MAYBE the same location. */
+      if (store->op2 != val)  /* Conflict if the value is different. */
+	goto doemit;
+      break;  /* Otherwise continue searching. */
+    case ALIAS_MUST:	/* Store to the same location. */
+      if (store->op2 == val)  /* Same value: drop the new store. */
+	return DROPFOLD;
+      /* Different value: try to eliminate the redundant store. */
+      if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
+	IRIns *ir;
+	/* Check for any intervening guards (includes conflicting loads). */
+	for (ir = IR(J->cur.nins-1); ir > store; ir--)
+	  if (irt_isguard(ir->t))
+	    goto doemit;  /* No elimination possible. */
+	/* Remove redundant store from chain and replace with NOP. */
+	*refp = store->prev;
+	store->o = IR_NOP;  /* Unchained NOP -- does anybody care? */
+	store->t.irt = IRT_NIL;
+	store->op1 = store->op2 = 0;
+	store->prev = 0;
+	/* Now emit the new store instead. */
+      }
+      goto doemit;
+    }
+    ref = *(refp = &store->prev);
+  }
+doemit:
+  return EMITFOLD;  /* Otherwise we have a conflict or simply no match. */
+}
+
+/* -- FLOAD forwarding and FSTORE elimination ----------------------------- */
+
+/* Alias analysis for field access.
+** Field loads are cheap and field stores are rare.
+** Simple disambiguation based on field types is good enough.
+** op1 = object ref, op2 = field ID.
+*/
+static AliasRet aa_fref(IRIns *refa, IRIns *refb)
+{
+  if (refa->op2 != refb->op2)
+    return ALIAS_NO;  /* Different fields. */
+  if (refa->op1 == refb->op1)
+    return ALIAS_MUST;  /* Same field, same object. */
+  else
+    return ALIAS_MAY;  /* Same field, possibly different object. */
+}
+
+/* Only the loads for mutable fields end up here (see FOLD).
+** Forwards a stored/loaded field value or falls back to EMITFOLD.
+*/
+TRef LJ_FASTCALL lj_opt_fwd_fload(jit_State *J)
+{
+  IRRef oref = fins->op1;  /* Object reference. */
+  IRRef fid = fins->op2;  /* Field ID. */
+  IRRef lim = oref;  /* Search limit. */
+  IRRef ref;
+
+  /* Search for conflicting stores. */
+  ref = J->chain[IR_FSTORE];
+  while (ref > oref) {
+    IRIns *store = IR(ref);
+    switch (aa_fref(fins, IR(store->op1))) {
+    case ALIAS_NO:   break;  /* Continue searching. */
+    case ALIAS_MAY:  lim = ref; goto conflict;  /* Limit search for load. */
+    case ALIAS_MUST: return store->op2;  /* Store forwarding. */
+    }
+    ref = store->prev;
+  }
+
+  /* No conflicting store: const-fold field loads from allocations. */
+  if (fid == IRFL_TAB_META) {
+    IRIns *ir = IR(oref);
+    if (ir->o == IR_TNEW || ir->o == IR_TDUP)
+      return lj_ir_knull(J, IRT_TAB);  /* Fresh tables have no metatable. */
+  }
+
+conflict:
+  /* Try to find a matching load. Below the conflicting store, if any. */
+  ref = J->chain[IR_FLOAD];
+  while (ref > lim) {
+    IRIns *load = IR(ref);
+    if (load->op1 == oref && load->op2 == fid)
+      return ref;  /* Load forwarding. */
+    ref = load->prev;
+  }
+  return EMITFOLD;  /* Otherwise we have a conflict or simply no match. */
+}
+
+/* FSTORE elimination (dead-store elimination for field stores). */
+TRef LJ_FASTCALL lj_opt_dse_fstore(jit_State *J)
+{
+  IRRef fref = fins->op1;  /* FREF reference. */
+  IRRef val = fins->op2;  /* Stored value reference. */
+  IRIns *xr = IR(fref);
+  IRRef1 *refp = &J->chain[IR_FSTORE];  /* Link to patch when unchaining. */
+  IRRef ref = *refp;
+  while (ref > fref) {  /* Search for redundant or conflicting stores. */
+    IRIns *store = IR(ref);
+    switch (aa_fref(xr, IR(store->op1))) {
+    case ALIAS_NO:
+      break;  /* Continue searching. */
+    case ALIAS_MAY:
+      if (store->op2 != val)  /* Conflict if the value is different. */
+	goto doemit;
+      break;  /* Otherwise continue searching. */
+    case ALIAS_MUST:
+      if (store->op2 == val)  /* Same value: drop the new store. */
+	return DROPFOLD;
+      /* Different value: try to eliminate the redundant store. */
+      if (ref > J->chain[IR_LOOP]) {  /* Quick check to avoid crossing LOOP. */
+	IRIns *ir;
+	/* Check for any intervening guards or conflicting loads. */
+	for (ir = IR(J->cur.nins-1); ir > store; ir--)
+	  if (irt_isguard(ir->t) || (ir->o == IR_FLOAD && ir->op2 == xr->op2))
+	    goto doemit;  /* No elimination possible. */
+	/* Remove redundant store from chain and replace with NOP. */
+	*refp = store->prev;
+	store->o = IR_NOP;  /* Unchained NOP -- does anybody care? */
+	store->t.irt = IRT_NIL;
+	store->op1 = store->op2 = 0;
+	store->prev = 0;
+	/* Now emit the new store instead. */
+      }
+      goto doemit;
+    }
+    ref = *(refp = &store->prev);
+  }
+doemit:
+  return EMITFOLD;  /* Otherwise we have a conflict or simply no match. */
+}
+
+/* -- TLEN forwarding ----------------------------------------------------- */
+
+/* TLEN forwarding. This is rather simplistic right now, but better than
+** nothing. Any ASTORE and any HSTORE with a numeric key may change the
+** table length, so both limit the search for a previous TLEN.
+*/
+TRef LJ_FASTCALL lj_opt_fwd_tlen(jit_State *J)
+{
+  IRRef tab = fins->op1;  /* Table reference. */
+  IRRef lim = tab;  /* Search limit. */
+  IRRef ref;
+
+  /* Any ASTORE is a conflict and limits the search. */
+  if (J->chain[IR_ASTORE] > lim) lim = J->chain[IR_ASTORE];
+
+  /* Search for conflicting HSTORE with numeric key. */
+  ref = J->chain[IR_HSTORE];
+  while (ref > lim) {
+    IRIns *store = IR(ref);
+    IRIns *href = IR(store->op1);
+    IRIns *key = IR(href->op2);
+    if (irt_isnum(key->o == IR_KSLOT ? IR(key->op1)->t : key->t)) {
+      lim = ref;  /* Conflicting store found, limits search for TLEN. */
+      break;
+    }
+    ref = store->prev;
+  }
+
+  /* Try to find a matching load. Below the conflicting store, if any. */
+  ref = J->chain[IR_TLEN];
+  while (ref > lim) {
+    IRIns *tlen = IR(ref);
+    if (tlen->op1 == tab)
+      return ref;  /* Load forwarding. */
+    ref = tlen->prev;
+  }
+  return EMITFOLD;  /* Otherwise we have a conflict or simply no match. */
+}
+
+/* -- ASTORE/HSTORE previous type analysis -------------------------------- */
+
+/* Check whether the previous value for a table store is non-nil.
+** This can be derived either from a previous store or from a previous
+** load (because all loads from tables perform a type check).
+**
+** The result of the analysis can be used to avoid the metatable check
+** and the guard against HREF returning niltv. Both of these are cheap,
+** so let's not spend too much effort on the analysis.
+**
+** A result of 1 is exact: previous value CANNOT be nil.
+** A result of 0 is inexact: previous value MAY be nil.
+*/
+/* See the analysis description above.
+** loadop: the xLOAD opcode; loadop+IRDELTA_L2S is the matching store chain.
+** xref: the xREF the pending store writes through.
+*/
+int lj_opt_fwd_wasnonnil(jit_State *J, IROpT loadop, IRRef xref)
+{
+  /* First check stores. */
+  IRRef ref = J->chain[loadop+IRDELTA_L2S];
+  while (ref > xref) {
+    IRIns *store = IR(ref);
+    if (store->op1 == xref) {  /* Same xREF. */
+      /* A nil store MAY alias, but a non-nil store MUST alias. */
+      return !irt_isnil(store->t);
+    } else if (irt_isnil(store->t)) {  /* Must check any nil store. */
+      IRRef skref = IR(store->op1)->op2;
+      IRRef xkref = IR(xref)->op2;
+      /* Same key type MAY alias. */
+      if (irt_sametype(IR(skref)->t, IR(xkref)->t)) {
+	if (skref == xkref || !irref_isk(skref) || !irref_isk(xkref))
+	  return 0;  /* A nil store with same const key or var key MAY alias. */
+	/* Different const keys CANNOT alias. */
+      }  /* Different key types CANNOT alias. */
+    }  /* Other non-nil stores MAY alias. */
+    ref = store->prev;
+  }
+
+  /* Check loads since nothing could be derived from stores. */
+  ref = J->chain[loadop];
+  while (ref > xref) {
+    IRIns *load = IR(ref);
+    if (load->op1 == xref) {  /* Same xREF. */
+      /* A nil load MAY alias, but a non-nil load MUST alias. */
+      return !irt_isnil(load->t);
+    }  /* Other non-nil loads MAY alias. */
+    ref = load->prev;
+  }
+  return 0;  /* Nothing derived at all, previous value MAY be nil. */
+}
+
+/* ------------------------------------------------------------------------ */
+
+#undef IR
+#undef fins
+
+#endif

+ 430 - 0
src/lj_opt_narrow.c

@@ -0,0 +1,430 @@
+/*
+** NARROW: Narrowing of numbers to integers (double to int32_t).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_opt_narrow_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_str.h"
+#include "lj_bc.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+
+/* Rationale for narrowing optimizations:
+**
+** Lua has only a single number type and this is a FP double by default.
+** Narrowing doubles to integers does not pay off for the interpreter on a
+** current-generation x86/x64 machine. Most FP operations need the same
+** amount of execution resources as their integer counterparts, except
+** with slightly longer latencies. Longer latencies are a non-issue for
+** the interpreter, since they are usually hidden by other overhead.
+**
+** The total CPU execution bandwidth is the sum of the bandwidth of the FP
+** and the integer units, because they execute in parallel. The FP units
+** have an equal or higher bandwidth than the integer units. Not using
+** them means losing execution bandwidth. Moving work away from them to
+** the already quite busy integer units is a losing proposition.
+**
+** The situation for JIT-compiled code is a bit different: the higher code
+** density makes the extra latencies much more visible. Tight loops expose
+** the latencies for updating the induction variables. Array indexing
+** requires narrowing conversions with high latencies and additional
+** guards (to check that the index is really an integer). And many common
+** optimizations only work on integers.
+**
+** One solution would be speculative, eager narrowing of all number loads.
+** This causes many problems, like losing -0 or the need to resolve type
+** mismatches between traces. It also effectively forces the integer type
+** to have overflow-checking semantics. This impedes many basic
+** optimizations and requires adding overflow checks to all integer
+** arithmetic operations (whereas FP arithmetics can do without).
+**
+** Always replacing an FP op with an integer op plus an overflow check is
+** counter-productive on a current-generation super-scalar CPU. Although
+** the overflow check branches are highly predictable, they will clog the
+** execution port for the branch unit and tie up reorder buffers. This is
+** turning a pure data-flow dependency into a different data-flow
+** dependency (with slightly lower latency) *plus* a control dependency.
+** In general, you don't want to do this since latencies due to data-flow
+** dependencies can be well hidden by out-of-order execution.
+**
+** A better solution is to keep all numbers as FP values and only narrow
+** when it's beneficial to do so. LuaJIT uses predictive narrowing for
+** induction variables and demand-driven narrowing for index expressions
+** and bit operations. Additionally it can eliminate or hoists most of the
+** resulting overflow checks. Regular arithmetic computations are never
+** narrowed to integers.
+**
+** The integer type in the IR has convenient wrap-around semantics and
+** ignores overflow. Extra operations have been added for
+** overflow-checking arithmetic (ADDOV/SUBOV) instead of an extra type.
+** Apart from reducing overall complexity of the compiler, this also
+** nicely solves the problem where you want to apply algebraic
+** simplifications to ADD, but not to ADDOV. And the assembler can use lea
+** instead of an add for integer ADD, but not for ADDOV (lea does not
+** affect the flags, but it helps to avoid register moves).
+**
+** Note that all of the above has to be reconsidered if LuaJIT is to be
+** ported to architectures with slow FP operations or with no hardware FPU
+** at all. In the latter case an integer-only port may be the best overall
+** solution (if this still meets user demands).
+*/
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)			(&J->cur.ir[(ref)])
+#define fins			(&J->fold.ins)
+
+/* Pass IR on to next optimization in chain (FOLD). */
+#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
+
+#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
+
+/* -- Elimination of narrowing type conversions --------------------------- */
+
+/* Narrowing of index expressions and bit operations is demand-driven. The
+** trace recorder emits a narrowing type conversion (TOINT or TOBIT) in
+** all of these cases (e.g. array indexing or string indexing). FOLD
+** already takes care of eliminating simple redundant conversions like
+** TOINT(TONUM(x)) ==> x.
+**
+** But the surrounding code is FP-heavy and all arithmetic operations are
+** performed on FP numbers. Consider a common example such as 'x=t[i+1]',
+** with 'i' already an integer (due to induction variable narrowing). The
+** index expression would be recorded as TOINT(ADD(TONUM(i), 1)), which is
+** clearly suboptimal.
+**
+** One can do better by recursively backpropagating the narrowing type
+** conversion across FP arithmetic operations. This turns FP ops into
+** their corresponding integer counterparts. Depending on the semantics of
+** the conversion they also need to check for overflow. Currently only ADD
+** and SUB are supported.
+**
+** The above example can be rewritten as ADDOV(TOINT(TONUM(i)), 1) and
+** then into ADDOV(i, 1) after folding of the conversions. The original FP
+** ops remain in the IR and are eliminated by DCE since all references to
+** them are gone.
+**
+** Special care has to be taken to avoid narrowing across an operation
+** which is potentially operating on non-integral operands. One obvious
+** case is when an expression contains a non-integral constant, but ends
+** up as an integer index at runtime (like t[x+1.5] with x=0.5).
+**
+** Operations with two non-constant operands illustrate a similar problem
+** (like t[a+b] with a=1.5 and b=2.5). Backpropagation has to stop there,
+** unless it can be proven that either operand is integral (e.g. by CSEing
+** a previous conversion). As a not-so-obvious corollary this logic also
+** applies for a whole expression tree (e.g. t[(a+1)+(b+1)]).
+**
+** Correctness of the transformation is guaranteed by avoiding to expand
+** the tree by adding more conversions than the one we would need to emit
+** if not backpropagating. TOBIT employs a more optimistic rule, because
+** the conversion has special semantics, designed to make the life of the
+** compiler writer easier. ;-)
+**
+** Using on-the-fly backpropagation of an expression tree doesn't work
+** because it's unknown whether the transform is correct until the end.
+** This either requires IR rollback and cache invalidation for every
+** subtree or a two-pass algorithm. The former didn't work out too well,
+** so the code now combines a recursive collector with a stack-based
+** emitter.
+**
+** [A recursive backpropagation algorithm with backtracking, employing
+** skip-list lookup and round-robin caching, emitting stack operations
+** on-the-fly for a stack-based interpreter -- and all of that in a meager
+** kilobyte? Yep, compilers are a great treasure chest. Throw away your
+** textbooks and read the codebase of a compiler today!]
+**
+** There's another optimization opportunity for array indexing: it's
+** always accompanied by an array bounds-check. The outermost overflow
+** check may be delegated to the ABC operation. This works because ABC is
+** an unsigned comparison and wrap-around due to overflow creates negative
+** numbers.
+**
+** But this optimization is only valid for constants that cannot overflow
+** an int32_t into the range of valid array indexes [0..2^27+1). A check
+** for +-2^30 is safe since -2^31 - 2^30 wraps to 2^30 and 2^31-1 + 2^30
+** wraps to -2^30-1.
+**
+** It's also good enough in practice, since e.g. t[i+1] or t[i-10] are
+** quite common. So the above example finally ends up as ADD(i, 1)!
+**
+** Later on, the assembler is able to fuse the whole array reference and
+** the ADD into the memory operands of loads and other instructions. This
+** is why LuaJIT is able to generate very pretty (and fast) machine code
+** for array indexing. And that, my dear, concludes another story about
+** one of the hidden secrets of LuaJIT ...
+*/
+
+/* Maximum backpropagation depth and maximum stack size (incl. redzone). */
+#define NARROW_MAX_BACKPROP	100
+#define NARROW_MAX_STACK	256
+
+/* Context used for narrowing of type conversions. */
+typedef struct NarrowConv {
+  jit_State *J;		/* JIT compiler state. */
+  IRRef2 *sp;		/* Current stack pointer. */
+  IRRef2 *maxsp;	/* Maximum stack pointer minus redzone. */
+  int lim;		/* Limit on the number of emitted conversions. */
+  IRRef mode;		/* Conversion mode (IRTOINT_*). */
+  IRRef2 stack[NARROW_MAX_STACK];  /* Stack holding the stack-machine code. */
+} NarrowConv;
+
+/* The stack machine has a 32 bit instruction format: [IROpT | IRRef1]
+** The lower 16 bits hold a reference (or 0). The upper 16 bits hold
+** the IR opcode + type or one of the following special opcodes:
+*/
+enum {
+  NARROW_REF,		/* Push ref. */
+  NARROW_CONV,		/* Push conversion of ref. */
+  NARROW_INT		/* Push KINT ref. The next code holds an int32_t. */
+};
+
+/* Look up a key in the backpropagation cache. Returns 0 on a miss. */
+static IRRef narrow_bpc_get(jit_State *J, IRRef1 key, IRRef mode)
+{
+  BPropEntry *bp = J->bpropcache;
+  BPropEntry *bpend = bp + BPROP_SLOTS;
+  for (; bp < bpend; bp++) {
+    /* A cached entry with a stronger check (mode <= requested) is fine. */
+    if (bp->key == key && bp->mode <= mode)
+      return bp->val;
+  }
+  return 0;  /* Cache miss. */
+}
+
+/* Insert a key/value pair into the backpropagation cache (round-robin). */
+static void narrow_bpc_set(jit_State *J, IRRef1 key, IRRef1 val, IRRef mode)
+{
+  BPropEntry *bp = &J->bpropcache[J->bpropslot];
+  /* Advance the replacement cursor; BPROP_SLOTS is a power of two. */
+  J->bpropslot = (J->bpropslot + 1) & (BPROP_SLOTS-1);
+  bp->key = key;
+  bp->val = val;
+  bp->mode = mode;
+}
+
+/* Backpropagate narrowing conversion. Return number of needed conversions. */
+static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth)
+{
+  jit_State *J = nc->J;
+  IRIns *ir = IR(ref);
+  IRRef cref;
+
+  /* Check the easy cases first. */
+  if (ir->o == IR_TONUM) {  /* Undo inverse conversion. */
+    *nc->sp++ = IRREF2(ir->op1, NARROW_REF);
+    return 0;
+  } else if (ir->o == IR_KNUM) {  /* Narrow FP constant. */
+    lua_Number n = ir_knum(ir)->n;
+    if (nc->mode == IRTOINT_TOBIT) {  /* Allows a wider range of constants. */
+      int64_t k64 = (int64_t)n;
+      if (n == cast_num(k64)) {  /* Only if constant doesn't lose precision. */
+	*nc->sp++ = IRREF2(0, NARROW_INT);
+	*nc->sp++ = (IRRef2)k64;  /* But always truncate to 32 bits. */
+	return 0;
+      }
+    } else {
+      int32_t k = lj_num2int(n);
+      if (n == cast_num(k)) {  /* Only if constant is really an integer. */
+	*nc->sp++ = IRREF2(0, NARROW_INT);
+	*nc->sp++ = (IRRef2)k;
+	return 0;
+      }
+    }
+    return 10;  /* Never narrow other FP constants (this is rare). */
+  }
+
+  /* Try to CSE the conversion. Stronger checks are ok, too. */
+  for (cref = J->chain[fins->o]; cref > ref; cref = IR(cref)->prev)
+    if (IR(cref)->op1 == ref &&
+	irt_isguard(IR(cref)->t) >= irt_isguard(fins->t)) {
+      *nc->sp++ = IRREF2(cref, NARROW_REF);
+      return 0;  /* Already there, no additional conversion needed. */
+    }
+
+  /* Backpropagate across ADD/SUB. */
+  if (ir->o == IR_ADD || ir->o == IR_SUB) {
+    /* Try cache lookup first. */
+    IRRef bpref, mode = nc->mode;
+    if (mode == IRTOINT_INDEX && depth > 0)
+      mode = IRTOINT_CHECK;  /* Inner conversions need a stronger check. */
+    bpref = narrow_bpc_get(nc->J, (IRRef1)ref, mode);
+    if (bpref) {
+      *nc->sp++ = IRREF2(bpref, NARROW_REF);
+      return 0;
+    }
+    /* Recurse into both operands; bail out on excessive depth or a full
+    ** stack. The stack pointer is restored when the limit is exceeded. */
+    if (++depth < NARROW_MAX_BACKPROP && nc->sp < nc->maxsp) {
+      IRRef2 *savesp = nc->sp;
+      int count = narrow_conv_backprop(nc, ir->op1, depth);
+      count += narrow_conv_backprop(nc, ir->op2, depth);
+      if (count <= nc->lim) {  /* Limit total number of conversions. */
+	*nc->sp++ = IRREF2(ref, IRTI(ir->o));  /* Emit integer op later. */
+	return count;
+      }
+      nc->sp = savesp;  /* Too many conversions, need to backtrack. */
+    }
+  }
+
+  /* Otherwise add a conversion. */
+  *nc->sp++ = IRREF2(ref, NARROW_CONV);
+  return 1;
+}
+
+/* Emit the conversions collected during backpropagation. */
+static IRRef narrow_conv_emit(jit_State *J, NarrowConv *nc)
+{
+  /* The fins fields must be saved now -- emitir() overwrites them. */
+  /* guardot is an opcode+type bias: op+guardot turns IR_ADD/IR_SUB into
+  ** the guarded overflow-checking variants (0 if no guard is needed). */
+  IROpT guardot = irt_isguard(fins->t) ? IRTG(IR_ADDOV-IR_ADD, 0) : 0;
+  IROpT convot = fins->ot;
+  IRRef1 convop2 = fins->op2;
+  IRRef2 *next = nc->stack;  /* List of instructions from backpropagation. */
+  IRRef2 *last = nc->sp;
+  IRRef2 *sp = nc->stack;  /* Recycle the stack to store operands. */
+  while (next < last) {  /* Simple stack machine to process the ins. list. */
+    IRRef2 ref = *next++;
+    IROpT op = ref >> 16;
+    if (op == NARROW_REF) {
+      *sp++ = ref;
+    } else if (op == NARROW_CONV) {
+      *sp++ = emitir_raw(convot, ref, convop2);  /* Raw emit avoids a loop. */
+    } else if (op == NARROW_INT) {
+      lua_assert(next < last);
+      *sp++ = lj_ir_kint(J, *next++);  /* Next code holds the literal int. */
+    } else {  /* Regular IROpT. Pops two operands and pushes one result. */
+      IRRef mode = nc->mode;
+      lua_assert(sp >= nc->stack+2);
+      sp--;
+      /* Omit some overflow checks for array indexing. See comments above. */
+      if (mode == IRTOINT_INDEX) {
+	if (next == last && irref_isk((IRRef1)sp[0]) &&
+	  (uint32_t)IR((IRRef1)sp[0])->i + 0x40000000 < 0x80000000)
+	  guardot = 0;  /* Outermost op with +-2^30 const: ABC covers it. */
+	else
+	  mode = IRTOINT_CHECK;  /* Otherwise cache a stronger check. */
+      }
+      sp[-1] = emitir(op+guardot, sp[-1], sp[0]);
+      narrow_bpc_set(J, (IRRef1)ref, (IRRef1)sp[-1], mode);  /* Add to cache. */
+    }
+  }
+  lua_assert(sp == nc->stack+1);
+  return nc->stack[0];
+}
+
+/* Narrow a type conversion of an arithmetic operation.
+** Returns the narrowed result or NEXTFOLD if the narrowing optimization
+** is disabled or backpropagation doesn't pay off.
+*/
+TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J)
+{
+  if ((J->flags & JIT_F_OPT_NARROW)) {
+    NarrowConv nc;
+    nc.J = J;
+    nc.sp = nc.stack;
+    nc.maxsp = &nc.stack[NARROW_MAX_STACK-4];  /* Keep a small redzone. */
+    if (fins->o == IR_TOBIT) {
+      nc.mode = IRTOINT_TOBIT;  /* Used only in the backpropagation cache. */
+      nc.lim = 2;  /* TOBIT can use a more optimistic rule. */
+    } else {
+      nc.mode = fins->op2;
+      nc.lim = 1;  /* Allow at most one extra conversion (see above). */
+    }
+    if (narrow_conv_backprop(&nc, fins->op1, 0) <= nc.lim)
+      return narrow_conv_emit(J, &nc);
+  }
+  return NEXTFOLD;
+}
+
+/* -- Narrowing of arithmetic operators ----------------------------------- */
+
+/* Check whether a number fits into an int32_t (-0 is ok, too). */
+static int numisint(lua_Number n)
+{
+  int32_t i = lj_num2int(n);
+  return (cast_num(i) == n);  /* Round-trip must preserve the value. */
+}
+
+/* Narrowing of modulo operator.
+** x % 2^k with integer x becomes a mask with 2^k-1. This matches Lua's
+** floor-based modulo because the divisor is positive. All other cases
+** use the generic FP sequence b - floor(b/c)*c.
+*/
+TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc)
+{
+  TRef tmp;
+  if ((J->flags & JIT_F_OPT_NARROW) &&
+      tref_isk(rc) && tref_isint(rc)) {  /* Optimize x % k. */
+    int32_t k = IR(tref_ref(rc))->i;
+    if (k > 0 && (k & (k-1)) == 0) {  /* i % 2^k ==> band(i, 2^k-1) */
+      if (tref_isint(rb))
+	return emitir(IRTI(IR_BAND), rb, lj_ir_kint(J, k-1));
+    }
+  }
+  /* b % c ==> b - floor(b/c)*c */
+  rb = lj_ir_tonum(J, rb);
+  rc = lj_ir_tonum(J, rc);
+  tmp = emitir(IRTN(IR_DIV), rb, rc);
+  tmp = emitir(IRTN(IR_FPMATH), tmp, IRFPM_FLOOR);
+  tmp = emitir(IRTN(IR_MUL), tmp, rc);
+  return emitir(IRTN(IR_SUB), rb, tmp);
+}
+
+/* Narrowing of power operator or math.pow.
+** Small integral exponents use POWI with a range guard; everything else
+** is split into exp2(c*log2(b)).
+*/
+TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc)
+{
+  lua_Number n;
+  /* A string exponent must coerce to a number, or the trace is aborted. */
+  if (tvisstr(vc) && !lj_str_numconv(strVdata(vc), vc))
+    lj_trace_err(J, LJ_TRERR_BADTYPE);
+  n = numV(vc);
+  /* Limit narrowing for pow to small exponents (or for two constants). */
+  if ((tref_isint(rc) && tref_isk(rc) && tref_isk(rb)) ||
+      ((J->flags & JIT_F_OPT_NARROW) &&
+       (numisint(n) && n >= -65536.0 && n <= 65536.0))) {
+    TRef tmp;
+    if (!tref_isinteger(rc)) {
+      if (tref_isstr(rc))
+	rc = emitir(IRTG(IR_STRTO, IRT_NUM), rc, 0);
+      rc = emitir(IRTGI(IR_TOINT), rc, IRTOINT_CHECK); /* Guarded TOINT! */
+    }
+    if (!tref_isk(rc)) {  /* Range guard: -65536 <= i <= 65536 */
+      /* Bias rc to the bottom of the int32 range, so both bounds fold
+      ** into a single signed comparison. */
+      tmp = emitir(IRTI(IR_ADD), rc, lj_ir_kint(J, 65536-2147483647-1));
+      emitir(IRTGI(IR_LE), tmp, lj_ir_kint(J, 2*65536-2147483647-1));
+    }
+    return emitir(IRTN(IR_POWI), rb, rc);
+  }
+  /* FOLD covers most cases, but some are easier to do here. */
+  if (tref_isk(rb) && tvispone(ir_knum(IR(tref_ref(rb)))))
+    return rb;  /* 1 ^ x ==> 1 */
+  rc = lj_ir_tonum(J, rc);
+  if (tref_isk(rc) && ir_knum(IR(tref_ref(rc)))->n == 0.5)
+    return emitir(IRTN(IR_FPMATH), rb, IRFPM_SQRT);  /* x ^ 0.5 ==> sqrt(x) */
+  /* Split up b^c into exp2(c*log2(b)). Assembler may rejoin later. */
+  rb = emitir(IRTN(IR_FPMATH), rb, IRFPM_LOG2);
+  rc = emitir(IRTN(IR_MUL), rb, rc);
+  return emitir(IRTN(IR_FPMATH), rc, IRFPM_EXP2);
+}
+
+/* -- Predictive narrowing of induction variables ------------------------- */
+
+/* Narrow the FORL index type by looking at the runtime values. */
+IRType lj_opt_narrow_forl(cTValue *forbase)
+{
+  lua_assert(tvisnum(&forbase[FORL_IDX]) &&
+	     tvisnum(&forbase[FORL_STOP]) &&
+	     tvisnum(&forbase[FORL_STEP]));
+  /* Narrow only if the runtime values of start/stop/step are all integers. */
+  if (numisint(numV(&forbase[FORL_IDX])) &&
+      numisint(numV(&forbase[FORL_STOP])) &&
+      numisint(numV(&forbase[FORL_STEP]))) {
+    /* And if the loop index can't possibly overflow. */
+    lua_Number step = numV(&forbase[FORL_STEP]);
+    /* Require stop+step in int32 range, so the final increment of the
+    ** index cannot overflow either. */
+    lua_Number sum = numV(&forbase[FORL_STOP]) + step;
+    if (0 <= step ? sum <= 2147483647.0 : sum >= -2147483648.0)
+      return IRT_INT;
+  }
+  return IRT_NUM;
+}
+
+#undef IR
+#undef fins
+#undef emitir
+#undef emitir_raw
+
+#endif

+ 2198 - 0
src/lj_parse.c

@@ -0,0 +1,2198 @@
+/*
+** Lua parser (source code -> bytecode).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_parse_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_func.h"
+#include "lj_state.h"
+#include "lj_bc.h"
+#include "lj_lex.h"
+#include "lj_parse.h"
+#include "lj_vm.h"
+#include "lj_vmevent.h"
+
+/* -- Parser structures and definitions ----------------------------------- */
+
+/* Expression kinds. */
+typedef enum {
+  /* Constant expressions must be first and in this order: */
+  VKNIL,
+  VKFALSE,
+  VKTRUE,
+  VKSTR,	/* sval = string value */
+  VKNUM,	/* nval = numerical value */
+  VKLAST = VKNUM,
+  /* Non-constant expressions follow: */
+  VLOCAL,	/* info = local register */
+  VUPVAL,	/* info = upvalue index */
+  VGLOBAL,	/* sval = string value */
+  VINDEXED,	/* info = table register, aux = index reg/byte/string const */
+  VJMP,		/* info = instruction PC */
+  VRELOCABLE,	/* info = instruction PC */
+  VNONRELOC,	/* info = result register */
+  VCALL,	/* info = instruction PC, aux = base */
+  VVOID
+} ExpKind;
+
+/* Expression descriptor. The union is discriminated by the kind k. */
+typedef struct ExpDesc {
+  union {
+    struct { uint32_t info, aux; } s;
+    TValue nval;
+    GCstr *sval;
+  } u;
+  ExpKind k;
+  BCPos t;  /* true condition exit list */
+  BCPos f;  /* false condition exit list */
+} ExpDesc;
+
+/* Tests for expression types. isK() relies on the enum ordering above. */
+#define isK(e)		((uint32_t)((e)->k) <= VKLAST)
+#define isnumK(e)	((e)->k == VKNUM)
+#define isstrK(e)	((e)->k == VKSTR)
+#define expnumV(e)	check_exp(isnumK((e)), numV(&(e)->u.nval))
+
+#define hasjumps(e)	((e)->t != (e)->f)
+#define isKexp(e)	(isK(e) && !hasjumps(e))
+#define isnumKexp(e)	(isnumK(e) && !hasjumps(e))
+
+#define priKk(k)	check_exp((k) <= VKTRUE, (k) - VKNIL)
+#define priK(e)		priKk((e)->k)
+
+/* Per-function linked list of blocks. */
+typedef struct FuncBlock {
+  struct FuncBlock *previous;  /* chain */
+  BCPos breaklist;  /* list of jumps out of this loop */
+  uint8_t nactvar;  /* # active locals outside the breakable structure */
+  uint8_t upval;  /* true if some variable in the block is an upvalue */
+  uint8_t isbreakable;  /* true if `block' is a loop */
+} FuncBlock;
+
+/* Upvalue description. NOTE(review): k/info appear to mirror ExpDesc's
+** kind/info for VLOCAL/VUPVAL -- confirm against the use sites. */
+typedef struct UpValDesc {
+  uint8_t k;
+  uint8_t info;
+} UpValDesc;
+
+/* Per-function state. */
+typedef struct FuncState {
+  GCproto *pt;  /* current function header */
+  GCtab *kt;  /* table to find (and reuse) elements in `k' */
+  struct FuncState *prev;  /* enclosing function */
+  struct LexState *ls;  /* lexical state */
+  struct lua_State *L;  /* copy of the Lua state */
+  struct FuncBlock *bl;  /* chain of current blocks */
+  BCPos pc;  /* next bytecode position */
+  BCPos lasttarget;  /* PC of last jump target */
+  BCPos jpc;  /* list of pending jumps to PC */
+  BCReg freereg;  /* first free register */
+  BCReg nkn, nkgc;  /* number of lua_Number/GCobj constants */
+  uint16_t nlocvars;  /* number of elements in `locvars' */
+  uint8_t nactvar;  /* number of active local variables */
+  uint8_t nuv;  /* number of upvalues */
+  UpValDesc upvalues[LJ_MAX_UPVAL];  /* upvalues */
+  uint16_t actvar[LJ_MAX_LOCVAR];  /* declared-variable stack */
+} FuncState;
+
+/* Binary and unary operators. ORDER OPR */
+typedef enum BinOpr {
+  OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW,  /* ORDER ARITH */
+  OPR_CONCAT,
+  OPR_NE, OPR_EQ,
+  OPR_LT, OPR_GE, OPR_LE, OPR_GT,
+  OPR_AND, OPR_OR,
+  OPR_NOBINOPR
+} BinOpr;
+
+/* The bytecode opcode layout must mirror the operator enums above. */
+LJ_STATIC_ASSERT((int)BC_ISGE-(int)BC_ISLT == (int)OPR_GE-(int)OPR_LT);
+LJ_STATIC_ASSERT((int)BC_ISLE-(int)BC_ISLT == (int)OPR_LE-(int)OPR_LT);
+LJ_STATIC_ASSERT((int)BC_ISGT-(int)BC_ISLT == (int)OPR_GT-(int)OPR_LT);
+LJ_STATIC_ASSERT((int)BC_SUBVV-(int)BC_ADDVV == (int)OPR_SUB-(int)OPR_ADD);
+LJ_STATIC_ASSERT((int)BC_MULVV-(int)BC_ADDVV == (int)OPR_MUL-(int)OPR_ADD);
+LJ_STATIC_ASSERT((int)BC_DIVVV-(int)BC_ADDVV == (int)OPR_DIV-(int)OPR_ADD);
+LJ_STATIC_ASSERT((int)BC_MODVV-(int)BC_ADDVV == (int)OPR_MOD-(int)OPR_ADD);
+
+typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr;
+
+/* -- Error handling ------------------------------------------------------ */
+
+/* Throw a syntax error at the current token. */
+LJ_NORET LJ_NOINLINE static void err_syntax(LexState *ls, ErrMsg em)
+{
+  lj_lex_error(ls, ls->token, em);
+}
+
+/* Throw an error for an unexpected token. */
+LJ_NORET LJ_NOINLINE static void err_token(LexState *ls, LexToken token)
+{
+  lj_lex_error(ls, ls->token, LJ_ERR_XTOKEN, lj_lex_token2str(ls, token));
+}
+
+/* Throw a limit-overflow error, mentioning the function's defining line
+** when available (linedefined == 0 selects the chunk-level message). */
+LJ_NORET static void err_limit(FuncState *fs, uint32_t limit, const char *what)
+{
+  if (fs->pt->linedefined == 0)
+    lj_lex_error(fs->ls, 0, LJ_ERR_XLIMM, limit, what);
+  else
+    lj_lex_error(fs->ls, 0, LJ_ERR_XLIMF, fs->pt->linedefined, limit, what);
+}
+
+#define checklimit(fs, v, l, m)		if ((v) >= (l)) err_limit(fs, l, m)
+#define checklimitgt(fs, v, l, m)	if ((v) > (l)) err_limit(fs, l, m)
+#define checkcond(ls, c, em)		{ if (!(c)) err_syntax(ls, em); }
+
+/* -- Code emitter: branches ---------------------------------------------- */
+
+/* Follow a jump-list link: return the next jump PC or NO_JMP at the end. */
+static BCPos getjump(FuncState *fs, BCPos pc)
+{
+  ptrdiff_t offs = bc_j(fs->pt->bc[pc]);
+  /* A stored offset of NO_JMP marks the end of the chain. */
+  return ((BCPos)offs == NO_JMP) ? NO_JMP : (BCPos)((ptrdiff_t)pc + 1 + offs);
+}
+
+/* Check whether the jump list needs a value: true if any jump in the
+** list is not preceded by a value-producing ISTC/ISFC test. */
+static int need_value(FuncState *fs, BCPos list)
+{
+  for (; list != NO_JMP; list = getjump(fs, list)) {
+    BCOp op = bc_op(fs->pt->bc[list >= 1 ? list-1 : list]);
+    if (!(op == BC_ISTC || op == BC_ISFC)) return 1;
+  }
+  return 0;  /* Not found. */
+}
+
+/* Patch the test instruction preceding the jump at pc: make an ISTC/ISFC
+** store its value into reg, or degrade it to IST/ISF when no value is
+** needed or reg already holds it. Returns 0 for non-patchable ins. */
+static int patchtestreg(FuncState *fs, BCPos pc, BCReg reg)
+{
+  BCIns *i = &fs->pt->bc[pc >= 1 ? pc-1 : pc];
+  BCOp op = bc_op(*i);
+  if (!(op == BC_ISTC || op == BC_ISFC))
+    return 0;  /* cannot patch other instructions */
+  if (reg != NO_REG && reg != bc_d(*i)) {
+    setbc_a(i, reg);
+  } else {  /* no register to put value or register already has the value */
+    setbc_op(i, op+(BC_IST-BC_ISTC));
+    setbc_a(i, 0);
+  }
+  return 1;
+}
+
+/* Drop the values of all ISTC/ISFC tests in the jump list. */
+static void removevalues(FuncState *fs, BCPos list)
+{
+  for (; list != NO_JMP; list = getjump(fs, list))
+    patchtestreg(fs, list, NO_REG);
+}
+
+/* Point the jump at pc to dest. The offset is biased by BCBIAS_J and
+** must fit into the 16 bit D operand field. */
+static void fixjump(FuncState *fs, BCPos pc, BCPos dest)
+{
+  BCIns *jmp = &fs->pt->bc[pc];
+  BCPos offset = dest-(pc+1)+BCBIAS_J;
+  lua_assert(dest != NO_JMP);
+  if (offset > BCMAX_D)
+    err_syntax(fs->ls, LJ_ERR_XJUMP);
+  setbc_d(jmp, offset);
+}
+
+/* Append jump list l2 to jump list *l1. */
+static void concatjumps(FuncState *fs, BCPos *l1, BCPos l2)
+{
+  if (l2 == NO_JMP)
+    return;  /* Nothing to append. */
+  if (*l1 == NO_JMP) {  /* Empty destination list? Take over l2. */
+    *l1 = l2;
+  } else {
+    BCPos tail = *l1, next;
+    while ((next = getjump(fs, tail)) != NO_JMP)  /* Walk to the last link. */
+      tail = next;
+    fixjump(fs, tail, l2);  /* Chain l2 onto the tail. */
+  }
+}
+
+/* Patch all jumps in list: jumps whose test was patched to produce a
+** value (into reg) go to vtarget; all others go to dtarget. */
+static void patchlistaux(FuncState *fs, BCPos list, BCPos vtarget,
+			 BCReg reg, BCPos dtarget)
+{
+  while (list != NO_JMP) {
+    BCPos next = getjump(fs, list);
+    if (patchtestreg(fs, list, reg))
+      fixjump(fs, list, vtarget);
+    else
+      fixjump(fs, list, dtarget);  /* jump to default target */
+    list = next;
+  }
+}
+
+/* Redirect list to jump to the current position. Final resolution is
+** deferred via the jpc list until the next instruction is emitted. */
+static void patchtohere(FuncState *fs, BCPos list)
+{
+  fs->lasttarget = fs->pc;
+  concatjumps(fs, &fs->jpc, list);
+}
+
+/* Patch list to jump to target, which must not lie ahead of the code. */
+static void patchlist(FuncState *fs, BCPos list, BCPos target)
+{
+  if (target == fs->pc) {
+    patchtohere(fs, list);  /* Defer: target is the next instruction. */
+  } else {
+    lua_assert(target < fs->pc);
+    patchlistaux(fs, list, target, NO_REG, target);
+  }
+}
+
+/* -- Code emitter: instructions ------------------------------------------ */
+
+/* Emit a bytecode instruction at the current PC and advance the PC.
+** Pending jumps to here (fs->jpc) are resolved first. The bytecode and
+** line-info arrays are grown in lockstep on demand. */
+static BCPos emitINS(FuncState *fs, BCIns i)
+{
+  GCproto *pt;
+  patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc);
+  fs->jpc = NO_JMP;
+  pt = fs->pt;
+  if (LJ_UNLIKELY(fs->pc >= pt->sizebc)) {
+    checklimit(fs, fs->pc, LJ_MAX_BCINS, "bytecode instructions");
+    lj_mem_growvec(fs->L, pt->bc, pt->sizebc, LJ_MAX_BCINS, BCIns);
+    lj_mem_growvec(fs->L, pt->lineinfo, pt->sizelineinfo, LJ_MAX_BCINS, BCLine);
+  }
+  pt->bc[fs->pc] = i;
+  pt->lineinfo[fs->pc] = fs->ls->lastline;
+  return fs->pc++;
+}
+
+#define emitABC(fs, o, a, b, c)	emitINS(fs, BCINS_ABC(o, a, b, c))
+#define emitAD(fs, o, a, d)	emitINS(fs, BCINS_AD(o, a, d))
+#define emitAJ(fs, o, a, j)	emitINS(fs, BCINS_AJ(o, a, j))
+
+#define bcptr(fs, e)		(&(fs)->pt->bc[(e)->u.s.info])
+
+/* Emit a jump and return its PC (the head of a new jump list).
+** A trailing BC_UCLO which is not a jump target is reused as the jump.
+** Jumps pending for the current position are chained onto the new jump. */
+static BCPos emit_jump(FuncState *fs)
+{
+  BCPos jpc = fs->jpc;  /* save list of jumps to here */
+  BCPos j = fs->pc - 1;
+  fs->jpc = NO_JMP;
+  if ((int32_t)j >= (int32_t)fs->lasttarget && bc_op(fs->pt->bc[j]) == BC_UCLO)
+    setbc_j(&fs->pt->bc[j], NO_JMP);
+  else
+    j = emitAJ(fs, BC_JMP, fs->freereg, NO_JMP);
+  concatjumps(fs, &j, jpc);  /* keep them on hold */
+  return j;
+}
+
+/* -- Code emitter: constants --------------------------------------------- */
+
+/* Intern a number constant and return its index.
+** The kt table maps the value to its index. A fresh entry stores the
+** index in the raw u64 payload; on a later hit tvisnum() is true and
+** u32.lo yields the index again. */
+static BCReg numK(FuncState *fs, ExpDesc *e)
+{
+  lua_State *L = fs->L;
+  TValue *val;
+  lua_assert(isnumK(e));
+  val = lj_tab_set(L, fs->kt, &e->u.nval);
+  if (tvisnum(val))
+    return val->u32.lo;  /* Already interned. */
+  val->u64 = fs->nkn;
+  return fs->nkn++;
+}
+
+/* Intern a GC object constant, keyed by its tagged TValue, and return
+** its index. Uses the same interning scheme as numK. */
+static BCReg gcK(FuncState *fs, GCobj *gc, int itype)
+{
+  lua_State *L = fs->L;
+  TValue o, *val;
+  setgcV(L, &o, &gc->gch, itype);
+  val = lj_tab_set(L, fs->kt, &o);
+  if (tvisnum(val))
+    return val->u32.lo;  /* Already interned. */
+  val->u64 = fs->nkgc;
+  return fs->nkgc++;
+}
+
+/* Return the GC constant index for the string of a VKSTR/VGLOBAL expr. */
+static BCReg strK(FuncState *fs, ExpDesc *e)
+{
+  lua_assert(isstrK(e) || e->k == VGLOBAL);
+  return gcK(fs, obj2gco(e->u.sval), LJ_TSTR);
+}
+
+/* Intern a string and anchor it in the constant table, so the GC cannot
+** collect it while parsing is in progress. */
+GCstr *lj_parse_keepstr(LexState *ls, const char *str, size_t len)
+{
+  lua_State *L = ls->L;
+  GCstr *s = lj_str_new(L, str, len);
+  TValue *tv = lj_tab_setstr(L, ls->fs->kt, s);
+  if (tvisnil(tv)) setboolV(tv, 1);  /* Anchor string to avoid GC. */
+  return s;
+}
+
+/* Anchor the string value of the current name/string token (GC guard). */
+static void keep_token(LexState *ls)
+{
+  if (ls->token == TK_name || ls->token == TK_string) {
+    TValue *tv = lj_tab_setstr(ls->L, ls->fs->kt, strV(&ls->tokenval));
+    if (tvisnil(tv)) setboolV(tv, 1);  /* Anchor string to avoid GC. */
+  }
+}
+
+/* Emit a load of nil(s) into registers [from, from+n-1].
+** Peephole: merge with an immediately preceding KPRI(nil) or KNIL when
+** no jump targets the current position. */
+static void nilK(FuncState *fs, BCReg from, BCReg n)
+{
+  BCIns *pr;
+  if (fs->pc > fs->lasttarget) {  /* no jumps to current position? */
+    BCReg pfrom, pto;
+    pr = &fs->pt->bc[fs->pc-1];
+    pfrom = bc_a(*pr);
+    switch (bc_op(*pr)) {
+    case BC_KPRI:
+      if (bc_d(*pr) != ~LJ_TNIL) break;  /* Only mergeable if it loads nil. */
+      if (from == pfrom) {
+	if (n == 1) return;  /* Same single register is already nil. */
+      } else if (from == pfrom+1) {  /* Adjacent: widen to a 2 reg range. */
+	from = pfrom;
+	n++;
+      } else {
+	break;
+      }
+      fs->pc--;  /* Drop the old ins; the merged one is emitted below. */
+      break;
+    case BC_KNIL:
+      pto = bc_d(*pr);
+      if (pfrom <= from && from <= pto+1) {  /* can connect both? */
+	if (from+n-1 > pto)
+	  setbc_d(pr, from+n-1);  /* Extend the existing range in place. */
+	return;
+      }
+      break;
+    default:
+      break;
+    }
+  }
+  emitINS(fs, n == 1 ? BCINS_AD(BC_KPRI, from, priKk(VKNIL))
+		     : BCINS_AD(BC_KNIL, from, from+n-1));
+}
+
+/* -- Code emitter: registers --------------------------------------------- */
+
+/* Make sure the frame can hold n more registers; grow framesize if needed. */
+static void checkframe(FuncState *fs, BCReg n)
+{
+  BCReg need = fs->freereg + n;
+  if (need <= fs->pt->framesize)
+    return;  /* Frame is already big enough. */
+  if (need >= LJ_MAX_SLOTS)
+    err_syntax(fs->ls, LJ_ERR_XSLOTS);  /* Too many slots in use. */
+  fs->pt->framesize = cast_byte(need);
+}
+
+/* Reserve n consecutive registers starting at the first free register. */
+static void reserveregs(FuncState *fs, BCReg n)
+{
+  checkframe(fs, n);  /* May grow the frame or throw. */
+  fs->freereg += n;
+}
+
+/* Free a register, unless it holds an active local variable. */
+static void freereg(FuncState *fs, BCReg reg)
+{
+  if (reg >= fs->nactvar) {  /* Only scratch registers above the locals. */
+    fs->freereg--;
+    lua_assert(reg == fs->freereg);  /* Registers are freed in LIFO order. */
+  }
+}
+
+/* Free the register held by expression e (if it occupies one). */
+static void freeexp(FuncState *fs, ExpDesc *e)
+{
+  if (e->k == VNONRELOC)
+    freereg(fs, e->u.s.info);
+}
+
+/* -- Code emitter: expressions ------------------------------------------- */
+
+/* Discharge variable accesses: rewrite VUPVAL/VGLOBAL/VINDEXED into a
+** pending load instruction (VRELOCABLE); VCALL/VLOCAL become VNONRELOC.
+** All other expression kinds are left untouched. */
+static void dischargevars(FuncState *fs, ExpDesc *e)
+{
+  BCIns ins;
+  switch (e->k) {
+  case VUPVAL:
+    ins = BCINS_AD(BC_UGET, 0, e->u.s.info);
+    break;
+  case VGLOBAL:
+    ins = BCINS_AD(BC_GGET, 0, strK(fs, e));
+    break;
+  case VINDEXED: {
+    /* TGET[VSB] key = reg, string const or byte const */
+    BCReg rc = e->u.s.aux;
+    if ((int32_t)rc < 0) {  /* String constant key, encoded as ~rc. */
+      ins = BCINS_ABC(BC_TGETS, 0, e->u.s.info, ~rc);
+    } else if (rc > BCMAX_C) {  /* Byte constant key, biased by BCMAX_C+1. */
+      ins = BCINS_ABC(BC_TGETB, 0, e->u.s.info, rc-(BCMAX_C+1));
+    } else {  /* Register key. */
+      freereg(fs, rc);
+      ins = BCINS_ABC(BC_TGETV, 0, e->u.s.info, rc);
+    }
+    freereg(fs, e->u.s.info);
+    break;
+    }
+  case VCALL:
+    e->u.s.info = e->u.s.aux;
+    /* fallthrough */
+  case VLOCAL:
+    e->k = VNONRELOC;
+    /* fallthrough */
+  default:
+    return;
+  }
+  e->u.s.info = emitINS(fs, ins);
+  e->k = VRELOCABLE;
+}
+
+/* Emit code to put the value of e into a specific register reg.
+** Jump lists of e are NOT resolved here (see exp2reg). */
+static void discharge2reg(FuncState *fs, ExpDesc *e, BCReg reg)
+{
+  BCIns ins;
+  dischargevars(fs, e);
+  switch (e->k) {
+  case VKNIL: case VKFALSE:  case VKTRUE:
+    ins = BCINS_AD(BC_KPRI, reg, priK(e));
+    break;
+  case VKSTR:
+    ins = BCINS_AD(BC_KSTR, reg, strK(fs, e));
+    break;
+  case VKNUM: {
+    lua_Number n = expnumV(e);
+    int32_t k = lj_num2int(n);
+    if (checki16(k) && n == cast_num(k))  /* Integral and fits in 16 bits? */
+      ins = BCINS_AD(BC_KSHORT, reg, (BCReg)(uint16_t)k);
+    else
+      ins = BCINS_AD(BC_KNUM, reg, numK(fs, e));
+    break;
+    }
+  case VRELOCABLE:
+    setbc_a(bcptr(fs, e), reg);  /* Relocate the pending load in place. */
+    goto noins;
+  case VNONRELOC:
+    if (reg == e->u.s.info)
+      goto noins;
+    ins = BCINS_AD(BC_MOV, reg, e->u.s.info);
+    break;
+  default:
+    lua_assert(e->k == VVOID || e->k == VJMP);
+    return;  /* nothing to do... */
+  }
+  emitINS(fs, ins);
+noins:
+  e->u.s.info = reg;
+  e->k = VNONRELOC;
+}
+
+/* Put expression e into register reg and resolve its jump lists.
+** May emit KPRI false/true loads when a jump must produce a value. */
+static void exp2reg(FuncState *fs, ExpDesc *e, BCReg reg)
+{
+  discharge2reg(fs, e, reg);
+  if (e->k == VJMP)
+    concatjumps(fs, &e->t, e->u.s.info);  /* put this jump in `t' list */
+  if (hasjumps(e)) {
+    BCPos final;  /* position after whole expression */
+    BCPos p_f = NO_JMP;  /* position of an eventual LOAD false */
+    BCPos p_t = NO_JMP;  /* position of an eventual LOAD true */
+    if (need_value(fs, e->t) || need_value(fs, e->f)) {
+      BCPos fj = (e->k == VJMP) ? NO_JMP : emit_jump(fs);
+      p_f = emitAD(fs, BC_KPRI, reg, priKk(VKFALSE));
+      emitAJ(fs, BC_JMP, fs->freereg, 1);  /* Skip over the true load. */
+      p_t = emitAD(fs, BC_KPRI, reg, priKk(VKTRUE));
+      patchtohere(fs, fj);
+    }
+    final = fs->pc;
+    fs->lasttarget = final;
+    patchlistaux(fs, e->f, final, reg, p_f);
+    patchlistaux(fs, e->t, final, reg, p_t);
+  }
+  e->f = e->t = NO_JMP;
+  e->u.s.info = reg;
+  e->k = VNONRELOC;
+}
+
+/* Put expression e into the next free register (reserving it). */
+static void exp2nextreg(FuncState *fs, ExpDesc *e)
+{
+  dischargevars(fs, e);
+  freeexp(fs, e);
+  reserveregs(fs, 1);
+  exp2reg(fs, e, fs->freereg - 1);
+}
+
+/* Put expression e into any register and return that register.
+** Reuses the current register when possible; a register holding a local
+** variable is never clobbered with jump-list results.
+*/
+static BCReg exp2anyreg(FuncState *fs, ExpDesc *e)
+{
+  dischargevars(fs, e);
+  if (e->k == VNONRELOC) {
+    if (!hasjumps(e)) return e->u.s.info;  /* exp is already in a register */
+    if (e->u.s.info >= fs->nactvar) {  /* reg. is not a local? */
+      exp2reg(fs, e, e->u.s.info);  /* put value on it */
+      return e->u.s.info;
+    }
+  }
+  exp2nextreg(fs, e);  /* default */
+  return e->u.s.info;
+}
+
+/* Discharge e to a value: either a constant-like ExpDesc or a register.
+** Only forces a register if pending jumps make a materialized value needed.
+*/
+static void exp2val(FuncState *fs, ExpDesc *e)
+{
+  if (hasjumps(e))
+    exp2anyreg(fs, e);
+  else
+    dischargevars(fs, e);
+}
+
+/* Emit a store of expression e into variable var (local, upvalue,
+** global or indexed slot). Frees the (scratch) registers used by e.
+*/
+static void storevar(FuncState *fs, ExpDesc *var, ExpDesc *e)
+{
+  BCIns ins;
+  switch (var->k) {
+  case VLOCAL:
+    /* Locals are stored by materializing e directly into their register. */
+    freeexp(fs, e);
+    exp2reg(fs, e, var->u.s.info);
+    return;
+  case VUPVAL:
+    exp2val(fs, e);
+    /* Specialized upvalue stores for primitive/string/number constants. */
+    switch (e->k) {
+    case VKNIL: case VKFALSE: case VKTRUE:
+      ins = BCINS_AD(BC_USETP, var->u.s.info, priK(e));
+      break;
+    case VKSTR:
+      ins = BCINS_AD(BC_USETS, var->u.s.info, strK(fs, e));
+      break;
+    case VKNUM:
+      ins = BCINS_AD(BC_USETN, var->u.s.info, numK(fs, e));
+      break;
+    default:
+      ins = BCINS_AD(BC_USETV, var->u.s.info, exp2anyreg(fs, e));
+      break;
+    }
+    break;
+  case VGLOBAL: {
+    BCReg ra = exp2anyreg(fs, e);
+    ins = BCINS_AD(BC_GSET, ra, strK(fs, var));
+    break;
+    }
+  case VINDEXED: {
+    /* TSET[VSB] key = reg, string const or byte const */
+    BCReg ra = exp2anyreg(fs, e);
+    BCReg rc = var->u.s.aux;  /* Encoded key, see indexexp(). */
+    if ((int32_t)rc < 0) {
+      ins = BCINS_ABC(BC_TSETS, ra, var->u.s.info, ~rc);
+    } else if (rc > BCMAX_C) {
+      ins = BCINS_ABC(BC_TSETB, ra, var->u.s.info, rc-(BCMAX_C+1));
+    } else {
+      /* Free late alloced key reg to avoid assert on free of value reg. */
+      /* This can only happen when called from constructor(). */
+      lua_assert(e->k != VNONRELOC || ra < fs->nactvar ||
+		 rc < ra || (freereg(fs, rc),1));
+      ins = BCINS_ABC(BC_TSETV, ra, var->u.s.info, rc);
+    }
+    break;
+    }
+  default:
+    lua_assert(0);  /* invalid var kind to store */
+    return;
+  }
+  emitINS(fs, ins);
+  freeexp(fs, e);
+}
+
+/* Turn t into an indexed expression t[e]. The key e is encoded into
+** t->u.s.aux: negative values are complemented string-constant indices,
+** BCMAX_C+1..BCMAX_C+256 are small integer byte keys, 0..BCMAX_C are
+** key registers. storevar()/dischargevars() decode this encoding.
+*/
+static void indexexp(FuncState *fs, ExpDesc *t, ExpDesc *e)
+{
+  /* already called: exp2val(fs, e) */
+  t->k = VINDEXED;
+  if (isnumK(e)) {
+    lua_Number n = expnumV(e);
+    int32_t k = lj_num2int(n);
+    if (checku8(k) && n == cast_num(k)) {
+      t->u.s.aux = BCMAX_C+1+(uint32_t)k;  /* 256..511: const byte key */
+      return;
+    }
+  } else if (isstrK(e)) {
+    BCReg idx = strK(fs, e);
+    if (idx <= BCMAX_C) {
+      t->u.s.aux = ~idx;  /* -256..-1: const string key */
+      return;
+    }
+  }
+  t->u.s.aux = exp2anyreg(fs, e);  /* 0..255: register */
+}
+
+/* Emit code for a method lookup e:key, preparing a call frame:
+** func+0 receives the method, func+1 a copy of the object (self).
+** Falls back to KSTR + TGETV when the name doesn't fit the TGETS operand.
+*/
+static void methodexp(FuncState *fs, ExpDesc *e, ExpDesc *key)
+{
+  BCReg idx, func, tab = exp2anyreg(fs, e);
+  freeexp(fs, e);
+  func = fs->freereg;
+  emitAD(fs, BC_MOV, func+1, tab);  /* Copy object to 2nd argument (self). */
+  lua_assert(isstrK(key));
+  idx = strK(fs, key);
+  if (idx <= BCMAX_C) {
+    reserveregs(fs, 2);
+    emitABC(fs, BC_TGETS, func, tab, idx);
+  } else {
+    /* Name constant index too large: load it into a temp register first. */
+    reserveregs(fs, 3);
+    emitAD(fs, BC_KSTR, func+2, idx);
+    emitABC(fs, BC_TGETV, func, tab, func+2);
+    fs->freereg--;  /* Temp key register is no longer needed. */
+  }
+  e->u.s.info = func;
+  e->k = VNONRELOC;
+}
+
+/* -- Code emitter: conditionals ------------------------------------------ */
+
+/* Invert the sense of the comparison guarding a VJMP expression.
+** Relies on paired comparison opcodes differing only in the lowest bit.
+*/
+static void invertjump(FuncState *fs, ExpDesc *e)
+{
+  BCIns *i = bcptr(fs, e) - 1;
+  setbc_op(i, bc_op(*i)^1);
+}
+
+/* Emit a conditional test on e followed by a jump taken when the value
+** of e equals cond. A pending NOT is folded into the opposite test.
+** Returns the position of the emitted jump.
+*/
+static BCPos jumponcond(FuncState *fs, ExpDesc *e, int cond)
+{
+  if (e->k == VRELOCABLE) {
+    BCIns *i = bcptr(fs, e);
+    if (bc_op(*i) == BC_NOT) {
+      /* Fold the NOT into an inverted test on its operand. */
+      *i = BCINS_AD(cond ? BC_ISF : BC_IST, 0, bc_d(*i));
+      return emit_jump(fs);
+    }
+    /* else go through */
+  }
+  if (e->k != VNONRELOC) {
+    reserveregs(fs, 1);
+    discharge2reg(fs, e, fs->freereg-1);
+  }
+  freeexp(fs, e);
+  /* ISTC/ISFC also copy the tested value (destination patched later). */
+  emitAD(fs, cond ? BC_ISTC : BC_ISFC, NO_REG, e->u.s.info);
+  return emit_jump(fs);
+}
+
+/* Emit a branch taken when e is false; used for `and' and conditions.
+** Known-truthy constants emit nothing. The new jump joins e's false
+** list; the true list is patched to fall through to the following code.
+*/
+static void goiftrue(FuncState *fs, ExpDesc *e)
+{
+  BCPos pc;  /* PC of last jump. */
+  dischargevars(fs, e);
+  switch (e->k) {
+  case VKSTR: case VKNUM: case VKTRUE:
+    pc = NO_JMP;  /* always true; do nothing */
+    break;
+  case VJMP:
+    invertjump(fs, e);  /* Condition already tested; jump on the inverse. */
+    pc = e->u.s.info;
+    break;
+  case VKFALSE:
+    if (!hasjumps(e)) {
+      pc = emit_jump(fs);  /* always jump */
+      break;
+    }
+    /* fallthrough */
+  default:
+    pc = jumponcond(fs, e, 0);
+    break;
+  }
+  concatjumps(fs, &e->f, pc);  /* insert last jump in `f' list */
+  patchtohere(fs, e->t);
+  e->t = NO_JMP;
+}
+
+/* Emit a branch taken when e is true; used for `or'. Mirror image of
+** goiftrue(): known-falsy constants emit nothing, the new jump joins
+** e's true list and the false list falls through.
+*/
+static void goiffalse(FuncState *fs, ExpDesc *e)
+{
+  BCPos pc;  /* PC of last jump. */
+  dischargevars(fs, e);
+  switch (e->k) {
+  case VKNIL: case VKFALSE:
+    pc = NO_JMP;  /* always false; do nothing */
+    break;
+  case VJMP:
+    pc = e->u.s.info;
+    break;
+  case VKTRUE:
+    if (!hasjumps(e)) {
+      pc = emit_jump(fs);  /* always jump */
+      break;
+    }
+    /* fallthrough */
+  default:
+    pc = jumponcond(fs, e, 1);
+    break;
+  }
+  concatjumps(fs, &e->t, pc);  /* insert last jump in `t' list */
+  patchtohere(fs, e->f);
+  e->f = NO_JMP;
+}
+
+/* -- Code emitter: operators --------------------------------------------- */
+
+/* Try to constant-fold an arithmetic operation on two numeric constants.
+** Returns 1 and stores the result in e1 on success; returns 0 when an
+** operand is not a number constant or the result is NaN or -0 (kept
+** out of the constant table so semantics stay exact).
+*/
+static int foldarith(BinOpr opr, ExpDesc *e1, ExpDesc *e2)
+{
+  TValue o;
+  if (!isnumKexp(e1) || !isnumKexp(e2)) return 0;
+  setnumV(&o, lj_vm_foldarith(expnumV(e1), expnumV(e2), (int)opr-OPR_ADD));
+  if (tvisnan(&o) || tvismzero(&o)) return 0;  /* Avoid NaN and -0 as consts. */
+  setnumV(&e1->u.nval, numV(&o));
+  return 1;
+}
+
+/* Emit code for a binary arithmetic operation e1 <opr> e2, folding
+** constants when possible and selecting the VV/VN/NV opcode variant so
+** that at most one operand is a number constant. The result becomes a
+** relocatable instruction in e1.
+*/
+static void codearith(FuncState *fs, BinOpr opr, ExpDesc *e1, ExpDesc *e2)
+{
+  BCReg rb, rc, t;
+  uint32_t op;
+  if (foldarith(opr, e1, e2))
+    return;  /* Folded to a constant, nothing to emit. */
+  if (opr == OPR_POW) {
+    op = BC_POW;  /* POW has no constant-operand variants. */
+    rc = exp2anyreg(fs, e2);
+    rb = exp2anyreg(fs, e1);
+  } else {
+    op = opr-OPR_ADD+BC_ADDVV;
+    /* must discharge 2nd operand first since VINDEXED might free regs */
+    exp2val(fs, e2);
+    if (isnumK(e2) && (rc = numK(fs, e2)) <= BCMAX_C)
+      op -= BC_ADDVV-BC_ADDVN;  /* Use the *VN variant (const on the right). */
+    else
+      rc = exp2anyreg(fs, e2);
+    /* emit_prebinop discharges 1st operand, but may need to use KNUM/KSHORT */
+    lua_assert(isnumK(e1) || e1->k == VNONRELOC);
+    exp2val(fs, e1);
+    /* avoid two consts to satisfy bytecode constraints */
+    if (isnumK(e1) && !isnumK(e2) && (t = numK(fs, e1)) <= BCMAX_B) {
+      rb = rc; rc = t; op -= BC_ADDVV-BC_ADDNV;  /* *NV: const on the left. */
+    } else {
+      rb = exp2anyreg(fs, e1);
+    }
+  }
+  /* using freeexp might cause asserts if the order is wrong */
+  if (e1->k == VNONRELOC && e1->u.s.info >= fs->nactvar) fs->freereg--;
+  if (e2->k == VNONRELOC && e2->u.s.info >= fs->nactvar) fs->freereg--;
+  e1->u.s.info = emitABC(fs, op, 0, rb, rc);
+  e1->k = VRELOCABLE;
+}
+
+/* Emit code for a comparison e1 <opr> e2. EQ/NE pick a constant-
+** specialized opcode for the second operand; ordered comparisons map
+** GT/GE onto LT/LE by swapping operands. The result in *eret is a VJMP
+** whose branch is taken when the comparison holds.
+*/
+static void codecomp(FuncState *fs, BinOpr opr, ExpDesc *e1, ExpDesc *e2)
+{
+  ExpDesc *eret = e1;  /* Result always lands in the original e1. */
+  BCIns ins;
+  exp2val(fs, e1);
+  if (opr == OPR_EQ || opr == OPR_NE) {
+    BCOp op = opr == OPR_EQ ? BC_ISEQV : BC_ISNEV;
+    BCReg ra;
+    if (isK(e1)) { e1 = e2; e2 = eret; }  /* need constant in 2nd arg */
+    ra = exp2anyreg(fs, e1);  /* first arg must be in a reg */
+    exp2val(fs, e2);
+    switch (e2->k) {
+    case VKNIL: case VKFALSE: case VKTRUE:
+      ins = BCINS_AD(op+(BC_ISEQP-BC_ISEQV), ra, priK(e2));
+      break;
+    case VKSTR:
+      ins = BCINS_AD(op+(BC_ISEQS-BC_ISEQV), ra, strK(fs, e2));
+      break;
+    case VKNUM:
+      ins = BCINS_AD(op+(BC_ISEQN-BC_ISEQV), ra, numK(fs, e2));
+      break;
+    default:
+      ins = BCINS_AD(op, ra, exp2anyreg(fs, e2));
+      break;
+    }
+  } else {
+    uint32_t op = opr-OPR_LT+BC_ISLT;
+    BCReg ra;
+    if ((op-BC_ISLT) & 1) {  /* GT -> LT, GE -> LE */
+      e1 = e2; e2 = eret;  /* swap operands */
+      op = ((op-BC_ISLT)^3)+BC_ISLT;
+    }
+    ra = exp2anyreg(fs, e1);
+    ins = BCINS_AD(op, ra, exp2anyreg(fs, e2));
+  }
+  /* using freeexp might cause asserts if the order is wrong */
+  if (e1->k == VNONRELOC && e1->u.s.info >= fs->nactvar) fs->freereg--;
+  if (e2->k == VNONRELOC && e2->u.s.info >= fs->nactvar) fs->freereg--;
+  emitINS(fs, ins);
+  eret->u.s.info = emit_jump(fs);  /* Conditional jump after the test. */
+  eret->k = VJMP;
+}
+
+/* Emit code for a unary operator (minus, length, not) applied to e.
+** Unary minus folds numeric constants (except to -0); `not' folds
+** constants and inverts pending jump lists instead of emitting a test.
+*/
+static void emit_unop(FuncState *fs, UnOpr uop, ExpDesc *e)
+{
+  BCOp op = BC_LEN;
+  switch (uop) {
+  case OPR_MINUS:
+    if (isnumKexp(e) && expnumV(e) != 0) {  /* Avoid const-folding to -0. */
+      setnumV(&e->u.nval, -expnumV(e));
+      return;
+    }
+    op = BC_UNM;
+    /* fallthrough */
+  case OPR_LEN:
+    exp2anyreg(fs, e);
+    break;
+  case OPR_NOT:
+    /* interchange true and false lists */
+    { BCPos temp = e->f; e->f = e->t; e->t = temp; }
+    removevalues(fs, e->f);
+    removevalues(fs, e->t);
+    dischargevars(fs, e);
+    switch (e->k) {
+    case VKNIL: case VKFALSE:
+      e->k = VKTRUE;  /* not nil/false == true */
+      return;
+    case VKSTR: case VKNUM: case VKTRUE:
+      e->k = VKFALSE;  /* not <truthy constant> == false */
+      return;
+    case VJMP:
+      invertjump(fs, e);
+      return;
+    case VRELOCABLE:
+      /* Materialize into a fresh register so NOT has a fixed operand. */
+      reserveregs(fs, 1);
+      setbc_a(bcptr(fs, e), fs->freereg-1);
+      e->u.s.info = fs->freereg-1;
+      e->k = VNONRELOC;
+      break;
+    case VNONRELOC:
+      break;
+    default: lua_assert(0); return;
+    }
+    op = BC_NOT;
+    break;
+  default: lua_assert(0); return;
+  }
+  freeexp(fs, e);
+  e->u.s.info = emitAD(fs, op, 0, e->u.s.info);
+  e->k = VRELOCABLE;
+}
+
+/* Prepare the first operand of a binary operator before parsing the
+** second: thread jump lists for and/or, push to the stack top for
+** concat, and force non-constants to a register for the rest.
+*/
+static void prepare_binop(FuncState *fs, BinOpr op, ExpDesc *e)
+{
+  switch (op) {
+  case OPR_AND:
+    goiftrue(fs, e);
+    break;
+  case OPR_OR:
+    goiffalse(fs, e);
+    break;
+  case OPR_CONCAT:
+    exp2nextreg(fs, e);  /* operand must be on the `stack' */
+    break;
+  case OPR_EQ: case OPR_NE:
+    if (!isKexp(e)) exp2anyreg(fs, e);  /* Constants handled by codecomp(). */
+    break;
+  default:
+    if (!isnumKexp(e)) exp2anyreg(fs, e);  /* Number consts fold/specialize. */
+    break;
+  }
+}
+
+/* Finish a binary operator after both operands are parsed; the result
+** ends up in e1. and/or merge jump lists; consecutive concats are
+** collapsed into a single BC_CAT over a register range.
+*/
+static void emit_binop(FuncState *fs, BinOpr op, ExpDesc *e1, ExpDesc *e2)
+{
+  switch (op) {
+  case OPR_AND:
+    lua_assert(e1->t == NO_JMP);  /* list must be closed */
+    dischargevars(fs, e2);
+    concatjumps(fs, &e2->f, e1->f);  /* e1's false exits become e2's. */
+    *e1 = *e2;
+    break;
+  case OPR_OR:
+    lua_assert(e1->f == NO_JMP);  /* list must be closed */
+    dischargevars(fs, e2);
+    concatjumps(fs, &e2->t, e1->t);  /* e1's true exits become e2's. */
+    *e1 = *e2;
+    break;
+  case OPR_CONCAT:
+    exp2val(fs, e2);
+    if (e2->k == VRELOCABLE && bc_op(*bcptr(fs, e2)) == BC_CAT) {
+      /* Extend the pending CAT's register range downwards by one. */
+      lua_assert(e1->u.s.info == bc_b(*bcptr(fs, e2))-1);
+      freeexp(fs, e1);
+      setbc_b(bcptr(fs, e2), e1->u.s.info);
+      e1->u.s.info = e2->u.s.info;
+    } else {
+      exp2nextreg(fs, e2);
+      freeexp(fs, e2);
+      freeexp(fs, e1);
+      e1->u.s.info = emitABC(fs, BC_CAT, 0, e1->u.s.info, e2->u.s.info);
+    }
+    e1->k = VRELOCABLE;
+    break;
+  case OPR_ADD: case OPR_SUB: case OPR_MUL:
+  case OPR_DIV: case OPR_MOD: case OPR_POW:
+    codearith(fs, op, e1, e2);
+    break;
+  case OPR_EQ: case OPR_NE:
+  case OPR_LT: case OPR_LE: case OPR_GT: case OPR_GE:
+    codecomp(fs, op, e1, e2);
+    break;
+  default: lua_assert(0); break;
+  }
+}
+
+/* -- Lexer support ------------------------------------------------------- */
+
+/* Consume the next token if it matches tok; return 1 on match, else 0. */
+static int testnext(LexState *ls, LexToken tok)
+{
+  if (ls->token == tok) {
+    lj_lex_next(ls);
+    return 1;
+  }
+  return 0;
+}
+
+/* Require and consume token tok; raise a syntax error otherwise. */
+static void checknext(LexState *ls, LexToken tok)
+{
+  if (ls->token != tok)
+    err_token(ls, tok);
+  lj_lex_next(ls);
+}
+
+/* Require closing token `what' matching opening token `who' from `line'.
+** The error message mentions the opening line when it differs from the
+** current line (e.g. unterminated blocks).
+*/
+static void checkmatch(LexState *ls, LexToken what, LexToken who, BCLine line)
+{
+  if (!testnext(ls, what)) {
+    if (line == ls->linenumber) {
+      err_token(ls, what);
+    } else {
+      const char *swhat = lj_lex_token2str(ls, what);
+      const char *swho = lj_lex_token2str(ls, who);
+      lj_lex_error(ls, ls->token, LJ_ERR_XMATCH, swhat, swho, line);
+    }
+  }
+}
+
+/* Require a name token, consume it and return its interned string. */
+static GCstr *str_checkname(LexState *ls)
+{
+  GCstr *s;
+  if (ls->token != TK_name)
+    err_token(ls, TK_name);
+  s = strV(&ls->tokenval);
+  lj_lex_next(ls);
+  return s;
+}
+
+/* Initialize an expression descriptor with kind k and info field. */
+static void init_exp(ExpDesc *e, ExpKind k, uint32_t info)
+{
+  e->k = k;
+  e->u.s.info = info;
+  e->f = e->t = NO_JMP;
+}
+
+/* Parse a name token into a string-constant expression. */
+static void checkname(LexState *ls, ExpDesc *e)
+{
+  init_exp(e, VKSTR, 0);
+  e->u.sval = str_checkname(ls);
+}
+
+/* -- Variable handling --------------------------------------------------- */
+
+#define getlocvar(fs, i)	((fs)->pt->varinfo[(fs)->actvar[(i)]])
+
+/* Append a new entry for `name' to the prototype's variable-info array,
+** growing it on demand, and return its index.
+*/
+static BCReg registerlocalvar(LexState *ls, GCstr *name)
+{
+  FuncState *fs = ls->fs;
+  GCproto *pt = fs->pt;
+  if (LJ_UNLIKELY(fs->nlocvars >= pt->sizevarinfo)) {
+    MSize oldsize = pt->sizevarinfo;
+    checklimit(fs, fs->nlocvars, 32767, "local variables");
+    lj_mem_growvec(fs->L, pt->varinfo, pt->sizevarinfo, 32767, VarInfo);
+    /* Clear the newly grown tail so the GC never sees garbage names. */
+    while (oldsize < pt->sizevarinfo) pt->varinfo[oldsize++].name = NULL;
+  }
+  pt->varinfo[fs->nlocvars].name = name;
+  lj_gc_objbarrier(ls->L, pt, name);
+  return fs->nlocvars++;
+}
+
+/* Declare the n-th pending local variable `name'. It only becomes
+** active (visible) after a later adjustlocalvars() call.
+*/
+static void new_localvar(LexState *ls, GCstr *name, BCReg n)
+{
+  FuncState *fs = ls->fs;
+  checklimit(fs, fs->nactvar+n, LJ_MAX_LOCVAR, "local variables");
+  fs->actvar[fs->nactvar+n] = cast(uint16_t, registerlocalvar(ls, name));
+}
+
+/* Declare a local variable from a C string literal (e.g. "self"). */
+#define new_localvarliteral(ls,v,n) \
+  new_localvar(ls, lj_parse_keepstr(ls, "" v, sizeof(v)-1), n)
+
+/* Activate the last nvars declared locals, recording the current pc as
+** the start of their live range (for debug info).
+*/
+static void adjustlocalvars(LexState *ls, BCReg nvars)
+{
+  FuncState *fs = ls->fs;
+  fs->nactvar = cast_byte(fs->nactvar + nvars);
+  for (; nvars; nvars--)
+    getlocvar(fs, fs->nactvar - nvars).startpc = fs->pc;
+}
+
+/* Deactivate locals down to nesting level tolevel, closing their
+** debug-info live ranges at the current pc.
+*/
+static void removevars(LexState *ls, BCReg tolevel)
+{
+  FuncState *fs = ls->fs;
+  while (fs->nactvar > tolevel)
+    getlocvar(fs, --fs->nactvar).endpc = fs->pc;
+}
+
+/* Find or create an upvalue slot in fs for `name' referring to v (a
+** local or upvalue of the enclosing function). Returns the slot index.
+*/
+static uint32_t indexupvalue(FuncState *fs, GCstr *name, ExpDesc *v)
+{
+  uint32_t i;
+  GCproto *pt = fs->pt;
+  /* Reuse an existing slot with the same kind and index. */
+  for (i = 0; i < fs->nuv; i++) {
+    if (fs->upvalues[i].k == v->k && fs->upvalues[i].info == v->u.s.info) {
+      lua_assert(pt->uvname[i] == name);
+      return i;
+    }
+  }
+  /* Not found, create a new upvalue for this name. */
+  if (LJ_UNLIKELY(fs->nuv >= pt->sizeuvname)) {
+    MSize oldsize = pt->sizeuvname;
+    checklimit(fs, fs->nuv, LJ_MAX_UPVAL, "upvalues");
+    lj_mem_growvec(fs->L, pt->uvname, pt->sizeuvname, LJ_MAX_UPVAL, GCstr *);
+    while (oldsize < pt->sizeuvname) pt->uvname[oldsize++] = NULL;
+  }
+  pt->uvname[fs->nuv] = name;
+  lj_gc_objbarrier(fs->L, pt, name);
+  lua_assert(v->k == VLOCAL || v->k == VUPVAL);
+  fs->upvalues[fs->nuv].k = cast_byte(v->k);
+  fs->upvalues[fs->nuv].info = cast_byte(v->u.s.info);
+  return fs->nuv++;
+}
+
+/* Look up name n among the active locals of fs, innermost first.
+** Returns its register or (BCReg)-1 when not found.
+*/
+static BCReg searchvar(FuncState *fs, GCstr *n)
+{
+  int i;
+  for (i = fs->nactvar-1; i >= 0; i--) {
+    if (n == getlocvar(fs, i).name)  /* Interned strings: pointer compare. */
+      return (BCReg)i;
+  }
+  return (BCReg)-1;  /* Not found. */
+}
+
+/* Mark the block that declares the local at `level' as having upvalue
+** references, so its locals get closed (UCLO) when the block ends.
+*/
+static void markupval(FuncState *fs, BCReg level)
+{
+  FuncBlock *bl = fs->bl;
+  while (bl && bl->nactvar > level) bl = bl->previous;
+  if (bl) bl->upval = 1;
+}
+
+/* Resolve variable `name' relative to function level fs: a local in
+** this function, an upvalue from an enclosing one, or a global.
+** Returns 1 for global, 0 otherwise. `first' is nonzero only at the
+** innermost level, so outer-level hits are marked as upvalue sources.
+*/
+static int singlevaraux(FuncState *fs, GCstr *name, ExpDesc *e, int first)
+{
+  if (fs == NULL) {  /* no more levels? */
+    init_exp(e, VGLOBAL, 0);  /* default is global variable */
+    e->u.sval = name;
+    return 1;
+  } else {
+    BCReg reg = searchvar(fs, name);  /* look up at current level */
+    if ((int32_t)reg >= 0) {
+      init_exp(e, VLOCAL, reg);
+      if (!first)
+	markupval(fs, reg);  /* local will be used as an upval */
+      return 0;
+    } else {  /* not found at current level; try upper one */
+      if (singlevaraux(fs->prev, name, e, 0))  /* global? */
+	return 1;
+      e->u.s.info = indexupvalue(fs, name, e);  /* else was local or upvalue */
+      e->k = VUPVAL;  /* upvalue in this level */
+      return 0;
+    }
+  }
+}
+
+/* Resolve the current name token to a variable expression. */
+#define singlevar(ls, e) singlevaraux((ls)->fs, str_checkname(ls), (e), 1)
+
+/* Balance an assignment/local declaration of nvars variables against
+** nexps expressions. A trailing call/vararg (VCALL) is widened to
+** return the missing values; otherwise missing values are nil-filled
+** and e (the last expression) is pushed to the next register.
+*/
+static void adjust_assign(LexState *ls, BCReg nvars, BCReg nexps, ExpDesc *e)
+{
+  FuncState *fs = ls->fs;
+  int32_t extra = (int32_t)nvars - (int32_t)nexps;
+  if (e->k == VCALL) {
+    extra++;  /* includes call itself */
+    if (extra < 0) extra = 0;
+    setbc_b(bcptr(fs, e), extra+1);  /* Fix number of wanted results. */
+    if (extra > 1) reserveregs(fs, (BCReg)extra-1);
+  } else {
+    if (e->k != VVOID) exp2nextreg(fs, e);  /* close last expression */
+    if (extra > 0) {
+      BCReg reg = fs->freereg;
+      reserveregs(fs, (BCReg)extra);
+      nilK(fs, reg, (BCReg)extra);  /* Fill remaining targets with nil. */
+    }
+  }
+}
+
+/* -- Function handling --------------------------------------------------- */
+
+/* Forward declaration. */
+static void chunk(LexState *ls);
+
+/* Begin compiling a new (nested) function: allocate its prototype,
+** link fs into the LexState's FuncState chain, reset per-function
+** counters and anchor the constant table and prototype on the Lua
+** stack so the GC cannot collect them while parsing.
+*/
+static void open_func(LexState *ls, FuncState *fs)
+{
+  lua_State *L = ls->L;
+  GCproto *pt = lj_func_newproto(L);
+  fs->pt = pt;
+  fs->prev = ls->fs;  /* linked list of funcstates */
+  fs->ls = ls;
+  fs->L = L;
+  ls->fs = fs;
+  fs->pc = 0;
+  fs->lasttarget = 0;
+  fs->jpc = NO_JMP;  /* No pending jumps to the next instruction. */
+  fs->freereg = 0;
+  fs->nkgc = 0;
+  fs->nkn = 0;
+  fs->nlocvars = 0;
+  fs->nactvar = 0;
+  fs->nuv = 0;
+  fs->bl = NULL;
+  pt->chunkname = ls->chunkname;
+  pt->framesize = 2;  /* registers 0/1 are always valid */
+  fs->kt = lj_tab_new(L, 0, 0);  /* Scratch table mapping constants->indices. */
+  /* anchor table of constants and prototype (to avoid being collected) */
+  settabV(L, L->top, fs->kt);
+  incr_top(L);
+  setprotoV(L, L->top, pt);
+  incr_top(L);
+}
+
+/* Build the prototype's final constant arrays from the scratch constant
+** table fs->kt. GC constants grow downwards from pt->k.gc (accessed via
+** complemented indices), numbers grow upwards from pt->k.n; both share
+** one allocation. Array keys are number constants whose index is stored
+** as the value's low 32 bits; hash keys may be numbers or GC objects.
+*/
+static void collectk(FuncState *fs, GCproto *pt)
+{
+  GCtab *kt;
+  TValue *array;
+  Node *node;
+  BCReg nkgc;
+  MSize i, hmask, sizek;
+  GCRef *kstart;
+  checklimitgt(fs, fs->nkn, BCMAX_D+1, "constants");
+  checklimitgt(fs, fs->nkgc, BCMAX_D+1, "constants");
+  nkgc = round_nkgc(fs->nkgc);
+  sizek = (MSize)(nkgc*sizeof(MRef) + fs->nkn*sizeof(lua_Number));
+  kstart = lj_mem_newt(fs->L, sizek, GCRef);
+  if (nkgc) setgcrefnull(kstart[0]);  /* May be uninitialized otherwise. */
+  pt->k.gc = kstart + nkgc;  /* GC consts are indexed with negative offsets. */
+  pt->sizekn = fs->nkn;
+  pt->sizekgc = fs->nkgc;
+  kt = fs->kt;
+  array = tvref(kt->array);
+  for (i = 0; i < kt->asize; i++)
+    if (tvisnum(&array[i]))
+      pt->k.n[array[i].u32.lo] = cast_num(i);  /* Value's lo word = index. */
+  node = noderef(kt->node);
+  hmask = kt->hmask;
+  for (i = 0; i <= hmask; i++) {
+    Node *n = &node[i];
+    if (tvisnum(&n->val)) {
+      ptrdiff_t kidx = (ptrdiff_t)n->val.u32.lo;
+      if (tvisnum(&n->key)) {
+	pt->k.n[kidx] = numV(&n->key);
+      } else {
+	GCobj *o = gcV(&n->key);
+	setgcref(pt->k.gc[~kidx], o);  /* ~kidx: GC consts stored below k.gc. */
+	lj_gc_objbarrier(fs->L, pt, o);
+      }
+    }
+  }
+}
+
+/* Copy the collected upvalue descriptors into the prototype. Indices
+** referring to enclosing upvalues (VUPVAL) are stored complemented to
+** distinguish them from enclosing-local indices.
+*/
+static void collectuv(FuncState *fs, GCproto *pt)
+{
+  uint32_t i;
+  pt->uv = lj_mem_newvec(fs->L, fs->nuv, int16_t);
+  pt->sizeuv = fs->nuv;
+  for (i = 0; i < pt->sizeuv; i++) {
+    uint32_t v = fs->upvalues[i].info;
+    if (fs->upvalues[i].k == VUPVAL) v = ~v;
+    pt->uv[i] = (int16_t)v;
+  }
+}
+
+/* Ensure the function ends with a return. Appends RET0 (preceded by
+** UCLO if the function creates closures) unless the last instruction
+** already returns. When PROTO_FIXUP_RETURN is set, returns emitted
+** before the first FNEW must be rewritten into UCLO branches to a
+** trailing copy of the return, so upvalues get closed.
+*/
+static void finalret(FuncState *fs, GCproto *pt)
+{
+  BCPos lastpc = fs->pc;
+  if (lastpc > fs->lasttarget) {
+    switch (bc_op(pt->bc[lastpc-1])) {
+    case BC_CALLMT: case BC_CALLT:
+    case BC_RETM: case BC_RET: case BC_RET0: case BC_RET1:
+      goto suppress_return;  /* already got a return */
+    default: break;
+    }
+  }
+  if (fs->pt->flags & PROTO_HAS_FNEW)
+    emitAJ(fs, BC_UCLO, 0, 0);  /* Close upvalues before returning. */
+  emitAD(fs, BC_RET0, 0, 1);  /* final return */
+suppress_return:
+  /* may need to fixup returns encoded before first function was created */
+  if (fs->pt->flags & PROTO_FIXUP_RETURN) {
+    BCPos pc;
+    for (pc = 0; pc < lastpc; pc++) {
+      BCIns i = pt->bc[pc];
+      BCPos offset;
+      switch (bc_op(i)) {
+      case BC_CALLMT: case BC_CALLT:
+      case BC_RETM: case BC_RET: case BC_RET0: case BC_RET1:
+	offset = emitINS(fs, i)-(pc+1)+BCBIAS_J;  /* copy return ins */
+	if (offset > BCMAX_D)
+	  err_syntax(fs->ls, LJ_ERR_XFIXUP);
+	pt->bc[pc] = BCINS_AD(BC_UCLO, 0, offset);  /* replace w/ UCLO+branch */
+	break;
+      case BC_UCLO: return;  /* we're done */
+      default: break;
+      }
+    }
+  }
+}
+
+/* Finish compiling the current function: close scopes, append the final
+** return, shrink all prototype arrays to their final sizes, collect
+** constants and upvalues, fire the BC vmevent and pop the GC anchors.
+*/
+static void close_func(LexState *ls)
+{
+  lua_State *L = ls->L;
+  FuncState *fs = ls->fs;
+  GCproto *pt = fs->pt;
+  removevars(ls, 0);
+  finalret(fs, pt);
+  lj_mem_reallocvec(L, pt->bc, pt->sizebc, fs->pc, BCIns);
+  pt->sizebc = fs->pc;
+  collectk(fs, pt);
+  collectuv(fs, pt);
+  lj_mem_reallocvec(L, pt->lineinfo, pt->sizelineinfo, fs->pc, BCLine);
+  pt->sizelineinfo = fs->pc;
+  lj_mem_reallocvec(L, pt->varinfo, pt->sizevarinfo, fs->nlocvars, VarInfo);
+  pt->sizevarinfo = fs->nlocvars;
+  lj_mem_reallocvec(L, pt->uvname, pt->sizeuvname, fs->nuv, GCstr *);
+  pt->sizeuvname = fs->nuv;
+  lua_assert(fs->bl == NULL);
+  lj_vmevent_send(L, BC,
+    setprotoV(L, L->top++, pt);
+  );
+  ls->fs = fs->prev;
+  L->top -= 2;  /* Remove table and prototype from the stack. */
+  lua_assert(ls->fs != NULL || ls->token == TK_eof);
+  keep_token(ls);  /* Re-anchor last token. */
+}
+
+/* Entry point of the parser: compile a whole chunk from the lexer state
+** and return the finished main-chunk prototype.
+*/
+GCproto *lj_parse(LexState *ls)
+{
+  struct FuncState fs;
+  ls->level = 0;
+  open_func(ls, &fs);
+  fs.pt->flags |= PROTO_IS_VARARG;  /* Main chunk is always a vararg func. */
+  lj_lex_next(ls);  /* Read-ahead first token. */
+  chunk(ls);
+  if (ls->token != TK_eof)
+    err_token(ls, TK_eof);
+  fs.pt->lastlinedefined = ls->linenumber;
+  close_func(ls);
+  lua_assert(fs.prev == NULL);
+  lua_assert(fs.pt->sizeuv == 0);  /* Main chunk has no upvalues. */
+  lua_assert(ls->fs == NULL);
+  return fs.pt;
+}
+
+/* -- Expressions --------------------------------------------------------- */
+
+/* forward declaration */
+static void expr(LexState *ls, ExpDesc *v);
+
+/* Parse a field selector and index v by the name: field -> ['.'|':'] NAME */
+static void field(LexState *ls, ExpDesc *v)
+{
+  /* field -> ['.' | ':'] NAME */
+  FuncState *fs = ls->fs;
+  ExpDesc key;
+  exp2anyreg(fs, v);  /* Table must be in a register before indexing. */
+  lj_lex_next(ls);  /* skip the dot or colon */
+  checkname(ls, &key);
+  indexexp(fs, v, &key);
+}
+
+/* Parse a bracketed index expression into v: index -> '[' expr ']' */
+static void yindex(LexState *ls, ExpDesc *v)
+{
+  /* index -> '[' expr ']' */
+  lj_lex_next(ls);  /* skip the '[' */
+  expr(ls, v);
+  exp2val(ls->fs, v);
+  checknext(ls, ']');
+}
+
+/* Convert a constant expression descriptor into a TValue (used to fill
+** table-constructor templates). Only constant kinds are valid here.
+*/
+static void kexp2tv(TValue *v, ExpDesc *e)
+{
+  switch (e->k) {
+  case VKNIL: case VKFALSE: case VKTRUE: v->it = ~(int32_t)e->k; break;
+  case VKSTR:
+    setgcref(v->gcr, obj2gco(e->u.sval)); v->it = LJ_TSTR; break;
+  case VKNUM: setnumV(v, expnumV(e)); break;
+  default: lua_assert(0); break;
+  }
+}
+
+/* Parse a table constructor `{ ... }' into e. Constant key/value pairs
+** are collected into a template table (TNEW is patched to TDUP of the
+** template); other entries are stored with explicit bytecode. A trailing
+** call/vararg entry is turned into TSETM to append all of its results.
+*/
+static void constructor(LexState *ls, ExpDesc *e)
+{
+  FuncState *fs = ls->fs;
+  BCLine line = ls->linenumber;
+  GCtab *t = NULL;  /* Template table for constant entries (lazy). */
+  int vcall = 0, needarr = 0;  /* vcall: last entry is a multi-res call. */
+  int32_t narr = 1;  /* first array index */
+  uint32_t nhash = 0;  /* number of hash entries */
+  BCReg freg = fs->freereg;
+  BCPos pc = emitAD(fs, BC_TNEW, freg, 0);  /* Sizes patched at the end. */
+  init_exp(e, VNONRELOC, freg);
+  reserveregs(fs, 1);
+  freg++;
+  checknext(ls, '{');
+  while (ls->token != '}') {
+    ExpDesc key, val;
+    vcall = 0;
+    if (ls->token == '[') {
+      yindex(ls, &key);  /* already calls exp2val */
+      if (!isK(&key)) indexexp(fs, e, &key);
+      if (isnumK(&key) && expnumV(&key) == 0) needarr = 1; else nhash++;
+      checknext(ls, '=');
+    } else if (ls->token == TK_name && lj_lex_lookahead(ls) == '=') {
+      checkname(ls, &key);
+      checknext(ls, '=');
+      nhash++;
+    } else {
+      /* Positional entry: implicit integer key narr. */
+      init_exp(&key, VKNUM, 0);
+      setintV(&key.u.nval, narr);
+      narr++;
+      needarr = vcall = 1;
+    }
+    expr(ls, &val);
+    if (isKexp(&val) && isK(&key) && key.k != VKNIL) {
+      TValue k;
+      if (!t) {  /* create template table on demand */
+	BCReg kidx;
+	t = lj_tab_new(fs->L, 0, 0);
+	kidx = gcK(fs, obj2gco(t), LJ_TTAB);
+	fs->pt->bc[pc] = BCINS_AD(BC_TDUP, freg-1, kidx);
+      }
+      vcall = 0;
+      kexp2tv(&k, &key);
+      kexp2tv(lj_tab_set(fs->L, t, &k), &val);
+      if (val.k == VKSTR)
+	lj_gc_objbarriert(fs->L, t, val.u.sval);
+    } else {
+      if (isK(&key)) indexexp(fs, e, &key);
+      if (val.k != VCALL) vcall = 0;
+      storevar(fs, e, &val);
+    }
+    fs->freereg = freg;  /* Free all scratch registers used by the entry. */
+    if (!testnext(ls, ',') && !testnext(ls, ';')) break;
+  }
+  checkmatch(ls, '}', '{', line);
+  if (vcall) {
+    /* Rewrite last positional store into TSETM to take all call results. */
+    BCIns *i = &fs->pt->bc[fs->pc-1];
+    ExpDesc en;
+    lua_assert(bc_a(*i)==freg && bc_op(*i) == (narr>256?BC_TSETV:BC_TSETB));
+    init_exp(&en, VKNUM, 0);
+    setintV(&en.u.nval, narr-1);
+    if (narr > 256) { fs->pc--; i--; }  /* Also drop the key-load ins. */
+    *i = BCINS_AD(BC_TSETM, freg, numK(fs, &en));
+    setbc_b(i-1, 0);  /* The preceding call returns all results. */
+  }
+  if (pc == fs->pc-1) {  /* make expr relocable if possible */
+    e->u.s.info = pc;
+    fs->freereg--;
+    e->k = VRELOCABLE;
+  } else {
+    e->k = VNONRELOC;  /* indexexp may have changed it */
+  }
+  if (!t) {  /* Construct TNEW RD: hhhhhaaaaaaaaaaa. */
+    if (!needarr) narr = 0;
+    else if (narr < 3) narr = 3;
+    else if (narr > 0x7ff) narr = 0x7ff;
+    setbc_d(&fs->pt->bc[pc], (uint32_t)narr | (hsize2hbits(nhash) << 11));
+  }
+}
+
+/* Parse a function parameter list, declaring each name as a local and
+** flagging the prototype vararg when `...' terminates the list.
+*/
+static void parlist(LexState *ls)
+{
+  /* parlist -> [ param { `,' param } ] */
+  FuncState *fs = ls->fs;
+  GCproto *pt = fs->pt;
+  BCReg nparams = 0;
+  if (ls->token != ')') {  /* is `parlist' not empty? */
+    do {
+      switch (ls->token) {
+      case TK_name:  /* param -> NAME */
+	new_localvar(ls, str_checkname(ls), nparams++);
+	break;
+      case TK_dots:  /* param -> `...' */
+	lj_lex_next(ls);
+	pt->flags |= PROTO_IS_VARARG;
+	break;
+      default:
+	err_syntax(ls, LJ_ERR_XPARAM);
+	break;
+      }
+    } while (!(pt->flags & PROTO_IS_VARARG) && testnext(ls, ','));
+  }
+  adjustlocalvars(ls, nparams);
+  pt->numparams = cast_byte(fs->nactvar);
+  reserveregs(fs, fs->nactvar);  /* reserve register for parameters */
+}
+
+/* Parse a function body in a fresh FuncState and emit FNEW in the
+** enclosing function; e receives the relocatable FNEW. `needself'
+** prepends an implicit `self' parameter for method definitions.
+** Also records PROTO_FIXUP_RETURN at the first FNEW (see finalret()).
+*/
+static void body(LexState *ls, ExpDesc *e, int needself, BCLine line)
+{
+  /* body ->  `(' parlist `)' chunk END */
+  FuncState *fs, new_fs;
+  BCReg kidx;
+  open_func(ls, &new_fs);
+  new_fs.pt->linedefined = line;
+  checknext(ls, '(');
+  if (needself) {
+    new_localvarliteral(ls, "self", 0);
+    adjustlocalvars(ls, 1);
+  }
+  parlist(ls);
+  checknext(ls, ')');
+  chunk(ls);
+  new_fs.pt->lastlinedefined = ls->linenumber;
+  checkmatch(ls, TK_end, TK_function, line);
+  close_func(ls);
+  fs = ls->fs;
+  kidx = gcK(fs, obj2gco(new_fs.pt), LJ_TPROTO);
+  init_exp(e, VRELOCABLE, emitAD(fs, BC_FNEW, 0, kidx));
+  if (!(fs->pt->flags & PROTO_HAS_FNEW)) {
+    /* Returns emitted before the first FNEW need a UCLO fixup later. */
+    if (fs->pt->flags & PROTO_HAS_RETURN)
+      fs->pt->flags |= PROTO_FIXUP_RETURN;
+    fs->pt->flags |= PROTO_HAS_FNEW;
+  }
+}
+
+/* Parse a comma-separated expression list; all but the last expression
+** are pushed to consecutive registers. Returns the expression count;
+** the last expression is left open in *v.
+*/
+static BCReg explist1(LexState *ls, ExpDesc *v)
+{
+  /* explist1 -> expr { `,' expr } */
+  BCReg n = 1;  /* at least one expression */
+  expr(ls, v);
+  while (testnext(ls, ',')) {
+    exp2nextreg(ls->fs, v);
+    expr(ls, v);
+    n++;
+  }
+  return n;
+}
+
+/* Parse call arguments (paren list, table constructor or single string)
+** and emit the call. e holds the function (already at the frame base)
+** and becomes a VCALL expecting one result by default.
+*/
+static void funcargs(LexState *ls, ExpDesc *e)
+{
+  FuncState *fs = ls->fs;
+  ExpDesc args;
+  BCIns ins;
+  BCReg base;
+  BCLine line = ls->linenumber;
+  switch (ls->token) {
+    case '(': {  /* funcargs -> `(' [ explist1 ] `)' */
+      /* Reject `f\n(args)': an ambiguous statement split across lines. */
+      if (line != ls->lastline)
+	err_syntax(ls, LJ_ERR_XAMBIG);
+      lj_lex_next(ls);
+      if (ls->token == ')') {  /* arg list is empty? */
+	args.k = VVOID;
+      } else {
+	explist1(ls, &args);
+	if (args.k == VCALL)
+	  setbc_b(bcptr(fs, &args), 0);  /* Inner call returns all results. */
+      }
+      checkmatch(ls, ')', '(', line);
+      break;
+    }
+    case '{': {  /* funcargs -> constructor */
+      constructor(ls, &args);
+      break;
+    }
+    case TK_string: {  /* funcargs -> STRING */
+      init_exp(&args, VKSTR, 0);
+      args.u.sval = strV(&ls->tokenval);
+      lj_lex_next(ls);  /* must use `seminfo' before `next' */
+      break;
+    }
+    default: {
+      err_syntax(ls, LJ_ERR_XFUNARG);
+      return;
+    }
+  }
+  lua_assert(e->k == VNONRELOC);
+  base = e->u.s.info;  /* base register for call */
+  if (args.k == VCALL) {
+    /* Last arg is an open call: CALLM passes its results through. */
+    ins = BCINS_ABC(BC_CALLM, base, 2, args.u.s.aux - base - 1);
+  } else {
+    if (args.k != VVOID)
+      exp2nextreg(fs, &args);  /* close last argument */
+    ins = BCINS_ABC(BC_CALL, base, 2, fs->freereg - base);
+  }
+  init_exp(e, VCALL, emitINS(fs, ins));
+  e->u.s.aux = base;
+  fs->pt->lineinfo[fs->pc - 1] = line;  /* Attribute the call to `(' line. */
+  fs->freereg = base+1;  /* call removes function and arguments and leaves
+			    (unless changed) one result */
+}
+
+/* Parse a prefix expression: a variable name or a parenthesized
+** expression (which is truncated to a single value by dischargevars).
+*/
+static void prefixexp(LexState *ls, ExpDesc *v)
+{
+  /* prefixexp -> NAME | '(' expr ')' */
+  switch (ls->token) {
+    case '(': {
+      BCLine line = ls->linenumber;
+      lj_lex_next(ls);
+      expr(ls, v);
+      checkmatch(ls, ')', '(', line);
+      dischargevars(ls->fs, v);
+      return;
+    }
+    case TK_name: {
+      singlevar(ls, v);
+      return;
+    }
+    default: {
+      err_syntax(ls, LJ_ERR_XSYMBOL);
+      return;
+    }
+  }
+}
+
+/* Parse a primary expression: a prefix expression followed by any chain
+** of field accesses, index operations, method calls and call arguments.
+*/
+static void primaryexp(LexState *ls, ExpDesc *v)
+{
+  /* primaryexp ->
+	prefixexp { `.' NAME | `[' exp `]' | `:' NAME funcargs | funcargs } */
+  FuncState *fs = ls->fs;
+  prefixexp(ls, v);
+  for (;;) {
+    switch (ls->token) {
+      case '.':  /* field */
+	field(ls, v);
+	break;
+      case '[': {  /* `[' exp1 `]' */
+	ExpDesc key;
+	exp2anyreg(fs, v);
+	yindex(ls, &key);
+	indexexp(fs, v, &key);
+	break;
+      }
+      case ':': {  /* `:' NAME funcargs */
+	ExpDesc key;
+	lj_lex_next(ls);
+	checkname(ls, &key);
+	methodexp(fs, v, &key);  /* Sets up method + self in the frame. */
+	funcargs(ls, v);
+	break;
+      }
+      case '(': case TK_string: case '{':  /* funcargs */
+	exp2nextreg(fs, v);  /* Function must sit at the frame base. */
+	funcargs(ls, v);
+	break;
+      default: return;
+    }
+  }
+}
+
+/* Parse a simple expression: literal, vararg, table constructor,
+** function literal or primary expression.
+*/
+static void simpleexp(LexState *ls, ExpDesc *v)
+{
+  /* simpleexp -> NUMBER | STRING | NIL | true | false | ... |
+		  constructor | FUNCTION body | primaryexp */
+  switch (ls->token) {
+  case TK_number:
+    init_exp(v, VKNUM, 0);
+    setnumV(&v->u.nval, numV(&ls->tokenval));
+    break;
+  case TK_string:
+    init_exp(v, VKSTR, 0);
+    v->u.sval = strV(&ls->tokenval);
+    break;
+  case TK_nil:
+    init_exp(v, VKNIL, 0);
+    break;
+  case TK_true:
+    init_exp(v, VKTRUE, 0);
+    break;
+  case TK_false:
+    init_exp(v, VKFALSE, 0);
+    break;
+  case TK_dots: {  /* vararg */
+    FuncState *fs = ls->fs;
+    BCReg base;
+    /* `...' is only valid inside a vararg function. */
+    checkcond(ls, fs->pt->flags & PROTO_IS_VARARG, LJ_ERR_XDOTS);
+    reserveregs(fs, 1);
+    base = fs->freereg-1;
+    init_exp(v, VCALL, emitABC(fs, BC_VARG, base, 2, 1));
+    v->u.s.aux = base;
+    break;
+  }
+  case '{':  /* constructor */
+    constructor(ls, v);
+    return;
+  case TK_function:
+    lj_lex_next(ls);
+    body(ls, v, 0, ls->linenumber);
+    return;
+  default:
+    primaryexp(ls, v);
+    return;
+  }
+  lj_lex_next(ls);  /* Consume the single token handled above. */
+}
+
+/* Track recursion depth of the parser and error out on overflow. */
+static void enterlevel(LexState *ls)
+{
+  if (++ls->level >= LJ_MAX_XLEVEL)
+    lj_lex_error(ls, 0, LJ_ERR_XLEVELS);
+}
+
+/* Matching decrement for enterlevel(). */
+#define leavelevel(ls)	((ls)->level--)
+
+/* Map a token to a unary operator, or OPR_NOUNOPR if it is none. */
+static UnOpr getunopr(LexToken tok)
+{
+  switch (tok) {
+  case TK_not: return OPR_NOT;
+  case '-': return OPR_MINUS;
+  case '#': return OPR_LEN;
+  default: return OPR_NOUNOPR;
+  }
+}
+
+/* Map a token to a binary operator, or OPR_NOBINOPR if it is none. */
+static BinOpr getbinopr(LexToken tok)
+{
+  switch (tok) {
+  case '+': return OPR_ADD;
+  case '-': return OPR_SUB;
+  case '*': return OPR_MUL;
+  case '/': return OPR_DIV;
+  case '%': return OPR_MOD;
+  case '^': return OPR_POW;
+  case TK_concat: return OPR_CONCAT;
+  case TK_ne: return OPR_NE;
+  case TK_eq: return OPR_EQ;
+  case '<': return OPR_LT;
+  case TK_le: return OPR_LE;
+  case '>': return OPR_GT;
+  case TK_ge: return OPR_GE;
+  case TK_and: return OPR_AND;
+  case TK_or: return OPR_OR;
+  default: return OPR_NOBINOPR;
+  }
+}
+
+/* Binding priorities per binary operator, indexed by BinOpr (ORDER OPR).
+** left > right makes an operator right-associative (POW, CONCAT).
+** NOTE(review): the comment order on the comparison row may not match
+** the actual BinOpr enum order (codecomp's index math suggests LT GE
+** LE GT) -- harmless since all four entries are {3,3}, but confirm
+** against the ORDER OPR definition.
+*/
+static const struct {
+  uint8_t left;  /* left priority for each binary operator */
+  uint8_t right; /* right priority */
+} priority[] = {  /* ORDER OPR */
+  {6,6}, {6,6}, {7,7}, {7,7}, {7,7},	/* ADD SUB MUL DIV MOD */
+  {10,9}, {5,4},			/* POW CONCAT (right associative) */
+  {3,3}, {3,3},				/* EQ NE */
+  {3,3}, {3,3}, {3,3}, {3,3},		/* LT GE GT LE */
+  {2,2}, {1,1}				/* AND OR */
+};
+
+#define UNARY_PRIORITY	8  /* priority for unary operators */
+
+/*
+** subexpr -> (simpleexp | unop subexpr) { binop subexpr }
+** where `binop' is any binary operator with a priority higher than `limit'
+*/
+static BinOpr subexpr(LexState *ls, ExpDesc *v, uint32_t limit)
+{
+  BinOpr op;
+  UnOpr uop;
+  enterlevel(ls);
+  uop = getunopr(ls->token);
+  if (uop != OPR_NOUNOPR) {
+    lj_lex_next(ls);
+    subexpr(ls, v, UNARY_PRIORITY);
+    emit_unop(ls->fs, uop, v);
+  } else {
+    simpleexp(ls, v);
+  }
+  /* expand while operators have priorities higher than `limit' */
+  op = getbinopr(ls->token);
+  while (op != OPR_NOBINOPR && priority[op].left > limit) {
+    ExpDesc v2;
+    BinOpr nextop;
+    lj_lex_next(ls);
+    prepare_binop(ls->fs, op, v);
+    /* read sub-expression with higher priority */
+    nextop = subexpr(ls, &v2, priority[op].right);
+    emit_binop(ls->fs, op, v, &v2);
+    op = nextop;
+  }
+  leavelevel(ls);
+  return op;  /* return first untreated operator */
+}
+
+/* Parse a complete expression (starting at the lowest priority). */
+static void expr(LexState *ls, ExpDesc *v)
+{
+  subexpr(ls, v, 0);
+}
+
+/* Parse a condition expression; returns its false-jump list. */
+static BCPos condexpr(LexState *ls)
+{
+  /* cond -> exp */
+  ExpDesc v;
+  expr(ls, &v);  /* read condition */
+  if (v.k == VKNIL) v.k = VKFALSE;  /* `falses' are all equal here */
+  goiftrue(ls->fs, &v);
+  return v.f;
+}
+
+/* -- Scope handling ------------------------------------------------------ */
+
+/* Open a new block scope; isbreakable marks loop blocks (break targets). */
+static void enterblock(FuncState *fs, FuncBlock *bl, int isbreakable)
+{
+  bl->breaklist = NO_JMP;
+  bl->isbreakable = (uint8_t)isbreakable;
+  bl->nactvar = fs->nactvar;
+  bl->upval = 0;
+  bl->previous = fs->bl;
+  fs->bl = bl;
+  lua_assert(fs->freereg == fs->nactvar);
+}
+
+/* Close the current block scope: remove its locals, free their registers,
+** and either close upvalues (BC_UCLO) or patch pending break jumps here.
+*/
+static void leaveblock(FuncState *fs)
+{
+  FuncBlock *bl = fs->bl;
+  fs->bl = bl->previous;
+  removevars(fs->ls, bl->nactvar);
+  fs->freereg = fs->nactvar;  /* free registers */
+  lua_assert(bl->nactvar == fs->nactvar);
+  /* a block either controls scope or breaks (never both) */
+  lua_assert(!bl->isbreakable || !bl->upval);
+  if (bl->upval)
+    emitAJ(fs, BC_UCLO, bl->nactvar, 0);
+  else  /* avoid in upval case, it clears lasttarget and kills UCLO+JMP join */
+    patchtohere(fs, bl->breaklist);
+}
+
+/* Parse a non-breakable block (plain scope). */
+static void block(LexState *ls)
+{
+  /* block -> chunk */
+  FuncState *fs = ls->fs;
+  FuncBlock bl;
+  enterblock(fs, &bl, 0);
+  chunk(ls);
+  lua_assert(bl.breaklist == NO_JMP);
+  leaveblock(fs);
+}
+
+/* -- Statements ---------------------------------------------------------- */
+
+/*
+** structure to chain all variables in the left-hand side of an
+** assignment
+*/
+struct LHS_assign {
+  ExpDesc v;  /* variable (global, local, upvalue, or indexed) */
+  struct LHS_assign *prev;
+};
+
+/*
+** check whether, in an assignment to a local variable, the local variable
+** is needed in a previous assignment (to a table). If so, save original
+** local value in a safe place and use this safe copy in the previous
+** assignment.
+*/
+static void check_conflict(LexState *ls, struct LHS_assign *lh,
+			   const ExpDesc *v)
+{
+  FuncState *fs = ls->fs;
+  BCReg reg = fs->freereg;  /* eventual position to save local variable */
+  int conflict = 0;
+  for (; lh; lh = lh->prev) {
+    if (lh->v.k == VINDEXED) {
+      if (lh->v.u.s.info == v->u.s.info) {  /* conflict? */
+	conflict = 1;
+	lh->v.u.s.info = reg;  /* previous assignment will use safe copy */
+      }
+      if (lh->v.u.s.aux == v->u.s.info) {  /* conflict? */
+	conflict = 1;
+	lh->v.u.s.aux = reg;  /* previous assignment will use safe copy */
+      }
+    }
+  }
+  if (conflict) {
+    emitAD(fs, BC_MOV, reg, v->u.s.info);  /* make copy */
+    reserveregs(fs, 1);
+  }
+}
+
+/* Parse a (possibly multiple) assignment; recurses once per extra LHS
+** target, with nvars counting the targets collected so far.
+*/
+static void assignment(LexState *ls, struct LHS_assign *lh, BCReg nvars)
+{
+  ExpDesc e;
+  checkcond(ls, VLOCAL <= lh->v.k && lh->v.k <= VINDEXED, LJ_ERR_XSYNTAX);
+  if (testnext(ls, ',')) {  /* assignment -> `,' primaryexp assignment */
+    struct LHS_assign nv;
+    nv.prev = lh;
+    primaryexp(ls, &nv.v);
+    if (nv.v.k == VLOCAL)
+      check_conflict(ls, lh, &nv.v);
+    checklimit(ls->fs, ls->level + nvars, LJ_MAX_XLEVEL, "variable names");
+    assignment(ls, &nv, nvars+1);
+  } else {  /* assignment -> `=' explist1 */
+    BCReg nexps;
+    checknext(ls, '=');
+    nexps = explist1(ls, &e);
+    if (nexps == nvars) {
+      if (e.k == VCALL) {
+	if (bc_op(*bcptr(ls->fs, &e)) == BC_VARG) {
+	  ls->fs->freereg--;
+	  e.k = VRELOCABLE;
+	} else {
+	  e.u.s.info = e.u.s.aux;
+	  e.k = VNONRELOC;
+	}
+      }
+      storevar(ls->fs, &lh->v, &e);
+      return;
+    }
+    adjust_assign(ls, nvars, nexps, &e);
+    if (nexps > nvars)
+      ls->fs->freereg -= nexps - nvars;  /* remove extra values */
+  }
+  init_exp(&e, VNONRELOC, ls->fs->freereg-1);  /* default assignment */
+  storevar(ls->fs, &lh->v, &e);
+}
+
+/* Parse a break: close upvalues of any scopes crossed on the way out,
+** then add a jump to the enclosing breakable block's break list.
+*/
+static void breakstat(LexState *ls)
+{
+  FuncState *fs = ls->fs;
+  FuncBlock *bl = fs->bl;
+  int upval = 0;
+  while (bl && !bl->isbreakable) {
+    upval |= bl->upval;
+    bl = bl->previous;
+  }
+  if (!bl)
+    err_syntax(ls, LJ_ERR_XBREAK);
+  if (upval)
+    emitAJ(fs, BC_UCLO, bl->nactvar, 0);
+  concatjumps(fs, &bl->breaklist, emit_jump(fs));
+}
+
+static void whilestat(LexState *ls, BCLine line)
+{
+  /* whilestat -> WHILE cond DO block END */
+  FuncState *fs = ls->fs;
+  BCPos start, loop, condexit;
+  FuncBlock bl;
+  lj_lex_next(ls);  /* skip WHILE */
+  start = fs->lasttarget = fs->pc;
+  condexit = condexpr(ls);
+  enterblock(fs, &bl, 1);
+  checknext(ls, TK_do);
+  loop = emitAD(fs, BC_LOOP, fs->nactvar, 0);
+  block(ls);
+  patchlist(fs, emit_jump(fs), start);  /* jump back to the condition */
+  checkmatch(ls, TK_end, TK_while, line);
+  leaveblock(fs);
+  patchtohere(fs, condexit);  /* false conditions finish the loop */
+  fixjump(fs, loop, fs->pc);
+}
+
+static void repeatstat(LexState *ls, BCLine line)
+{
+  /* repeatstat -> REPEAT block UNTIL cond */
+  FuncState *fs = ls->fs;
+  BCPos loop = fs->lasttarget = fs->pc;
+  BCPos condexit;
+  FuncBlock bl1, bl2;
+  enterblock(fs, &bl1, 1);  /* loop block */
+  enterblock(fs, &bl2, 0);  /* scope block */
+  lj_lex_next(ls);  /* skip REPEAT */
+  emitAD(fs, BC_LOOP, fs->nactvar, 0);
+  chunk(ls);
+  checkmatch(ls, TK_until, TK_repeat, line);
+  condexit = condexpr(ls);  /* read condition (inside scope block) */
+  if (!bl2.upval) {  /* no upvalues? */
+    leaveblock(fs);  /* finish scope */
+  } else {  /* complete semantics when there are upvalues */
+    breakstat(ls);  /* if condition then break */
+    patchtohere(fs, condexit);  /* else... */
+    leaveblock(fs);  /* finish scope... */
+    condexit = emit_jump(fs);  /* and repeat */
+  }
+  patchlist(fs, condexit, loop);  /* close the loop */
+  fixjump(fs, loop, fs->pc);
+  leaveblock(fs);  /* finish loop */
+}
+
+/* Parse a single expression and force its value into the next register. */
+static void exp1(LexState *ls)
+{
+  ExpDesc e;
+  expr(ls, &e);
+  exp2nextreg(ls->fs, &e);
+}
+
+/* Parse the body shared by numeric and generic for loops.
+** isnum: 1 for the numeric for (FORI/FORL), 0 for generic (ITERC/ITERL).
+*/
+static void forbody(LexState *ls, BCReg base, BCLine line, BCReg nvars,
+		    int isnum)
+{
+  /* forbody -> DO block */
+  FuncBlock bl;
+  FuncState *fs = ls->fs;
+  BCPos loop, loopend;
+  adjustlocalvars(ls, 3);  /* control variables */
+  checknext(ls, TK_do);
+  loop = isnum ? emitAJ(fs, BC_FORI, base, NO_JMP) :
+		 emitAJ(fs, BC_JMP, fs->freereg, NO_JMP);
+  enterblock(fs, &bl, 0);  /* scope for declared variables */
+  adjustlocalvars(ls, nvars);
+  reserveregs(fs, nvars);
+  block(ls);
+  leaveblock(fs);  /* end of scope for declared variables */
+  if (isnum) {
+    loopend = emitAJ(fs, BC_FORL, base, NO_JMP);
+    fixjump(fs, loop, fs->pc);
+  } else {
+    fixjump(fs, loop, fs->pc);
+    emitABC(fs, BC_ITERC, base+3, nvars+1, 2+1);
+    loopend = emitAJ(fs, BC_ITERL, base+3, NO_JMP);
+    fs->pt->lineinfo[loopend-1] = line;
+  }
+  fs->pt->lineinfo[loopend] = line;  /* pretend last op starts the loop */
+  fixjump(fs, loopend, loop+1);
+}
+
+static void fornum(LexState *ls, GCstr *varname, BCLine line)
+{
+  /* fornum -> NAME = exp1,exp1[,exp1] forbody */
+  FuncState *fs = ls->fs;
+  BCReg base = fs->freereg;
+  /* Hidden control variables occupy the three slots below the index var. */
+  new_localvarliteral(ls, "(for index)", FORL_IDX);
+  new_localvarliteral(ls, "(for limit)", FORL_STOP);
+  new_localvarliteral(ls, "(for step)", FORL_STEP);
+  new_localvar(ls, varname, FORL_EXT);
+  checknext(ls, '=');
+  exp1(ls);  /* initial value */
+  checknext(ls, ',');
+  exp1(ls);  /* limit */
+  if (testnext(ls, ',')) {
+    exp1(ls);  /* optional step */
+  } else {  /* default step = 1 */
+    emitAD(fs, BC_KSHORT, fs->freereg, 1);
+    reserveregs(fs, 1);
+  }
+  forbody(ls, base, line, 1, 1);
+}
+
+static void forlist(LexState *ls, GCstr *indexname)
+{
+  /* forlist -> NAME {,NAME} IN explist1 forbody */
+  FuncState *fs = ls->fs;
+  ExpDesc e;
+  BCReg nvars = 0;
+  BCLine line;
+  BCReg base = fs->freereg;
+  /* create control variables */
+  new_localvarliteral(ls, "(for generator)", nvars++);
+  new_localvarliteral(ls, "(for state)", nvars++);
+  new_localvarliteral(ls, "(for control)", nvars++);
+  /* create declared variables */
+  new_localvar(ls, indexname, nvars++);
+  while (testnext(ls, ','))
+    new_localvar(ls, str_checkname(ls), nvars++);
+  checknext(ls, TK_in);
+  line = ls->linenumber;
+  adjust_assign(ls, 3, explist1(ls, &e), &e);
+  checkframe(fs, 3);  /* extra space to call generator */
+  forbody(ls, base, line, nvars - 3, 0);
+}
+
+static void forstat(LexState *ls, BCLine line)
+{
+  /* forstat -> FOR (fornum | forlist) END */
+  FuncState *fs = ls->fs;
+  GCstr *varname;
+  FuncBlock bl;
+  enterblock(fs, &bl, 1);  /* scope for loop and control variables */
+  lj_lex_next(ls);  /* skip `for' */
+  varname = str_checkname(ls);  /* first variable name */
+  switch (ls->token) {
+    case '=': fornum(ls, varname, line); break;
+    case ',': case TK_in: forlist(ls, varname); break;
+    default: err_syntax(ls, LJ_ERR_XFOR);
+  }
+  checkmatch(ls, TK_end, TK_for, line);
+  leaveblock(fs);  /* loop scope (`break' jumps to this point) */
+}
+
+static BCPos test_then_block(LexState *ls)
+{
+  /* test_then_block -> [IF | ELSEIF] cond THEN block */
+  BCPos condexit;
+  lj_lex_next(ls);  /* skip IF or ELSEIF */
+  condexit = condexpr(ls);
+  checknext(ls, TK_then);
+  block(ls);  /* `then' part */
+  return condexit;
+}
+
+static void ifstat(LexState *ls, BCLine line)
+{
+  /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */
+  FuncState *fs = ls->fs;
+  BCPos flist;
+  BCPos escapelist = NO_JMP;  /* jumps past the remaining branches */
+  flist = test_then_block(ls);  /* IF cond THEN block */
+  while (ls->token == TK_elseif) {
+    concatjumps(fs, &escapelist, emit_jump(fs));
+    patchtohere(fs, flist);
+    flist = test_then_block(ls);  /* ELSEIF cond THEN block */
+  }
+  if (ls->token == TK_else) {
+    concatjumps(fs, &escapelist, emit_jump(fs));
+    patchtohere(fs, flist);
+    lj_lex_next(ls);  /* skip ELSE (after patch, for correct line info) */
+    block(ls);  /* `else' part */
+  } else {
+    concatjumps(fs, &escapelist, flist);
+  }
+  patchtohere(fs, escapelist);
+  patchtohere(fs, escapelist);
+  checkmatch(ls, TK_end, TK_if, line);
+}
+
+/* Parse `local function NAME body'. The local is declared before the body
+** is parsed, so the name is visible inside it; debug info only sees the
+** variable after the closure has been stored.
+*/
+static void localfunc(LexState *ls)
+{
+  ExpDesc v, b;
+  FuncState *fs = ls->fs;
+  new_localvar(ls, str_checkname(ls), 0);
+  init_exp(&v, VLOCAL, fs->freereg);
+  reserveregs(fs, 1);
+  adjustlocalvars(ls, 1);
+  body(ls, &b, 0, ls->linenumber);
+  storevar(fs, &v, &b);
+  /* debug information will only see the variable after this point! */
+  getlocvar(fs, fs->nactvar - 1).startpc = fs->pc;
+}
+
+static void localstat(LexState *ls)
+{
+  /* stat -> LOCAL NAME {`,' NAME} [`=' explist1] */
+  BCReg nvars = 0;
+  BCReg nexps;
+  ExpDesc e;
+  do {
+    new_localvar(ls, str_checkname(ls), nvars++);
+  } while (testnext(ls, ','));
+  if (testnext(ls, '=')) {
+    nexps = explist1(ls, &e);
+  } else {
+    e.k = VVOID;
+    nexps = 0;
+  }
+  adjust_assign(ls, nvars, nexps, &e);
+  adjustlocalvars(ls, nvars);
+}
+
+/* Parse a function name; returns 1 if it is a method (`:' form). */
+static int func_name(LexState *ls, ExpDesc *v)
+{
+  /* func_name -> NAME {field} [`:' NAME] */
+  int needself = 0;
+  singlevar(ls, v);
+  while (ls->token == '.')
+    field(ls, v);
+  if (ls->token == ':') {
+    needself = 1;
+    field(ls, v);
+  }
+  return needself;
+}
+
+static void funcstat(LexState *ls, BCLine line)
+{
+  /* funcstat -> FUNCTION func_name body */
+  FuncState *fs;
+  int needself;
+  ExpDesc v, b;
+  lj_lex_next(ls);  /* skip FUNCTION */
+  needself = func_name(ls, &v);
+  body(ls, &b, needself, line);
+  fs = ls->fs;
+  storevar(fs, &v, &b);
+  fs->pt->lineinfo[fs->pc - 1] = line;
+}
+
+static void exprstat(LexState *ls)
+{
+  /* stat -> func | assignment */
+  FuncState *fs = ls->fs;
+  struct LHS_assign v;
+  primaryexp(ls, &v.v);
+  if (v.v.k == VCALL) {  /* stat -> func */
+    setbc_b(bcptr(fs, &v.v), 1);  /* call statement uses no results */
+  } else {  /* stat -> assignment */
+    v.prev = NULL;
+    assignment(ls, &v, 1);
+  }
+}
+
+/* Check whether a token ends a block (cannot start another statement). */
+static int block_follow(LexToken token)
+{
+  switch (token) {
+  case TK_else: case TK_elseif: case TK_end: case TK_until: case TK_eof:
+    return 1;
+  default:
+    return 0;
+  }
+}
+
+static void retstat(LexState *ls)
+{
+  /* stat -> RETURN explist */
+  BCIns ins;
+  FuncState *fs = ls->fs;
+  lj_lex_next(ls);  /* skip RETURN */
+  fs->pt->flags |= PROTO_HAS_RETURN;
+  if (block_follow(ls->token) || ls->token == ';') {
+    ins = BCINS_AD(BC_RET0, 0, 1);  /* return no values */
+  } else {
+    ExpDesc e;
+    BCReg nret = explist1(ls, &e);  /* optional return values */
+    if (nret == 1) {
+      if (e.k == VCALL) {
+	BCIns *i = bcptr(fs, &e);
+	/* It doesn't pay off to add BC_VARGT just for 'return ...'. */
+	if (bc_op(*i) == BC_VARG) goto notailcall;
+	fs->pc--;  /* Drop the CALL and morph it into a tail call. */
+	ins = BCINS_AD(bc_op(*i)-BC_CALL+BC_CALLT, bc_a(*i), bc_c(*i));
+      } else {
+	ins = BCINS_AD(BC_RET1, exp2anyreg(fs, &e), 2);
+      }
+    } else {
+      if (e.k == VCALL) {
+      notailcall:
+	setbc_b(bcptr(fs, &e), 0);  /* multi-result call feeds RETM */
+	ins = BCINS_AD(BC_RETM, fs->nactvar, e.u.s.aux - fs->nactvar);
+      } else {
+	exp2nextreg(fs, &e);  /* values must go to the `stack' */
+	ins = BCINS_AD(BC_RET, fs->nactvar, nret+1);
+      }
+    }
+  }
+  if (fs->pt->flags & PROTO_HAS_FNEW)
+    emitAJ(fs, BC_UCLO, 0, 0);  /* May need to close upvalues first. */
+  emitINS(fs, ins);
+}
+
+/* Parse a statement. Returns 1 if it must be the last one in a chunk. */
+static int statement(LexState *ls)
+{
+  BCLine line = ls->linenumber;  /* may be needed for error messages */
+  switch (ls->token) {
+  case TK_if:
+    ifstat(ls, line);
+    return 0;
+  case TK_while:
+    whilestat(ls, line);
+    return 0;
+  case TK_do:
+    lj_lex_next(ls);  /* skip DO */
+    block(ls);
+    checkmatch(ls, TK_end, TK_do, line);
+    return 0;
+  case TK_for:
+    forstat(ls, line);
+    return 0;
+  case TK_repeat:
+    repeatstat(ls, line);
+    return 0;
+  case TK_function:
+    funcstat(ls, line);
+    return 0;
+  case TK_local:
+    lj_lex_next(ls);  /* skip LOCAL */
+    if (testnext(ls, TK_function))  /* local function? */
+      localfunc(ls);
+    else
+      localstat(ls);
+    return 0;
+  case TK_return:
+    retstat(ls);
+    return 1;  /* must be last statement */
+  case TK_break:
+    lj_lex_next(ls);  /* skip BREAK */
+    breakstat(ls);
+    return 1;  /* must be last statement */
+  default:
+    exprstat(ls);
+    return 0;
+  }
+}
+
+/* Parse a list of statements; stops at a block-ending token. */
+static void chunk(LexState *ls)
+{
+  /* chunk -> { stat [`;'] } */
+  int islast = 0;
+  enterlevel(ls);
+  while (!islast && !block_follow(ls->token)) {
+    islast = statement(ls);
+    testnext(ls, ';');
+    lua_assert(ls->fs->pt->framesize >= ls->fs->freereg &&
+	       ls->fs->freereg >= ls->fs->nactvar);
+    ls->fs->freereg = ls->fs->nactvar;  /* free registers */
+  }
+  leavelevel(ls);
+}
+

+ 15 - 0
src/lj_parse.h

@@ -0,0 +1,15 @@
+/*
+** Lua parser (source code -> bytecode).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_PARSE_H
+#define _LJ_PARSE_H
+
+#include "lj_obj.h"
+#include "lj_lex.h"
+
+/* Parse a chunk from the given lexer state; returns its prototype. */
+LJ_FUNC GCproto *lj_parse(LexState *ls);
+/* NOTE(review): presumably anchors a string so the GC keeps it alive
+** during parsing -- confirm against the definition in lj_parse.c.
+*/
+LJ_FUNC GCstr *lj_parse_keepstr(LexState *ls, const char *str, size_t l);
+
+#endif

+ 2136 - 0
src/lj_record.c

@@ -0,0 +1,2136 @@
+/*
+** Trace recorder (bytecode -> SSA IR).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_record_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_bc.h"
+#include "lj_ff.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+#include "lj_record.h"
+#include "lj_snap.h"
+#include "lj_asm.h"
+#include "lj_dispatch.h"
+#include "lj_vm.h"
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)			(&J->cur.ir[(ref)])
+
+/* Pass IR on to next optimization in chain (FOLD). */
+#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
+
+/* Emit raw IR without passing through optimizations. */
+#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
+
+/* Context for recording an indexed load/store.
+** The *v fields mirror the interpreter's runtime values for the
+** corresponding IR references, so recording can follow actual control flow.
+*/
+typedef struct RecordIndex {
+  TValue tabv;		/* Runtime value of table (or indexed object). */
+  TValue keyv;		/* Runtime value of key. */
+  TValue valv;		/* Runtime value of stored value. */
+  TValue mobjv;		/* Runtime value of metamethod object. */
+  GCtab *mtv;		/* Runtime value of metatable object. */
+  cTValue *oldv;	/* Runtime value of previously stored value. */
+  TRef tab;		/* Table (or indexed object) reference. */
+  TRef key;		/* Key reference. */
+  TRef val;		/* Value reference for a store or 0 for a load. */
+  TRef mt;		/* Metatable reference. */
+  TRef mobj;		/* Metamethod object reference. */
+  int idxchain;		/* Index indirections left or 0 for raw lookup. */
+} RecordIndex;
+
+/* Requested results from rec_call(). */
+enum {
+  /* Non-negative numbers are number of requested results. */
+  CALLRES_MULTI = -1,		/* Return multiple results. */
+  CALLRES_TAILCALL = -2,	/* Tail call. */
+  CALLRES_PENDING = -3,		/* Call is pending, no results yet. */
+  CALLRES_CONT = -4		/* Continuation call. */
+};
+
+/* Forward declarations (defined further down in this file). */
+static TRef rec_idx(jit_State *J, RecordIndex *ix);
+static int rec_call(jit_State *J, BCReg func, int cres, int nargs);
+
+/* -- Sanity checks ------------------------------------------------------- */
+
+#ifdef LUA_USE_ASSERT
+/* Sanity check the whole IR -- sloooow.
+** Walks instructions from last to first; constants live below REF_BIAS,
+** so operand ordering constraints flip around that boundary.
+*/
+static void rec_check_ir(jit_State *J)
+{
+  IRRef i, nins = J->cur.nins, nk = J->cur.nk;
+  lua_assert(nk <= REF_BIAS && nins >= REF_BIAS && nins < 65536);
+  for (i = nins-1; i >= nk; i--) {
+    IRIns *ir = IR(i);
+    uint32_t mode = lj_ir_mode[ir->o];
+    IRRef op1 = ir->op1;
+    IRRef op2 = ir->op2;
+    switch (irm_op1(mode)) {
+    case IRMnone: lua_assert(op1 == 0); break;
+    case IRMref: lua_assert(op1 >= nk);
+      lua_assert(i >= REF_BIAS ? op1 < i : op1 > i); break;
+    case IRMlit: break;
+    case IRMcst: lua_assert(i < REF_BIAS); continue;
+    }
+    switch (irm_op2(mode)) {
+    case IRMnone: lua_assert(op2 == 0); break;
+    case IRMref: lua_assert(op2 >= nk);
+      lua_assert(i >= REF_BIAS ? op2 < i : op2 > i); break;
+    case IRMlit: break;
+    case IRMcst: lua_assert(0); break;
+    }
+    if (ir->prev) {
+      /* Chain links must point backwards and stay within the same opcode. */
+      lua_assert(ir->prev >= nk);
+      lua_assert(i >= REF_BIAS ? ir->prev < i : ir->prev > i);
+      lua_assert(IR(ir->prev)->o == ir->o);
+    }
+  }
+}
+
+/* Sanity check the slots: every live slot ref must be a valid IR ref
+** whose IR type matches the tagged type of the TRef.
+*/
+static void rec_check_slots(jit_State *J)
+{
+  BCReg s, nslots = J->baseslot + J->maxslot;
+  lua_assert(J->baseslot >= 1 && J->baseslot < LJ_MAX_JSLOTS);
+  lua_assert(nslots < LJ_MAX_JSLOTS);
+  for (s = 0; s < nslots; s++) {
+    TRef tr = J->slot[s];
+    if (tr) {
+      IRRef ref = tref_ref(tr);
+      lua_assert(ref >= J->cur.nk && ref < J->cur.nins);
+      lua_assert(irt_t(IR(ref)->t) == tref_t(tr));
+    }
+  }
+}
+#endif
+
+/* -- Type handling and specialization ------------------------------------ */
+
+/* Note: these functions return tagged references (TRef). */
+
+/* Specialize a slot to a specific type. Note: slot can be negative! */
+static TRef sloadt(jit_State *J, int32_t slot, IRType t, int mode)
+{
+  /* No guard, since none of the callers need a type-checking SLOAD. */
+  TRef ref = emitir_raw(IRT(IR_SLOAD, t), (int32_t)J->baseslot+slot, mode);
+  J->base[slot] = ref;
+  return ref;
+}
+
+/* Specialize a slot to the runtime type. Note: slot can be negative! */
+static TRef sload(jit_State *J, int32_t slot)
+{
+  IRType t = itype2irt(&J->L->base[slot]);
+  /* Guarded SLOAD: the trace stays valid only for this runtime type. */
+  TRef ref = emitir_raw(IRTG(IR_SLOAD, t), (int32_t)J->baseslot+slot, 0);
+  if (irtype_ispri(t)) ref = TREF_PRI(t);  /* Canonicalize primitive refs. */
+  J->base[slot] = ref;
+  return ref;
+}
+
+/* Get TRef from slot. Load slot and specialize if not done already. */
+#define getslot(J, s)	(J->base[(s)] ? J->base[(s)] : sload(J, (int32_t)(s)))
+
+/* Get TRef for current function. */
+static TRef getcurrf(jit_State *J)
+{
+  if (J->base[-1]) {
+    IRIns *ir = IR(tref_ref(J->base[-1]));
+    if (ir->o == IR_FRAME)  /* Shortcut if already specialized. */
+      return TREF(ir->op2, IRT_FUNC);  /* Return TRef of KFUNC. */
+    return J->base[-1];
+  } else {
+    lua_assert(J->baseslot == 1);
+    return sloadt(J, -1, IRT_FUNC, IRSLOAD_READONLY);
+  }
+}
+
+/* Compare for raw object equality.
+** Returns 0 if the objects are the same.
+** Returns 1 if they are different, but the same type.
+** Returns 2 for two different types.
+** Comparisons between primitives always return 1 -- no caller cares about it.
+*/
+static int rec_objcmp(jit_State *J, TRef a, TRef b, cTValue *av, cTValue *bv)
+{
+  int diff = !lj_obj_equal(av, bv);
+  if (!tref_isk2(a, b)) {  /* Shortcut, also handles primitives. */
+    IRType ta = tref_type(a);
+    IRType tb = tref_type(b);
+    if (ta != tb) {
+      /* Widen mixed number/int comparisons to number/number comparison. */
+      if (ta == IRT_INT && tb == IRT_NUM) {
+	a = emitir(IRTN(IR_TONUM), a, 0);
+	ta = IRT_NUM;
+      } else if (ta == IRT_NUM && tb == IRT_INT) {
+	b = emitir(IRTN(IR_TONUM), b, 0);
+	/* (tb need not be updated -- only ta is used below.) */
+      } else {
+	return 2;  /* Two different types are never equal. */
+      }
+    }
+    /* Guard on the observed (in)equality so the trace stays consistent. */
+    emitir(IRTG(diff ? IR_NE : IR_EQ, ta), a, b);
+  }
+  return diff;
+}
+
+/* -- Record loop ops ----------------------------------------------------- */
+
+/* Loop event. */
+typedef enum {
+  LOOPEV_LEAVE,		/* Loop is left or not entered. */
+  LOOPEV_ENTER		/* Loop is entered. */
+} LoopEvent;
+
+/* Canonicalize slots: convert integers to numbers. */
+static void canonicalize_slots(jit_State *J)
+{
+  BCReg s;
+  for (s = J->baseslot+J->maxslot-1; s >= 1; s--) {
+    TRef tr = J->slot[s];
+    if (tref_isinteger(tr)) {
+      IRIns *ir = IR(tref_ref(tr));
+      /* Skip readonly SLOADs. */
+      if (!(ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_READONLY)))
+	J->slot[s] = emitir(IRTN(IR_TONUM), tr, 0);
+    }
+  }
+}
+
+/* Stop recording and link the trace to lnk. */
+static void rec_stop(jit_State *J, TraceNo lnk)
+{
+  lj_trace_end(J);
+  J->cur.link = (uint16_t)lnk;
+  if (lnk == J->curtrace) {  /* Looping back? */
+    if ((J->flags & JIT_F_OPT_LOOP))  /* Shall we try to create a loop? */
+      goto nocanon;  /* Do not canonicalize or we lose the narrowing. */
+    if (J->cur.root)  /* Otherwise ensure we always link to the root trace. */
+      J->cur.link = J->cur.root;
+  }
+  canonicalize_slots(J);
+nocanon:
+  /* Note: all loop ops must set J->pc to the following instruction! */
+  lj_snap_add(J);  /* Add loop snapshot. */
+  J->needsnap = 0;
+  J->mergesnap = 1;  /* In case recording continues. */
+}
+
+/* Peek before FORI to find a const initializer, otherwise load from slot. */
+static TRef fori_arg(jit_State *J, const BCIns *pc, BCReg slot, IRType t)
+{
+  /* A store to slot-1 means there's no conditional assignment for slot. */
+  if (bc_a(pc[-1]) == slot-1 && bcmode_a(bc_op(pc[-1])) == BCMdst) {
+    BCIns ins = pc[0];
+    if (bc_a(ins) == slot) {
+      if (bc_op(ins) == BC_KSHORT) {
+	int32_t k = (int32_t)(int16_t)bc_d(ins);
+	if (t == IRT_INT)
+	  return lj_ir_kint(J, k);
+	else
+	  return lj_ir_knum(J, cast_num(k));
+      } else if (bc_op(ins) == BC_KNUM) {
+	lua_Number n = J->pt->k.n[bc_d(ins)];
+	if (t == IRT_INT)
+	  return lj_ir_kint(J, lj_num2int(n));
+	else
+	  return lj_ir_knum(J, n);
+      }
+    }
+  }
+  if (J->base[slot])
+    return J->base[slot];
+  else
+    return sloadt(J, (int32_t)slot, t, IRSLOAD_READONLY|IRSLOAD_INHERIT);
+}
+
+/* Simulate the runtime behavior of the FOR loop iterator.
+** It's important to exactly reproduce the semantics of the interpreter.
+*/
+static LoopEvent for_iter(jit_State *J, IROp *op, BCReg ra, int isforl)
+{
+  cTValue *forbase = &J->L->base[ra];
+  lua_Number stopv = numV(&forbase[FORL_STOP]);
+  lua_Number idxv = numV(&forbase[FORL_IDX]);
+  if (isforl)
+    idxv += numV(&forbase[FORL_STEP]);
+  /* Step sign is read from the hi word of the number (its sign bit). */
+  if ((int32_t)forbase[FORL_STEP].u32.hi >= 0) {
+    if (idxv <= stopv) { *op = IR_LE; return LOOPEV_ENTER; }
+    *op = IR_GT; return LOOPEV_LEAVE;
+  } else {
+    if (stopv <= idxv) { *op = IR_GE; return LOOPEV_ENTER; }
+    *op = IR_LT; return LOOPEV_LEAVE;
+  }
+}
+
+/* Record FORL/JFORL or FORI/JFORI. */
+static LoopEvent rec_for(jit_State *J, const BCIns *fori, int isforl)
+{
+  BCReg ra = bc_a(*fori);
+  IROp op;
+  LoopEvent ev = for_iter(J, &op, ra, isforl);
+  TRef *tr = &J->base[ra];
+  TRef idx, stop;
+  IRType t;
+  if (isforl) {  /* Handle FORL/JFORL opcodes. */
+    TRef step;
+    idx = tr[FORL_IDX];
+    if (!idx) idx = sloadt(J, (int32_t)(ra+FORL_IDX), IRT_NUM, 0);
+    t = tref_type(idx);
+    stop = fori_arg(J, fori-2, ra+FORL_STOP, t);
+    step = fori_arg(J, fori-1, ra+FORL_STEP, t);
+    tr[FORL_IDX] = idx = emitir(IRT(IR_ADD, t), idx, step);
+  } else {  /* Handle FORI/JFORI opcodes. */
+    BCReg i;
+    t = IRT_NUM;
+    for (i = FORL_IDX; i <= FORL_STEP; i++) {
+      lua_assert(J->base[ra+i] != 0);  /* Assumes the slots are already set. */
+      tr[i] = lj_ir_tonum(J, J->base[ra+i]);
+    }
+    idx = tr[FORL_IDX];
+    stop = tr[FORL_STOP];
+    if (!tref_isk(tr[FORL_STEP]))  /* Non-const step: need direction guard. */
+      emitir(IRTG(((op-IR_LT)>>1)+IR_LT, IRT_NUM),
+	     tr[FORL_STEP], lj_ir_knum_zero(J));
+  }
+
+  /* First set up maxslot/pc for the exit path and snapshot it, then emit
+  ** the loop-condition guard, then set maxslot/pc for the continuing path.
+  */
+  tr[FORL_EXT] = idx;
+  if (ev == LOOPEV_LEAVE) {
+    J->maxslot = ra+FORL_EXT+1;
+    J->pc = fori+1;
+  } else {
+    J->maxslot = ra;
+    J->pc = fori+bc_j(*fori)+1;
+  }
+  lj_snap_add(J);
+
+  emitir(IRTG(op, t), idx, stop);
+
+  if (ev == LOOPEV_LEAVE) {
+    J->maxslot = ra;
+    J->pc = fori+bc_j(*fori)+1;
+  } else {
+    J->maxslot = ra+FORL_EXT+1;
+    J->pc = fori+1;
+  }
+  J->needsnap = 1;
+  return ev;
+}
+
+/* Record ITERL/JITERL. */
+static LoopEvent rec_iterl(jit_State *J, const BCIns iterins)
+{
+  BCReg ra = bc_a(iterins);
+  lua_assert(J->base[ra] != 0);
+  if (!tref_isnil(J->base[ra])) {  /* Looping back? */
+    J->base[ra-1] = J->base[ra];  /* Copy result of ITERC to control var. */
+    J->maxslot = ra-1+bc_b(J->pc[-1]);
+    J->pc += bc_j(iterins)+1;
+    return LOOPEV_ENTER;
+  } else {
+    J->maxslot = ra-3;
+    J->pc++;
+    return LOOPEV_LEAVE;
+  }
+}
+
+/* Record LOOP/JLOOP. Now, that was easy. */
+static LoopEvent rec_loop(jit_State *J, BCReg ra)
+{
+  J->maxslot = ra;
+  J->pc++;
+  return LOOPEV_ENTER;
+}
+
+/* Check if a loop repeatedly failed to trace because it didn't loop back.
+** Scans the trace-abort penalty table for this pc.
+*/
+static int innerloopleft(jit_State *J, const BCIns *pc)
+{
+  ptrdiff_t i;
+  for (i = 0; i < PENALTY_SLOTS; i++)
+    if (J->penalty[i].pc == pc) {
+      if (J->penalty[i].reason == LJ_TRERR_LLEAVE &&
+	  J->penalty[i].val >= 2*HOTCOUNT_MIN_PENALTY)
+	return 1;
+      break;
+    }
+  return 0;
+}
+
+/* Handle the case when an interpreted loop op is hit. */
+static void rec_loop_interp(jit_State *J, const BCIns *pc, LoopEvent ev)
+{
+  if (J->parent == 0) {
+    if (pc == J->startpc && J->framedepth == 0) {  /* Same loop? */
+      if (ev == LOOPEV_LEAVE)  /* Must loop back to form a root trace. */
+	lj_trace_err(J, LJ_TRERR_LLEAVE);
+      rec_stop(J, J->curtrace);  /* Root trace forms a loop. */
+    } else if (ev != LOOPEV_LEAVE) {  /* Entering inner loop? */
+      /* It's usually better to abort here and wait until the inner loop
+      ** is traced. But if the inner loop repeatedly didn't loop back,
+      ** this indicates a low trip count. In this case try unrolling
+      ** an inner loop even in a root trace. But it's better to be a bit
+      ** more conservative here and only do it for very short loops.
+      */
+      if (!innerloopleft(J, pc))
+	lj_trace_err(J, LJ_TRERR_LINNER);  /* Root trace hit an inner loop. */
+      if ((J->loopref && J->cur.nins - J->loopref > 8) || --J->loopunroll < 0)
+	lj_trace_err(J, LJ_TRERR_LUNROLL);  /* Limit loop unrolling. */
+      J->loopref = J->cur.nins;
+    }
+  } else if (ev != LOOPEV_LEAVE) {  /* Side trace enters an inner loop. */
+    J->loopref = J->cur.nins;
+    if (--J->loopunroll < 0)
+      lj_trace_err(J, LJ_TRERR_LUNROLL);  /* Limit loop unrolling. */
+  }  /* Side trace continues across a loop that's left or not entered. */
+}
+
+/* Handle the case when an already compiled loop op is hit. */
+static void rec_loop_jit(jit_State *J, TraceNo lnk, LoopEvent ev)
+{
+  if (J->parent == 0) {  /* Root trace hit an inner loop. */
+    /* Better let the inner loop spawn a side trace back here. */
+    lj_trace_err(J, LJ_TRERR_LINNER);
+  } else if (ev != LOOPEV_LEAVE) {  /* Side trace enters a compiled loop. */
+    J->instunroll = 0;  /* Cannot continue across a compiled loop op. */
+    if (J->pc == J->startpc && J->framedepth == 0)
+      lnk = J->curtrace;  /* Can form an extra loop. */
+    rec_stop(J, lnk);  /* Link to the loop. */
+  }  /* Side trace continues across a loop that's left or not entered. */
+}
+
+/* -- Metamethod handling ------------------------------------------------- */
+
+/* Prepare to record call to metamethod. */
+static BCReg rec_mm_prep(jit_State *J, ASMFunction cont)
+{
+  BCReg s, top = curr_proto(J->L)->framesize;
+  TRef trcont;
+  setcont(&J->L->base[top], cont);
+#if LJ_64
+  trcont = lj_ir_kptr(J, (void *)((int64_t)cont - (int64_t)lj_vm_asm_begin));
+#else
+  trcont = lj_ir_kptr(J, (void *)cont);
+#endif
+  J->base[top] = emitir(IRTG(IR_FRAME, IRT_PTR), trcont, trcont);
+  for (s = J->maxslot; s < top; s++)
+    J->base[s] = 0;
+  return top+1;
+}
+
+/* Record metamethod lookup. */
+static int rec_mm_lookup(jit_State *J, RecordIndex *ix, MMS mm)
+{
+  RecordIndex mix;
+  GCtab *mt;
+  if (tref_istab(ix->tab)) {
+    mt = tabref(tabV(&ix->tabv)->metatable);
+    mix.tab = emitir(IRT(IR_FLOAD, IRT_TAB), ix->tab, IRFL_TAB_META);
+  } else if (tref_isudata(ix->tab)) {
+    mt = tabref(udataV(&ix->tabv)->metatable);
+    mix.tab = emitir(IRT(IR_FLOAD, IRT_TAB), ix->tab, IRFL_UDATA_META);
+  } else {
+    /* Specialize to base metatable. Must flush mcode in lua_setmetatable(). */
+    mt = tabref(J2G(J)->basemt[itypemap(&ix->tabv)]);
+    if (mt == NULL)
+      return 0;  /* No metamethod. */
+    mix.tab = lj_ir_ktab(J, mt);
+    goto nocheck;
+  }
+  ix->mt = mix.tab;
+  emitir(IRTG(mt ? IR_NE : IR_EQ, IRT_TAB), mix.tab, lj_ir_knull(J, IRT_TAB));
+nocheck:
+  if (mt) {
+    GCstr *mmstr = strref(J2G(J)->mmname[mm]);
+    cTValue *mo = lj_tab_getstr(mt, mmstr);
+    if (mo && !tvisnil(mo))
+      copyTV(J->L, &ix->mobjv, mo);
+    ix->mtv = mt;
+    settabV(J->L, &mix.tabv, mt);
+    setstrV(J->L, &mix.keyv, mmstr);
+    mix.key = lj_ir_kstr(J, mmstr);
+    mix.val = 0;
+    mix.idxchain = 0;
+    ix->mobj = rec_idx(J, &mix);
+    return !tref_isnil(ix->mobj);  /* 1 if metamethod found, 0 if not. */
+  }
+  return 0;  /* No metamethod. */
+}
+
+/* Record call to arithmetic metamethod (and MM_len). */
+/* Tries the metamethod on the 1st operand, then (except for __len) on the
+** 2nd operand. Returns the result ref of a resolved call or 0 for a pending
+** call. Aborts the trace with LJ_TRERR_NOMM if no metamethod is found.
+*/
+static TRef rec_mm_arith(jit_State *J, RecordIndex *ix, MMS mm)
+{
+  /* Set up metamethod call first to save ix->tab and ix->tabv. */
+  BCReg func = rec_mm_prep(J, lj_cont_ra);
+  TRef *base = J->base + func;
+  TValue *basev = J->L->base + func;
+  base[1] = ix->tab; base[2] = ix->key;
+  copyTV(J->L, basev+1, &ix->tabv);
+  copyTV(J->L, basev+2, &ix->keyv);
+  if (!rec_mm_lookup(J, ix, mm)) {  /* Lookup metamethod on 1st operand. */
+    if (mm != MM_len) {
+      ix->tab = ix->key;
+      copyTV(J->L, &ix->tabv, &ix->keyv);
+      if (rec_mm_lookup(J, ix, mm))  /* Lookup metamethod on 2nd operand. */
+	goto ok;
+    }
+    lj_trace_err(J, LJ_TRERR_NOMM);
+  }
+ok:
+  base[0] = ix->mobj;
+  copyTV(J->L, basev+0, &ix->mobjv);
+  return rec_call(J, func, CALLRES_CONT, 2) ? J->base[func] : 0;
+}
+
+/* Call a comparison metamethod. */
+/* The low bit of op selects the false/true-condition continuation. */
+static void rec_mm_callcomp(jit_State *J, RecordIndex *ix, int op)
+{
+  BCReg func = rec_mm_prep(J, (op&1) ? lj_cont_condf : lj_cont_condt);
+  TRef *base = J->base + func;
+  TValue *tv = J->L->base + func;
+  base[0] = ix->mobj; base[1] = ix->val; base[2] = ix->key;
+  copyTV(J->L, tv+0, &ix->mobjv);
+  copyTV(J->L, tv+1, &ix->valv);
+  copyTV(J->L, tv+2, &ix->keyv);
+  rec_call(J, func, CALLRES_CONT, 2);
+  /* It doesn't matter whether this is immediately resolved or not.
+  ** Type specialization of the return type suffices to specialize
+  ** the control flow.
+  */
+}
+
+/* Record call to equality comparison metamethod (for tab and udata only). */
+/* Per Lua semantics __eq is only called when both operands share the same
+** metamethod; otherwise nothing is recorded (the comparison result stands).
+*/
+static void rec_mm_equal(jit_State *J, RecordIndex *ix, int op)
+{
+  ix->tab = ix->val;
+  copyTV(J->L, &ix->tabv, &ix->valv);
+  if (rec_mm_lookup(J, ix, MM_eq)) {  /* Lookup metamethod on 1st operand. */
+    cTValue *bv;
+    TRef mo1 = ix->mobj;
+    TValue mo1v;
+    copyTV(J->L, &mo1v, &ix->mobjv);
+    /* Avoid the 2nd lookup and the objcmp if the metatables are equal. */
+    bv = &ix->keyv;
+    if (tvistab(bv) && tabref(tabV(bv)->metatable) == ix->mtv) {
+      /* Guard: 2nd operand's metatable equals the 1st operand's. */
+      TRef mt2 = emitir(IRT(IR_FLOAD, IRT_TAB), ix->key, IRFL_TAB_META);
+      emitir(IRTG(IR_EQ, IRT_TAB), mt2, ix->mt);
+    } else if (tvisudata(bv) && tabref(udataV(bv)->metatable) == ix->mtv) {
+      TRef mt2 = emitir(IRT(IR_FLOAD, IRT_TAB), ix->key, IRFL_UDATA_META);
+      emitir(IRTG(IR_EQ, IRT_TAB), mt2, ix->mt);
+    } else {  /* Lookup metamethod on 2nd operand and compare both. */
+      ix->tab = ix->key;
+      copyTV(J->L, &ix->tabv, bv);
+      if (!rec_mm_lookup(J, ix, MM_eq) ||
+	  rec_objcmp(J, mo1, ix->mobj, &mo1v, &ix->mobjv))
+	return;
+    }
+    rec_mm_callcomp(J, ix, op);
+  }
+}
+
+/* Record call to ordered comparison metamethods (for arbitrary objects). */
+/* Bit 1 of op selects __le (with __lt fallback on swapped operands) vs.
+** plain __lt; both operands must resolve to the same metamethod.
+*/
+static void rec_mm_comp(jit_State *J, RecordIndex *ix, int op)
+{
+  ix->tab = ix->val;
+  copyTV(J->L, &ix->tabv, &ix->valv);
+  while (1) {
+    MMS mm = (op & 2) ? MM_le : MM_lt;  /* Try __le + __lt or only __lt. */
+    if (rec_mm_lookup(J, ix, mm)) {  /* Lookup metamethod on 1st operand. */
+      cTValue *bv;
+      TRef mo1 = ix->mobj;
+      TValue mo1v;
+      copyTV(J->L, &mo1v, &ix->mobjv);
+      /* Avoid the 2nd lookup and the objcmp if the metatables are equal. */
+      bv = &ix->keyv;
+      if (tvistab(bv) && tabref(tabV(bv)->metatable) == ix->mtv) {
+	TRef mt2 = emitir(IRT(IR_FLOAD, IRT_TAB), ix->key, IRFL_TAB_META);
+	emitir(IRTG(IR_EQ, IRT_TAB), mt2, ix->mt);
+      } else if (tvisudata(bv) && tabref(udataV(bv)->metatable) == ix->mtv) {
+	TRef mt2 = emitir(IRT(IR_FLOAD, IRT_TAB), ix->key, IRFL_UDATA_META);
+	emitir(IRTG(IR_EQ, IRT_TAB), mt2, ix->mt);
+      } else {  /* Lookup metamethod on 2nd operand and compare both. */
+	ix->tab = ix->key;
+	copyTV(J->L, &ix->tabv, bv);
+	if (!rec_mm_lookup(J, ix, mm) ||
+	    rec_objcmp(J, mo1, ix->mobj, &mo1v, &ix->mobjv))
+	  goto nomatch;
+      }
+      rec_mm_callcomp(J, ix, op);
+      return;
+    }
+  nomatch:
+    /* First lookup failed. Retry with  __lt and swapped operands. */
+    if (!(op & 2)) break;  /* Already at __lt. Interpreter will throw. */
+    ix->tab = ix->key; ix->key = ix->val; ix->val = ix->tab;
+    copyTV(J->L, &ix->tabv, &ix->keyv);
+    copyTV(J->L, &ix->keyv, &ix->valv);
+    copyTV(J->L, &ix->valv, &ix->tabv);
+    op ^= 3;  /* Swap condition sense and drop the __le bit. */
+  }
+}
+
+/* -- Indexed access ------------------------------------------------------ */
+
+/* Record indexed key lookup. */
+/* Emits the address computation for ix->tab[ix->key] (AREF for array part,
+** HREFK/HREF for hash part) and stores the currently-held value in ix->oldv.
+** Returns the reference to the value slot.
+*/
+static TRef rec_idx_key(jit_State *J, RecordIndex *ix)
+{
+  TRef key;
+  GCtab *t = tabV(&ix->tabv);
+  ix->oldv = lj_tab_get(J->L, t, &ix->keyv);  /* Lookup previous value. */
+
+  /* Integer keys are looked up in the array part first. */
+  key = ix->key;
+  if (tref_isnumber(key)) {
+    lua_Number n = numV(&ix->keyv);
+    int32_t k = lj_num2int(n);
+    lua_assert(tvisnum(&ix->keyv));
+    /* Potential array key? */
+    if ((MSize)k < LJ_MAX_ASIZE && n == cast_num(k)) {
+      TRef asizeref, ikey = key;
+      if (!tref_isinteger(ikey))
+	ikey = emitir(IRTGI(IR_TOINT), ikey, IRTOINT_INDEX);
+      asizeref = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_ASIZE);
+      if ((MSize)k < t->asize) {  /* Currently an array key? */
+	TRef arrayref;
+	emitir(IRTGI(IR_ABC), asizeref, ikey);  /* Bounds check. */
+	arrayref = emitir(IRT(IR_FLOAD, IRT_PTR), ix->tab, IRFL_TAB_ARRAY);
+	return emitir(IRT(IR_AREF, IRT_PTR), arrayref, ikey);
+      } else {  /* Currently not in array (may be an array extension)? */
+	emitir(IRTGI(IR_ULE), asizeref, ikey);  /* Inv. bounds check. */
+	if (k == 0 && tref_isk(key))
+	  key = lj_ir_knum_zero(J);  /* Canonicalize 0 or +-0.0 to +0.0. */
+	/* And continue with the hash lookup. */
+      }
+    } else if (!tref_isk(key)) {
+      /* We can rule out const numbers which failed the integerness test
+      ** above. But all other numbers are potential array keys.
+      */
+      if (t->asize == 0) {  /* True sparse tables have an empty array part. */
+	/* Guard that the array part stays empty. */
+	TRef tmp = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_ASIZE);
+	emitir(IRTGI(IR_EQ), tmp, lj_ir_kint(J, 0));
+      } else {
+	lj_trace_err(J, LJ_TRERR_NYITMIX);  /* NYI: mixed array/hash key. */
+      }
+    }
+  }
+
+  /* Otherwise the key is located in the hash part. */
+  if (tref_isinteger(key))  /* Hash keys are based on numbers, not ints. */
+    ix->key = key = emitir(IRTN(IR_TONUM), key, 0);
+  if (tref_isk(key)) {
+    /* Optimize lookup of constant hash keys. */
+    MSize hslot = (MSize)((char *)ix->oldv - (char *)&noderef(t->node)[0].val);
+    if (t->hmask > 0 && hslot <= t->hmask*(MSize)sizeof(Node) &&
+	hslot <= 65535*(MSize)sizeof(Node)) {
+      TRef node, kslot;
+      /* Guard on the hash mask so the constant slot index stays valid. */
+      TRef hm = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_HMASK);
+      emitir(IRTGI(IR_EQ), hm, lj_ir_kint(J, (int32_t)t->hmask));
+      node = emitir(IRT(IR_FLOAD, IRT_PTR), ix->tab, IRFL_TAB_NODE);
+      kslot = lj_ir_kslot(J, key, hslot / sizeof(Node));
+      return emitir(IRTG(IR_HREFK, IRT_PTR), node, kslot);
+    }
+  }
+  /* Fall back to a regular hash lookup. */
+  return emitir(IRT(IR_HREF, IRT_PTR), ix->tab, key);
+}
+
+/* Determine whether a key is NOT one of the fast metamethod names. */
+/* Returns 1 only if the key provably cannot be a fast metamethod name. */
+static int nommstr(jit_State *J, TRef key)
+{
+  if (tref_isstr(key)) {
+    if (tref_isk(key)) {
+      GCstr *str = ir_kstr(IR(tref_ref(key)));
+      uint32_t i;
+      for (i = 0; i <= MM_FAST; i++)
+	if (strref(J2G(J)->mmname[i]) == str)
+	  return 0;  /* MUST be one of the fast metamethod names. */
+    } else {
+      return 0;  /* Variable string key MAY be a metamethod name. */
+    }
+  }
+  return 1;  /* CANNOT be a metamethod name. */
+}
+
+/* Record indexed load/store. */
+/* Records tab[key] (load when ix->val == 0) or tab[key] = val (store),
+** following __index/__newindex chains up to ix->idxchain levels.
+** Returns the loaded value ref (or TREF_NIL), 0 for stores, or 0 for a
+** pending metamethod call.
+*/
+static TRef rec_idx(jit_State *J, RecordIndex *ix)
+{
+  TRef xref;
+  IROp xrefop, loadop;
+  cTValue *oldv;
+
+  while (!tref_istab(ix->tab)) { /* Handle non-table lookup. */
+    lua_assert(ix->idxchain != 0); /* Never call raw rec_idx() on non-table. */
+    if (!rec_mm_lookup(J, ix, ix->val ? MM_newindex : MM_index))
+      lj_trace_err(J, LJ_TRERR_NOMM);
+  handlemm:
+    if (tref_isfunc(ix->mobj)) {  /* Handle metamethod call. */
+      BCReg func = rec_mm_prep(J, ix->val ? lj_cont_nop : lj_cont_ra);
+      TRef *base = J->base + func;
+      TValue *tv = J->L->base + func;
+      base[0] = ix->mobj; base[1] = ix->tab; base[2] = ix->key;
+      setfuncV(J->L, tv+0, funcV(&ix->mobjv));
+      copyTV(J->L, tv+1, &ix->tabv);
+      copyTV(J->L, tv+2, &ix->keyv);
+      if (ix->val) {
+	base[3] = ix->val;
+	copyTV(J->L, tv+3, &ix->valv);
+	rec_call(J, func, CALLRES_CONT, 3);  /* mobj(tab, key, val) */
+	return 0;
+      } else {
+	/* res = mobj(tab, key) */
+	return rec_call(J, func, CALLRES_CONT, 2) ? J->base[func] : 0;
+      }
+    }
+    /* Otherwise retry lookup with metaobject. */
+    ix->tab = ix->mobj;
+    copyTV(J->L, &ix->tabv, &ix->mobjv);
+    if (--ix->idxchain == 0)
+      lj_trace_err(J, LJ_TRERR_IDXLOOP);
+  }
+
+  /* First catch nil and NaN keys for tables. */
+  if (tvisnil(&ix->keyv) || (tvisnum(&ix->keyv) && tvisnan(&ix->keyv))) {
+    if (ix->val)  /* Better fail early. */
+      lj_trace_err(J, LJ_TRERR_STORENN);
+    if (tref_isk(ix->key)) {
+      if (ix->idxchain && rec_mm_lookup(J, ix, MM_index))
+	goto handlemm;
+      return TREF_NIL;
+    }
+  }
+
+  /* Record the key lookup. */
+  xref = rec_idx_key(J, ix);
+  xrefop = IR(tref_ref(xref))->o;
+  loadop = xrefop == IR_AREF ? IR_ALOAD : IR_HLOAD;
+  oldv = ix->oldv;
+
+  if (ix->val == 0) {  /* Indexed load */
+    IRType t = itype2irt(oldv);
+    TRef res = emitir(IRTG(loadop, t), xref, 0);
+    /* A nil result may be overridden by an __index metamethod. */
+    if (t == IRT_NIL && ix->idxchain && rec_mm_lookup(J, ix, MM_index))
+      goto handlemm;
+    if (irtype_ispri(t)) res = TREF_PRI(t);  /* Canonicalize primitives. */
+    return res;
+  } else {  /* Indexed store. */
+    GCtab *mt = tabref(tabV(&ix->tabv)->metatable);
+    if (tvisnil(oldv)) {  /* Previous value was nil? */
+      /* Need to duplicate the hasmm check for the early guards. */
+      int hasmm = 0;
+      if (ix->idxchain && mt) {
+	cTValue *mo = lj_tab_getstr(mt, strref(J2G(J)->mmname[MM_newindex]));
+	hasmm = mo && !tvisnil(mo);
+      }
+      if (hasmm || oldv == niltvg(J2G(J)))
+	emitir(IRTG(loadop, IRT_NIL), xref, 0);  /* Guard for nil value. */
+      else if (xrefop == IR_HREF)
+	emitir(IRTG(IR_NE, IRT_PTR), xref, lj_ir_kptr(J, niltvg(J2G(J))));
+      if (ix->idxchain && rec_mm_lookup(J, ix, MM_newindex)) { /* Metamethod? */
+	lua_assert(hasmm);
+	goto handlemm;
+      }
+      lua_assert(!hasmm);
+      if (oldv == niltvg(J2G(J))) {  /* Need to insert a new key. */
+	TRef key = ix->key;
+	if (tref_isinteger(key))  /* NEWREF needs a TValue as a key. */
+	  key = emitir(IRTN(IR_TONUM), key, 0);
+	xref = emitir(IRT(IR_NEWREF, IRT_PTR), ix->tab, key);
+      }
+    } else if (!lj_opt_fwd_wasnonnil(J, loadop, tref_ref(xref))) {
+      /* Cannot derive that the previous value was non-nil, must do checks. */
+      if (xrefop == IR_HREF)  /* Guard against store to niltv. */
+	emitir(IRTG(IR_NE, IRT_PTR), xref, lj_ir_kptr(J, niltvg(J2G(J))));
+      if (ix->idxchain) {  /* Metamethod lookup required? */
+	/* A check for NULL metatable is cheaper (hoistable) than a load. */
+	if (!mt) {
+	  TRef mtref = emitir(IRT(IR_FLOAD, IRT_TAB), ix->tab, IRFL_TAB_META);
+	  emitir(IRTG(IR_EQ, IRT_TAB), mtref, lj_ir_knull(J, IRT_TAB));
+	} else {
+	  IRType t = itype2irt(oldv);
+	  emitir(IRTG(loadop, t), xref, 0);  /* Guard for non-nil value. */
+	}
+      }
+    }
+    if (tref_isinteger(ix->val))  /* Convert int to number before storing. */
+      ix->val = emitir(IRTN(IR_TONUM), ix->val, 0);
+    emitir(IRT(loadop+IRDELTA_L2S, tref_type(ix->val)), xref, ix->val);
+    if (tref_isgcv(ix->val))
+      emitir(IRT(IR_TBAR, IRT_NIL), ix->tab, 0);  /* GC write barrier. */
+    /* Invalidate neg. metamethod cache for stores with certain string keys. */
+    if (!nommstr(J, ix->key)) {
+      TRef fref = emitir(IRT(IR_FREF, IRT_PTR), ix->tab, IRFL_TAB_NOMM);
+      emitir(IRT(IR_FSTORE, IRT_U8), fref, lj_ir_kint(J, 0));
+    }
+    J->needsnap = 1;
+    return 0;
+  }
+}
+
+/* -- Upvalue access ------------------------------------------------------ */
+
+/* Record upvalue load/store. */
+/* val == 0 records a load and returns the value ref; otherwise records a
+** store of val and returns 0. Open upvalues aliasing a slot of the current
+** frame are accessed directly via the SSA slots.
+*/
+static TRef rec_upvalue(jit_State *J, uint32_t uv, TRef val)
+{
+  GCupval *uvp = &gcref(J->fn->l.uvptr[uv])->uv;
+  TRef fn = getcurrf(J);
+  IRRef uref;
+  int needbarrier = 0;
+  if (!uvp->closed) {
+    /* In current stack? */
+    if (uvp->v >= J->L->stack && uvp->v < J->L->maxstack) {
+      int32_t slot = (int32_t)(uvp->v - (J->L->base - J->baseslot));
+      if (slot >= 0) {  /* Aliases an SSA slot? */
+	slot -= (int32_t)J->baseslot;  /* Note: slot number may be negative! */
+	/* NYI: add IR to guard that it's still aliasing the same slot. */
+	if (val == 0) {
+	  return getslot(J, slot);
+	} else {
+	  J->base[slot] = val;
+	  if (slot >= (int32_t)J->maxslot) J->maxslot = (BCReg)(slot+1);
+	  return 0;
+	}
+      }
+    }
+    uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_PTR), fn, uv));
+  } else {
+    /* Closed upvalue: stores need an object write barrier. */
+    needbarrier = 1;
+    uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_PTR), fn, uv));
+  }
+  if (val == 0) {  /* Upvalue load */
+    IRType t = itype2irt(uvp->v);
+    TRef res = emitir(IRTG(IR_ULOAD, t), uref, 0);
+    if (irtype_ispri(t)) res = TREF_PRI(t);  /* Canonicalize primitive refs. */
+    return res;
+  } else {  /* Upvalue store. */
+    if (tref_isinteger(val))  /* Convert int to number before storing. */
+      val = emitir(IRTN(IR_TONUM), val, 0);
+    emitir(IRT(IR_USTORE, tref_type(val)), uref, val);
+    if (needbarrier && tref_isgcv(val))
+      emitir(IRT(IR_OBAR, IRT_NIL), uref, val);
+    J->needsnap = 1;
+    return 0;
+  }
+}
+
+/* -- Record calls to fast functions -------------------------------------- */
+
+/* Note: The function and the arguments for the bytecode CALL instructions
+** always occupy _new_ stack slots (above the highest active variable).
+** This means they must have been stored there by previous instructions
+** (MOV, K*, ADD etc.) which must be part of the same trace. This in turn
+** means their reference slots are already valid and their types have
+** already been specialized (i.e. getslot() would be redundant).
+** The 1st slot beyond the arguments is set to 0 before calling recff_*.
+*/
+
+/* Data used by handlers to record a fast function. */
+typedef struct RecordFFData {
+  TValue *argv;		/* Runtime argument values. */
+  GCfunc *fn;		/* The currently recorded function. */
+  int nargs;		/* Number of passed arguments. */
+  int nres;		/* Number of returned results (defaults to 1). */
+  int cres;		/* Wanted number of call results. */
+  uint32_t data;	/* Per-ffid auxiliary data (opcode, literal etc.). */
+} RecordFFData;
+
+/* Type of handler to record a fast function. */
+typedef void (*RecordFunc)(jit_State *J, TRef *res, RecordFFData *rd);
+
+/* Avoid carrying two pointers around: arguments start at res+1 (arg[i] == res[1+i]). */
+#define arg	(res+1)
+
+/* Get the runtime value of an int argument (with string->number coercion). */
+static int32_t argv2int(jit_State *J, TValue *o)
+{
+  if (tvisstr(o)) {
+    /* Coerce the string in place; abort the trace on a non-numeric string. */
+    if (!lj_str_numconv(strVdata(o), o))
+      lj_trace_err(J, LJ_TRERR_BADTYPE);
+  }
+  return lj_num2bit(numV(o));
+}
+
+/* Get the runtime value of a string argument (coercing numbers in place). */
+static GCstr *argv2str(jit_State *J, TValue *o)
+{
+  GCstr *s;
+  if (LJ_LIKELY(tvisstr(o)))
+    return strV(o);
+  /* Number argument: convert to a string and replace the stack slot. */
+  lua_assert(tvisnum(o));
+  s = lj_str_fromnum(J->L, &o->n);
+  setstrV(J->L, o, s);
+  return s;
+}
+
+/* Fallback handler for all fast functions that are not recorded (yet). */
+/* Stores the function in J->errinfo and aborts the trace (does not return). */
+static void recff_nyi(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  UNUSED(res);
+  setfuncV(J->L, &J->errinfo, rd->fn);
+  lj_trace_err_info(J, LJ_TRERR_NYIFF);
+}
+
+/* Abort trace recording for an unsupported fast-function usage (noreturn). */
+LJ_NORET static void recff_err_ffu(jit_State *J, RecordFFData *rd)
+{
+  setfuncV(J->L, &J->errinfo, rd->fn);
+  lj_trace_err_info(J, LJ_TRERR_NYIFFU);
+}
+
+/* C functions can have arbitrary side-effects and are not recorded (yet). */
+/* Stores the C function pointer in J->errinfo and aborts the trace. */
+static void recff_c(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  UNUSED(res);
+  setlightudV(&J->errinfo, (void *)rd->fn->c.f);
+  lj_trace_err_info(J, LJ_TRERR_NYICF);
+}
+
+/* -- Base library fast functions ----------------------------------------- */
+
+/* Record assert(): pass all arguments through unchanged. */
+static void recff_assert(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  /* Arguments already specialized. The interpreter throws for nil/false. */
+  BCReg i = 0;
+  while (arg[i]) {
+    res[i] = arg[i];
+    i++;
+  }
+  rd->nres = (int)i;
+  UNUSED(J);
+}
+
+/* Record type(). The upvalues appear to hold one string per IRType --
+** TODO(review): confirm upvalue layout against the library definition.
+*/
+static void recff_type(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  /* Arguments already specialized. Result is a constant string. Neat, huh? */
+  IRType t = tref_isinteger(arg[0]) ? IRT_NUM : tref_type(arg[0]);
+  res[0] = lj_ir_kstr(J, strV(&rd->fn->c.upvalue[t]));
+}
+
+/* Record getmetatable() for tables: returns __metatable if set, else the
+** guarded metatable reference. Other argument types fall through and the
+** interpreter throws.
+*/
+static void recff_getmetatable(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = arg[0];
+  if (tref_istab(tr)) {
+    RecordIndex ix;
+    ix.tab = tr;
+    copyTV(J->L, &ix.tabv, &rd->argv[0]);
+    if (rec_mm_lookup(J, &ix, MM_metatable))
+      res[0] = ix.mobj;
+    else
+      res[0] = ix.mt;
+  }  /* else: Interpreter will throw. */
+}
+
+/* Record setmetatable(tab, mt) where mt is a table or nil. */
+static void recff_setmetatable(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = arg[0];
+  TRef mt = arg[1];
+  if (tref_istab(tr) && (tref_istab(mt) || (mt && tref_isnil(mt)))) {
+    TRef fref, mtref;
+    RecordIndex ix;
+    ix.tab = tr;
+    copyTV(J->L, &ix.tabv, &rd->argv[0]);
+    rec_mm_lookup(J, &ix, MM_metatable); /* Guard for no __metatable field. */
+    fref = emitir(IRT(IR_FREF, IRT_PTR), tr, IRFL_TAB_META);
+    mtref = tref_isnil(mt) ? lj_ir_knull(J, IRT_TAB) : mt;
+    emitir(IRT(IR_FSTORE, IRT_TAB), fref, mtref);
+    if (!tref_isnil(mt))  /* Write barrier only needed for a real table. */
+      emitir(IRT(IR_TBAR, IRT_TAB), tr, 0);
+    res[0] = tr;  /* setmetatable() returns the table. */
+    J->needsnap = 1;
+  }  /* else: Interpreter will throw. */
+}
+
+/* Record rawget(tab, key): indexed load with the metamethod chain disabled. */
+static void recff_rawget(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (tref_istab(arg[0]) && arg[1]) {
+    RecordIndex ix;
+    ix.tab = arg[0]; ix.key = arg[1]; ix.val = 0; ix.idxchain = 0;
+    settabV(J->L, &ix.tabv, tabV(&rd->argv[0]));
+    copyTV(J->L, &ix.keyv, &rd->argv[1]);
+    res[0] = rec_idx(J, &ix);
+  }  /* else: Interpreter will throw. */
+}
+
+/* Record rawset(tab, key, val): indexed store, metamethod chain disabled. */
+static void recff_rawset(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (tref_istab(arg[0]) && arg[1] && arg[2]) {
+    RecordIndex ix;
+    ix.tab = arg[0]; ix.key = arg[1]; ix.val = arg[2]; ix.idxchain = 0;
+    settabV(J->L, &ix.tabv, tabV(&rd->argv[0]));
+    copyTV(J->L, &ix.keyv, &rd->argv[1]);
+    copyTV(J->L, &ix.valv, &rd->argv[2]);
+    rec_idx(J, &ix);
+    res[0] = arg[0];  /* Returns table. */
+  }  /* else: Interpreter will throw. */
+}
+
+/* Record rawequal(a, b): raw object comparison, no __eq metamethod. */
+static void recff_rawequal(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (arg[0] && arg[1]) {
+    int diff = rec_objcmp(J, arg[0], arg[1], &rd->argv[0], &rd->argv[1]);
+    res[0] = diff ? TREF_FALSE : TREF_TRUE;
+  }  /* else: Interpreter will throw. */
+}
+
+/* Record tonumber(x [,base]). Only base 10 is supported; other bases fall
+** back to the interpreter. Non-coercible arguments yield nil.
+*/
+static void recff_tonumber(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = arg[0];
+  if (tref_isnumber_str(tr)) {
+    if (arg[1]) {
+      /* Specialize to a constant base of 10; anything else is unsupported. */
+      TRef base = lj_ir_toint(J, arg[1]);
+      if (!tref_isk(base) || IR(tref_ref(base))->i != 10)
+	recff_err_ffu(J, rd);
+    }
+    if (tref_isstr(tr))  /* Guarded string->number conversion. */
+      tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);
+  } else {
+    tr = TREF_NIL;  /* tonumber() returns nil for non-coercible values. */
+  }
+  res[0] = tr;
+}
+
+/* Record tostring(): string passthrough, __tostring metamethod call, or
+** number-to-string conversion; anything else falls back to the interpreter.
+*/
+static void recff_tostring(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = arg[0];
+  if (tref_isstr(tr)) {
+    /* Ignore __tostring in the string base metatable. */
+    res[0] = tr;
+  } else {
+    RecordIndex ix;
+    ix.tab = tr;
+    copyTV(J->L, &ix.tabv, &rd->argv[0]);
+    if (rec_mm_lookup(J, &ix, MM_tostring)) {  /* Has __tostring metamethod? */
+      res[0] = ix.mobj;
+      copyTV(J->L, rd->argv - 1, &ix.mobjv);
+      if (!rec_call(J, (BCReg)(res - J->base), 1, 1))  /* Pending call? */
+	rd->cres = CALLRES_PENDING;
+      /* Otherwise res[0] already contains the result. */
+    } else if (tref_isnumber(tr)) {
+      res[0] = emitir(IRT(IR_TOSTR, IRT_STR), tr, 0);
+    } else {
+      recff_err_ffu(J, rd);
+    }
+  }
+}
+
+/* Record the ipairs() iterator: returns key+1 and tab[key+1], or nothing
+** when the next slot is nil (end of iteration).
+*/
+static void recff_ipairs_aux(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  RecordIndex ix;
+  ix.tab = arg[0];
+  if (tref_istab(ix.tab)) {
+    if (!tvisnum(&rd->argv[1]))  /* No support for string coercion. */
+      lj_trace_err(J, LJ_TRERR_BADTYPE);
+    setnumV(&ix.keyv, numV(&rd->argv[1])+(lua_Number)1);
+    settabV(J->L, &ix.tabv, tabV(&rd->argv[0]));
+    ix.val = 0; ix.idxchain = 0;  /* Raw load of the next element. */
+    ix.key = lj_ir_toint(J, arg[1]);
+    res[0] = ix.key = emitir(IRTI(IR_ADD), ix.key, lj_ir_kint(J, 1));
+    res[1] = rec_idx(J, &ix);
+    rd->nres = tref_isnil(res[1]) ? 0 : 2;
+  }  /* else: Interpreter will throw. */
+}
+
+/* Record ipairs(t): yields the iterator closure (upvalue 0), the table
+** and the initial index 0.
+*/
+static void recff_ipairs(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = arg[0];
+  if (!tref_istab(tr))
+    return;  /* Interpreter will throw. */
+  res[0] = lj_ir_kfunc(J, funcV(&rd->fn->c.upvalue[0]));
+  res[1] = tr;
+  res[2] = lj_ir_kint(J, 0);
+  rd->nres = 3;
+}
+
+/* Record pcall(f, ...): a resolved call gets true prepended to its results;
+** an unresolved call is propagated as pending.
+*/
+static void recff_pcall(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (rd->nargs >= 1) {
+    BCReg parg = (BCReg)(arg - J->base);
+    if (rec_call(J, parg, CALLRES_MULTI, rd->nargs - 1)) {  /* Resolved call. */
+      res[0] = TREF_TRUE;  /* Prepend true result. No need to move results. */
+      rd->nres = (int)((J->maxslot - parg) + 1);
+    } else {  /* Propagate pending call. */
+      rd->cres = CALLRES_PENDING;
+    }
+  }  /* else: Interpreter will throw. */
+}
+
+/* Struct to pass context across lj_vm_cpcall. */
+typedef struct RecordXpcall {
+  BCReg parg;		/* Base slot of the protected call. */
+  int nargs;		/* Number of arguments to the called function. */
+  int resolved;		/* Out: 1 if the recorded call was resolved. */
+} RecordXpcall;
+
+/* Protected-call wrapper: records the call and reports resolution status. */
+static TValue *recff_xpcall_cp(lua_State *L, lua_CFunction dummy, void *ud)
+{
+  jit_State *J = L2J(L);
+  RecordXpcall *rx = (RecordXpcall *)ud;
+  UNUSED(dummy);
+  rx->resolved = rec_call(J, rx->parg, CALLRES_MULTI, rx->nargs);
+  return NULL;
+}
+
+/* Record xpcall(f, err, ...). The function and handler are swapped on the
+** stack for the duration of the recorded call and always restored, even if
+** the recorder throws (via lj_vm_cpcall protection).
+*/
+static void recff_xpcall(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (rd->nargs >= 2) {
+    RecordXpcall rx;
+    BCReg parg = (BCReg)(arg - J->base) + 1;
+    TRef tmp;
+    TValue argv0, argv1;
+    ptrdiff_t oargv;
+    int errcode;
+    /* Swap function and traceback. */
+    tmp = arg[0]; arg[0] = arg[1]; arg[1] = tmp;
+    copyTV(J->L, &argv0, &rd->argv[0]);
+    copyTV(J->L, &argv1, &rd->argv[1]);
+    copyTV(J->L, &rd->argv[0], &argv1);
+    copyTV(J->L, &rd->argv[1], &argv0);
+    oargv = savestack(J->L, rd->argv);
+    /* Need to protect rec_call because the recorder may throw. */
+    rx.parg = parg;
+    rx.nargs = rd->nargs - 2;
+    errcode = lj_vm_cpcall(J->L, recff_xpcall_cp, NULL, &rx);
+    /* Always undo Lua stack swap to avoid confusing the interpreter. */
+    rd->argv = restorestack(J->L, oargv);  /* Stack may have been resized. */
+    copyTV(J->L, &rd->argv[0], &argv0);
+    copyTV(J->L, &rd->argv[1], &argv1);
+    if (errcode)
+      lj_err_throw(J->L, errcode);  /* Propagate errors. */
+    if (rx.resolved) {  /* Resolved call. */
+      int i, nres = (int)(J->maxslot - parg);
+      rd->nres = nres + 1;
+      res[0] = TREF_TRUE;  /* Prepend true result. */
+      for (i = 1; i <= nres; i++)  /* Move results down. */
+	res[i] = res[i+1];
+    } else {  /* Propagate pending call. */
+      rd->cres = CALLRES_PENDING;
+    }
+  }  /* else: Interpreter will throw. */
+}
+
+/* -- Math library fast functions ----------------------------------------- */
+
+/* Record math.abs() as an IR_ABS on the numeric argument. */
+static void recff_math_abs(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tonum(J, arg[0]);
+  res[0] = emitir(IRTN(IR_ABS), tr, lj_ir_knum_abs(J));
+  UNUSED(rd);
+}
+
+/* Record rounding functions math.floor and math.ceil. */
+/* rd->data selects the IR_FPMATH rounding mode. Integer args pass through. */
+static void recff_math_round(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (tref_isinteger(arg[0]))
+    res[0] = arg[0];
+  else
+    res[0] = emitir(IRTN(IR_FPMATH), lj_ir_tonum(J, arg[0]), rd->data);
+  /* Note: result is integral (or NaN/Inf), but may not fit into an integer. */
+}
+
+/* Record unary math.* functions, mapped to IR_FPMATH opcode. */
+/* rd->data holds the IRFPM_* sub-opcode for the specific function. */
+static void recff_math_unary(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  res[0] = emitir(IRTN(IR_FPMATH), lj_ir_tonum(J, arg[0]), rd->data);
+}
+
+/* Record binary math.* functions math.atan2 and math.ldexp. */
+/* rd->data holds the binary IR opcode to emit. */
+static void recff_math_binary(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tonum(J, arg[0]);
+  res[0] = emitir(IRTN(rd->data), tr, lj_ir_tonum(J, arg[1]));
+}
+
+/* Record math.asin, math.acos, math.atan. */
+/* All three are reduced to atan2: asin(y) = atan2(y, sqrt(1-y^2)),
+** acos(y) = atan2(sqrt(1-y^2), y), atan(y) = atan2(y, 1).
+** rd->data holds the ffid to distinguish them.
+*/
+static void recff_math_atrig(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef y = lj_ir_tonum(J, arg[0]);
+  TRef x = lj_ir_knum_one(J);
+  uint32_t ffid = rd->data;
+  if (ffid != FF_math_atan) {
+    TRef tmp = emitir(IRTN(IR_MUL), y, y);
+    tmp = emitir(IRTN(IR_SUB), x, tmp);
+    tmp = emitir(IRTN(IR_FPMATH), tmp, IRFPM_SQRT);
+    if (ffid == FF_math_asin) { x = tmp; } else { x = y; y = tmp; }
+  }
+  res[0] = emitir(IRTN(IR_ATAN2), y, x);
+}
+
+/* Record math.modf(): returns the truncated integral part and the
+** fractional remainder (x - trunc(x)). Integer args split as (x, 0).
+*/
+static void recff_math_modf(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = arg[0];
+  if (tref_isinteger(arg[0])) {
+    res[0] = tr;
+    res[1] = lj_ir_kint(J, 0);
+  } else {
+    tr = lj_ir_tonum(J, tr);
+    res[0] = emitir(IRTN(IR_FPMATH), tr, IRFPM_TRUNC);
+    res[1] = emitir(IRTN(IR_SUB), tr, res[0]);
+  }
+  rd->nres = 2;
+}
+
+/* Record math.deg()/math.rad(): multiply by the conversion-factor upvalue. */
+static void recff_math_degrad(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tonum(J, arg[0]);
+  res[0] = emitir(IRTN(IR_MUL), tr, lj_ir_knum(J, numV(&rd->fn->c.upvalue[0])));
+}
+
+/* Record math.pow(x, y). The exponent must be a number (or a coercible
+** string); narrowing of the power operation is left to the optimizer.
+*/
+static void recff_math_pow(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (!tref_isnumber_str(arg[1]))
+    lj_trace_err(J, LJ_TRERR_BADTYPE);
+  res[0] = lj_opt_narrow_pow(J, lj_ir_tonum(J, arg[0]), arg[1], &rd->argv[1]);
+}
+
+/* Record math.min()/math.max(): left fold over all arguments.
+** rd->data holds the IR opcode (min or max variant).
+*/
+static void recff_math_minmax(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tonum(J, arg[0]);
+  uint32_t op = rd->data;
+  BCReg i;
+  for (i = 1; arg[i]; i++)
+    tr = emitir(IRTN(op), tr, lj_ir_tonum(J, arg[i]));
+  res[0] = tr;
+}
+
+/* -- Bit library fast functions ------------------------------------------ */
+
+/* Record unary bit.tobit, bit.bnot, bit.bswap. */
+/* rd->data holds the IR opcode; IR_TOBIT needs no further operation. */
+static void recff_bit_unary(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tobit(J, arg[0]);
+  res[0] = (rd->data == IR_TOBIT) ? tr : emitir(IRTI(rd->data), tr, 0);
+}
+
+/* Record N-ary bit.band, bit.bor, bit.bxor. */
+/* Left fold of the opcode in rd->data over all arguments. */
+static void recff_bit_nary(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tobit(J, arg[0]);
+  uint32_t op = rd->data;
+  BCReg i;
+  for (i = 1; arg[i]; i++)
+    tr = emitir(IRTI(op), tr, lj_ir_tobit(J, arg[i]));
+  res[0] = tr;
+}
+
+/* Record bit shifts. rd->data holds the shift IR opcode. */
+static void recff_bit_shift(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef tr = lj_ir_tobit(J, arg[0]);
+  TRef tsh = lj_ir_tobit(J, arg[1]);
+#if !LJ_TARGET_MASKEDSHIFT
+  /* Mask variable shift counts to 0..31 if the target CPU doesn't. */
+  if (!tref_isk(tsh))
+    tsh = emitir(IRTI(IR_BAND), tsh, lj_ir_kint(J, 31));
+#endif
+  res[0] = emitir(IRTI(rd->data), tr, tsh);
+}
+
+/* -- String library fast functions --------------------------------------- */
+
+/* Record string.len() as a load of the interned string's length field. */
+static void recff_string_len(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  res[0] = emitir(IRTI(IR_FLOAD), lj_ir_tostr(J, arg[0]), IRFL_STR_LEN);
+  UNUSED(rd);
+}
+
+/* Handle string.byte (rd->data = 0) and string.sub (rd->data = 1). */
+/* Normalizes the start/end indices (negative values count from the end) and
+** emits guards so the trace stays specialized to the same index regime.
+*/
+static void recff_string_range(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  TRef trstr = lj_ir_tostr(J, arg[0]);
+  TRef trlen = emitir(IRTI(IR_FLOAD), trstr, IRFL_STR_LEN);
+  TRef tr0 = lj_ir_kint(J, 0);
+  TRef trstart, trend;
+  GCstr *str = argv2str(J, &rd->argv[0]);
+  int32_t start, end;
+  if (rd->data) {  /* string.sub(str, start [,end]) */
+    trstart = lj_ir_toint(J, arg[1]);
+    trend = tref_isnil(arg[2]) ? lj_ir_kint(J, -1) : lj_ir_toint(J, arg[2]);
+    start = argv2int(J, &rd->argv[1]);
+    end = tref_isnil(arg[2]) ? -1 : argv2int(J, &rd->argv[2]);
+  } else {  /* string.byte(str [,start [,end]]) */
+    if (arg[1]) {
+      trstart = lj_ir_toint(J, arg[1]);
+      trend = tref_isnil(arg[2]) ? trstart : lj_ir_toint(J, arg[2]);
+      start = argv2int(J, &rd->argv[1]);
+      end = tref_isnil(arg[2]) ? start : argv2int(J, &rd->argv[2]);
+    } else {
+      trend = trstart = lj_ir_kint(J, 1);
+      end = start = 1;
+    }
+  }
+  /* Canonicalize the end index: negative counts from the string end. */
+  if (end < 0) {
+    emitir(IRTGI(IR_LT), trend, tr0);
+    trend = emitir(IRTI(IR_ADD), emitir(IRTI(IR_ADD), trlen, trend),
+		   lj_ir_kint(J, 1));
+    end = end+(int32_t)str->len+1;
+  } else if ((MSize)end <= str->len) {
+    emitir(IRTGI(IR_ULE), trend, trlen);
+  } else {
+    emitir(IRTGI(IR_GT), trend, trlen);
+    end = (int32_t)str->len;  /* Clamp to the string length. */
+    trend = trlen;
+  }
+  /* Canonicalize the start index to a 0-based offset, clamped to 0. */
+  if (start < 0) {
+    emitir(IRTGI(IR_LT), trstart, tr0);
+    trstart = emitir(IRTI(IR_ADD), trlen, trstart);
+    start = start+(int32_t)str->len;
+    emitir(start < 0 ? IRTGI(IR_LT) : IRTGI(IR_GE), trstart, tr0);
+    if (start < 0) {
+      trstart = tr0;
+      start = 0;
+    }
+  } else {
+    if (start == 0) {
+      emitir(IRTGI(IR_EQ), trstart, tr0);
+      trstart = tr0;
+    } else {
+      trstart = emitir(IRTI(IR_ADD), trstart, lj_ir_kint(J, -1));
+      emitir(IRTGI(IR_GE), trstart, tr0);
+      start--;
+    }
+  }
+  if (rd->data) {  /* Return string.sub result. */
+    if (end - start >= 0) {
+      /* Also handle empty range here, to avoid extra traces. */
+      TRef trptr, trslen = emitir(IRTI(IR_SUB), trend, trstart);
+      emitir(IRTGI(IR_GE), trslen, tr0);
+      trptr = emitir(IRT(IR_STRREF, IRT_PTR), trstr, trstart);
+      res[0] = emitir(IRT(IR_SNEW, IRT_STR), trptr, trslen);
+    } else {  /* Range underflow: return empty string. */
+      emitir(IRTGI(IR_LT), trend, trstart);
+      res[0] = lj_ir_kstr(J, lj_str_new(J->L, strdata(str), 0));
+    }
+  } else {  /* Return string.byte result(s). */
+    int32_t i, len = end - start;
+    if (len > 0) {
+      /* Guard the range length, then load each byte individually. */
+      TRef trslen = emitir(IRTI(IR_SUB), trend, trstart);
+      emitir(IRTGI(IR_EQ), trslen, lj_ir_kint(J, len));
+      if (res + len > J->slot + LJ_MAX_JSLOTS)
+	lj_trace_err(J, LJ_TRERR_STACKOV);
+      rd->nres = len;
+      for (i = 0; i < len; i++) {
+	TRef tmp = emitir(IRTI(IR_ADD), trstart, lj_ir_kint(J, i));
+	tmp = emitir(IRT(IR_STRREF, IRT_PTR), trstr, tmp);
+	res[i] = emitir(IRT(IR_XLOAD, IRT_U8), tmp, 0);
+      }
+    } else {  /* Empty range or range underflow: return no results. */
+      emitir(IRTGI(IR_LE), trend, trstart);
+      rd->nres = 0;
+    }
+  }
+}
+
+/* -- Table library fast functions ---------------------------------------- */
+
+/* Record table.getn() as an IR_TLEN on the table argument. */
+static void recff_table_getn(jit_State *J, TRef *res, RecordFFData *rd)
+{
+  if (tref_istab(arg[0])) {
+    res[0] = emitir(IRTI(IR_TLEN), arg[0], 0);
+  }  /* else: Interpreter will throw. */
+  UNUSED(rd);
+}
+
+/* -- Record calls and returns -------------------------------------------- */
+
+#undef arg
+
+#include "lj_recdef.h"
+
+/* Record return. */
+/* Unwinds recorded frames: pcall frames get true prepended to the results,
+** then the results are placed according to the calling instruction (Lua
+** frame) or the continuation (cont frame). Aborts on return to a frame
+** below the trace start (LJ_TRERR_NYIRETL).
+*/
+static void rec_ret(jit_State *J, BCReg rbase, int gotresults)
+{
+  TValue *frame = J->L->base - 1;
+  TRef *res = J->base + rbase;
+  J->tailcalled = 0;
+  while (frame_ispcall(frame)) {
+    BCReg cbase = (BCReg)frame_delta(frame);
+    lua_assert(J->baseslot > 1);
+    J->baseslot -= (BCReg)cbase;
+    J->base -= cbase;
+    *--res = TREF_TRUE;  /* Prepend true to results. */
+    gotresults++;
+    J->framedepth--;
+    frame = frame_prevd(frame);
+  }
+  if (J->framedepth-- <= 0)
+    lj_trace_err(J, LJ_TRERR_NYIRETL);
+  lua_assert(J->baseslot > 1);
+  if (frame_islua(frame)) {
+    /* Re-read the CALL instruction to find destination slot and result count. */
+    BCIns callins = *(J->pc = frame_pc(frame)-1);
+    ptrdiff_t nresults = bc_b(callins) ? (int)bc_b(callins)-1 : gotresults;
+    BCReg cbase = bc_a(callins);
+    int i;
+    for (i = 0; i < nresults; i++)
+      J->base[i-1] = i < gotresults ? res[i] : TREF_NIL;
+    J->maxslot = cbase+(BCReg)nresults;
+    J->baseslot -= cbase+1;
+    J->base -= cbase+1;
+  } else if (frame_iscont(frame)) {
+    ASMFunction cont = frame_contf(frame);
+    BCReg i, cbase = (BCReg)frame_delta(frame);
+    J->pc = frame_contpc(frame)-1;
+    J->baseslot -= (BCReg)cbase;
+    J->base -= cbase;
+    /* Shrink maxslot as much as possible after return from continuation. */
+    for (i = cbase-2; i > 0 && J->base[i] == 0; i--) ;
+    J->maxslot = i;
+    if (cont == lj_cont_ra) {
+      /* Copy result to destination slot. */
+      BCReg dst = bc_a(*J->pc);
+      J->base[dst] = res[0];
+      if (dst > J->maxslot) J->maxslot = dst+1;
+    } else if (cont == lj_cont_nop) {
+      /* Nothing to do here. */
+    } else if (cont == lj_cont_cat) {
+      lua_assert(0);  /* Concat continuation is not expected here. */
+    } else {
+      /* Result type already specialized. */
+      lua_assert(cont == lj_cont_condf || cont == lj_cont_condt);
+    }
+  } else {
+    lua_assert(0);  /* Unhandled frame type. */
+  }
+  lua_assert(J->baseslot >= 1);
+}
+
+/* Check unroll limits for calls.
+** Counts how often fn already occurs on the recorder's frame stack.
+** If fn is also the function the trace started in (directly recursive
+** hot call), the stricter recunroll limit applies; otherwise callunroll.
+** Throws a trace error when the respective limit is reached.
+*/
+static void check_call_unroll(jit_State *J, GCfunc *fn)
+{
+  TValue *first = J->L->base - J->baseslot;
+  TValue *frame = J->L->base - 1;
+  int count = 0;
+  while (frame > first) {  /* Count occurrences of fn on the frame stack. */
+    if (frame_func(frame) == fn)
+      count++;
+    if (frame_isvarg(frame))  /* Skip the extra vararg frame part. */
+      frame = frame_prevd(frame);
+    frame = frame_prev(frame);
+  }
+  if (frame_func(first) == fn && bc_op(J->cur.startins) == BC_CALL) {
+    if (count >= J->param[JIT_P_recunroll])
+      lj_trace_err(J, LJ_TRERR_NYIRECU);
+  } else {
+    if (count >= J->param[JIT_P_callunroll])
+      lj_trace_err(J, LJ_TRERR_CUNROLL);
+  }
+}
+
+/* Record call. Returns 0 for pending calls and 1 for resolved calls.
+** func is the slot holding the callee, cres the expected number of results
+** (or one of the negative CALLRES_* modes), nargs the number of arguments.
+** Resolves __call metamethods, specializes on the callee, and dispatches
+** either to Lua-function call setup or to a recff_* fast-function handler.
+*/
+static int rec_call(jit_State *J, BCReg func, int cres, int nargs)
+{
+  RecordFFData rd;
+  TRef *res = &J->base[func];
+  TValue *tv = &J->L->base[func];
+
+  if (tref_isfunc(res[0])) {  /* Regular function call. */
+    rd.fn = funcV(tv);
+    rd.argv = tv+1;
+  } else {  /* Otherwise resolve __call metamethod for called object. */
+    RecordIndex ix;
+    int i;
+    ix.tab = res[0];
+    copyTV(J->L, &ix.tabv, tv);
+    if (!rec_mm_lookup(J, &ix, MM_call) || !tref_isfunc(ix.mobj))
+      lj_trace_err(J, LJ_TRERR_NOMM);
+    /* Update the recorder state, but not the Lua stack. */
+    for (i = ++nargs; i > 0; i--)  /* Shift args up by one slot. */
+      res[i] = res[i-1];
+    res[0] = ix.mobj;
+    rd.fn = funcV(&ix.mobjv);
+    rd.argv = tv;  /* The called object is the 1st arg. */
+  }
+
+  /* Specialize to the runtime value of the called function. */
+  res[0] = emitir(IRTG(IR_FRAME, IRT_FUNC), res[0], lj_ir_kfunc(J, rd.fn));
+
+  if (isluafunc(rd.fn)) {  /* Record call to Lua function. */
+    GCproto *pt = funcproto(rd.fn);
+    if ((pt->flags & PROTO_NO_JIT))  /* jit.off() on this prototype? */
+      lj_trace_err(J, LJ_TRERR_CJITOFF);
+    if ((pt->flags & PROTO_IS_VARARG)) {
+      if (rd.fn->l.gate != lj_gate_lv)  /* Non-standard call gate. */
+	lj_trace_err(J, LJ_TRERR_NYILNKF);
+      lj_trace_err(J, LJ_TRERR_NYIVF);  /* Vararg calls are NYI. */
+    } else {
+      if (rd.fn->l.gate != lj_gate_lf)  /* Non-standard call gate. */
+	lj_trace_err(J, LJ_TRERR_NYILNKF);
+    }
+    check_call_unroll(J, rd.fn);
+    if (cres == CALLRES_TAILCALL) {
+      int i;
+      /* Tailcalls can form a loop, so count towards the loop unroll limit. */
+      if (++J->tailcalled > J->loopunroll)
+	lj_trace_err(J, LJ_TRERR_LUNROLL);
+      for (i = 0; i <= nargs; i++)  /* Move func + args down. */
+	J->base[i-1] = res[i];
+      /* Note: the new FRAME is now at J->base[-1] (even for slot #0). */
+    } else {  /* Regular call. */
+      J->base += func+1;
+      J->baseslot += func+1;
+      J->framedepth++;
+    }
+    if (J->baseslot + pt->framesize >= LJ_MAX_JSLOTS)
+      lj_trace_err(J, LJ_TRERR_STACKOV);
+    /* Fill up missing args with nil. */
+    while (nargs < pt->numparams)
+      J->base[nargs++] = TREF_NIL;
+    /* The remaining slots should never be read before they are written. */
+    J->maxslot = pt->numparams;
+    return 0;  /* No result yet. */
+  } else {  /* Record call to C function or fast function. */
+    uint32_t m = 0;
+    res[1+nargs] = 0;  /* Terminate the argument list for the handler. */
+    rd.nargs = nargs;
+    if (rd.fn->c.ffid < sizeof(recff_idmap)/sizeof(recff_idmap[0]))
+      m = recff_idmap[rd.fn->c.ffid];
+    rd.data = m & 0xff;  /* Low byte: handler-specific data. */
+    rd.cres = cres;
+    rd.nres = 1;  /* Default is one result. */
+    (recff_func[m >> 8])(J, res, &rd);  /* Call recff_* handler. */
+    cres = rd.cres;
+    if (cres >= 0) {
+      /* Caller takes fixed number of results: local a,b = f() */
+      J->maxslot = func + (BCReg)cres;
+      while (rd.nres < cres)  /* Fill up missing results with nil. */
+	res[rd.nres++] = TREF_NIL;
+    } else if (cres == CALLRES_MULTI) {
+      /* Caller takes any number of results: return 1,f() */
+      J->maxslot = func + (BCReg)rd.nres;
+    } else if (cres == CALLRES_TAILCALL) {
+      /* Tail call: return f() */
+      rec_ret(J, func, rd.nres);
+    } else if (cres == CALLRES_CONT) {
+      /* Note: immediately resolved continuations must not change J->maxslot. */
+      res[rd.nres] = TREF_NIL;  /* Turn 0 results into nil result. */
+    } else {
+      J->framedepth++;
+      lua_assert(cres == CALLRES_PENDING);
+      return 0;  /* Pending call, no result yet. */
+    }
+    return 1;  /* Result resolved immediately. */
+  }
+}
+
+/* -- Record allocations -------------------------------------------------- */
+
+/* Record a table allocation (BC_TNEW).
+** ah packs the array-size hint (low 11 bits) and hash-size hint (rest),
+** mirroring the bytecode operand encoding.
+*/
+static TRef rec_tnew(jit_State *J, uint32_t ah)
+{
+  uint32_t asize = ah & 0x7ff;
+  uint32_t hbits = ah >> 11;
+  /* NOTE(review): 0x7ff appears to be an escape value for a larger array
+  ** size hint (replaced by 0x801) — confirm against the TNEW encoding. */
+  if (asize == 0x7ff) asize = 0x801;
+  return emitir(IRT(IR_TNEW, IRT_TAB), asize, hbits);
+}
+
+/* -- Record bytecode ops ------------------------------------------------- */
+
+/* Optimize state after comparison.
+** cond is the runtime outcome of the comparison; the snapshot's extra map
+** entry is patched to the PC of the not-taken path so a side trace exiting
+** here resumes there instead of re-recording the comparison.
+*/
+static void optstate_comp(jit_State *J, int cond)
+{
+  BCIns jmpins = J->pc[1];  /* The JMP that always follows a comparison. */
+  const BCIns *npc = J->pc + 2 + (cond ? bc_j(jmpins) : 0);
+  SnapShot *snap = &J->cur.snap[J->cur.nsnap-1];
+  /* Avoid re-recording the comparison in side traces. */
+  J->cur.snapmap[snap->mapofs + snap->nslots] = u32ptr(npc);
+  J->needsnap = 1;
+  /* Shrink last snapshot if possible. */
+  if (bc_a(jmpins) < J->maxslot) {
+    J->maxslot = bc_a(jmpins);
+    lj_snap_shrink(J);
+  }
+}
+
+/* Record the next bytecode instruction (_before_ it's executed).
+** Main recorder dispatch: loads operands according to the bytecode operand
+** modes (keeping copies of their runtime values in ix.valv/tabv/keyv),
+** then switches on the opcode to emit IR and update the recorder state.
+*/
+void lj_record_ins(jit_State *J)
+{
+  cTValue *lbase;
+  RecordIndex ix;
+  const BCIns *pc;
+  BCIns ins;
+  BCOp op;
+  TRef ra, rb, rc;
+
+  /* Need snapshot before recording next bytecode (e.g. after a store). */
+  if (J->needsnap) {
+    J->needsnap = 0;
+    lj_snap_add(J);
+    J->mergesnap = 1;
+  }
+
+  /* Record only closed loops for root traces. */
+  pc = J->pc;
+  if (J->framedepth == 0 &&
+     (MSize)((char *)pc - (char *)J->bc_min) >= J->bc_extent)
+    lj_trace_err(J, LJ_TRERR_LLEAVE);
+
+#ifdef LUA_USE_ASSERT
+  rec_check_slots(J);
+  rec_check_ir(J);
+#endif
+
+  /* Keep a copy of the runtime values of var/num/str operands. */
+#define rav	(&ix.valv)
+#define rbv	(&ix.tabv)
+#define rcv	(&ix.keyv)
+
+  lbase = J->L->base;
+  ins = *pc;
+  op = bc_op(ins);
+  ra = bc_a(ins);
+  ix.val = 0;
+  switch (bcmode_a(op)) {
+  case BCMvar:
+    copyTV(J->L, rav, &lbase[ra]); ix.val = ra = getslot(J, ra); break;
+  default: break;  /* Handled later. */
+  }
+  rb = bc_b(ins);
+  rc = bc_c(ins);
+  switch (bcmode_b(op)) {
+  case BCMnone: rb = 0; rc = bc_d(ins); break;  /* Upgrade rc to 'rd'. */
+  case BCMvar:
+    copyTV(J->L, rbv, &lbase[rb]); ix.tab = rb = getslot(J, rb); break;
+  case BCMnum: { lua_Number n = J->pt->k.n[rb];
+    setnumV(rbv, n); ix.tab = rb = lj_ir_knumint(J, n); } break;
+  default: break;  /* Handled later. */
+  }
+  switch (bcmode_c(op)) {
+  case BCMvar:
+    copyTV(J->L, rcv, &lbase[rc]); ix.key = rc = getslot(J, rc); break;
+  case BCMpri: setitype(rcv, (int32_t)~rc); rc = TREF_PRI(IRT_NIL+rc); break;
+  case BCMnum: { lua_Number n = J->pt->k.n[rc];
+    setnumV(rcv, n); ix.key = rc = lj_ir_knumint(J, n); } break;
+  case BCMstr: { GCstr *s = strref(J->pt->k.gc[~rc]);
+    setstrV(J->L, rcv, s); ix.key = rc = lj_ir_kstr(J, s); } break;
+  default: break;  /* Handled later. */
+  }
+
+  switch (op) {
+
+  /* -- Comparison ops ---------------------------------------------------- */
+
+  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
+    /* Emit nothing for two numeric or string consts. */
+    if (!(tref_isk2(ra,rc) && tref_isnumber_str(ra) && tref_isnumber_str(rc))) {
+      IRType ta = tref_type(ra);
+      IRType tc = tref_type(rc);
+      int irop;
+      if (ta != tc) {
+	/* Widen mixed number/int comparisons to number/number comparison. */
+	if (ta == IRT_INT && tc == IRT_NUM) {
+	  ra = emitir(IRTN(IR_TONUM), ra, 0);
+	  ta = IRT_NUM;
+	} else if (ta == IRT_NUM && tc == IRT_INT) {
+	  rc = emitir(IRTN(IR_TONUM), rc, 0);
+	} else if (!((ta == IRT_FALSE || ta == IRT_TRUE) &&
+		     (tc == IRT_FALSE || tc == IRT_TRUE))) {
+	  break;  /* Interpreter will throw for two different types. */
+	}
+      }
+      lj_snap_add(J);
+      irop = (int)op - (int)BC_ISLT + (int)IR_LT;
+      if (ta == IRT_NUM) {
+	if ((irop & 1)) irop ^= 4;  /* ISGE/ISGT are unordered. */
+	if (!lj_ir_numcmp(numV(rav), numV(rcv), (IROp)irop)) irop ^= 5;
+      } else if (ta == IRT_INT) {
+	if (!lj_ir_numcmp(numV(rav), numV(rcv), (IROp)irop)) irop ^= 1;
+      } else if (ta == IRT_STR) {
+	if (!lj_ir_strcmp(strV(rav), strV(rcv), (IROp)irop)) irop ^= 1;
+      } else {
+	rec_mm_comp(J, &ix, (int)op);  /* Fall back to __lt/__le handling. */
+	break;
+      }
+      emitir(IRTG(irop, ta), ra, rc);
+      optstate_comp(J, ((int)op ^ irop) & 1);
+    }
+    break;
+
+  case BC_ISEQV: case BC_ISNEV:
+  case BC_ISEQS: case BC_ISNES:
+  case BC_ISEQN: case BC_ISNEN:
+  case BC_ISEQP: case BC_ISNEP:
+    /* Emit nothing for two non-table, non-udata consts. */
+    if (!(tref_isk2(ra, rc) && !(tref_istab(ra) || tref_isudata(ra)))) {
+      int diff;
+      lj_snap_add(J);
+      diff = rec_objcmp(J, ra, rc, rav, rcv);
+      if (diff == 1 && (tref_istab(ra) || tref_isudata(ra))) {
+	/* Only check __eq if different, but the same type (table or udata). */
+	rec_mm_equal(J, &ix, (int)op);
+	break;
+      }
+      optstate_comp(J, ((int)op & 1) == !diff);
+    }
+    break;
+
+  /* -- Unary test and copy ops ------------------------------------------- */
+
+  case BC_ISTC: case BC_ISFC:
+    if ((op & 1) == tref_istruecond(rc))
+      rc = 0;  /* Don't store if condition is not true. */
+    /* fallthrough */
+  case BC_IST: case BC_ISF:  /* Type specialization suffices. */
+    if (bc_a(pc[1]) < J->maxslot)
+      J->maxslot = bc_a(pc[1]);  /* Shrink used slots. */
+    break;
+
+  /* -- Unary ops --------------------------------------------------------- */
+
+  case BC_NOT:
+    /* Type specialization already forces const result. */
+    rc = tref_istruecond(rc) ? TREF_FALSE : TREF_TRUE;
+    break;
+
+  case BC_LEN:
+    if (tref_isstr(rc)) {
+      rc = emitir(IRTI(IR_FLOAD), rc, IRFL_STR_LEN);
+    } else if (tref_istab(rc)) {
+      rc = emitir(IRTI(IR_TLEN), rc, 0);
+    } else {
+      ix.tab = rc;
+      copyTV(J->L, &ix.tabv, &ix.keyv);
+      /* NOTE(review): assigning the IRType constant IRT_NIL to a TRef field
+      ** looks inconsistent with TREF_PRI(IRT_NIL) used elsewhere — confirm
+      ** that rec_mm_arith ignores ix.key for MM_len. */
+      ix.key = IRT_NIL;
+      setnilV(&ix.keyv);
+      rc = rec_mm_arith(J, &ix, MM_len);
+    }
+    break;
+
+  /* -- Arithmetic ops ---------------------------------------------------- */
+
+  case BC_UNM:
+    if (tref_isnumber_str(rc)) {
+      rc = lj_ir_tonum(J, rc);
+      rc = emitir(IRTN(IR_NEG), rc, lj_ir_knum_neg(J));
+    } else {
+      ix.tab = rc;
+      copyTV(J->L, &ix.tabv, &ix.keyv);
+      rc = rec_mm_arith(J, &ix, MM_unm);
+    }
+    break;
+
+  case BC_ADDNV: case BC_SUBNV: case BC_MULNV: case BC_DIVNV: case BC_MODNV:
+    /* NV variants have the operands swapped: exchange rb/rc and the saved
+    ** runtime values so the generic VN/VV handling below applies. */
+    ix.tab = rc; ix.key = rc = rb; rb = ix.tab;
+    copyTV(J->L, &ix.valv, &ix.tabv);
+    copyTV(J->L, &ix.tabv, &ix.keyv);
+    copyTV(J->L, &ix.keyv, &ix.valv);
+    if (op == BC_MODNV)
+      goto recmod;
+    /* fallthrough */
+  case BC_ADDVN: case BC_SUBVN: case BC_MULVN: case BC_DIVVN:
+  case BC_ADDVV: case BC_SUBVV: case BC_MULVV: case BC_DIVVV: {
+    MMS mm = bcmode_mm(op);
+    if (tref_isnumber_str(rb) && tref_isnumber_str(rc)) {
+      rb = lj_ir_tonum(J, rb);
+      rc = lj_ir_tonum(J, rc);
+      rc = emitir(IRTN((int)mm - (int)MM_add + (int)IR_ADD), rb, rc);
+    } else {
+      rc = rec_mm_arith(J, &ix, mm);
+    }
+    break;
+    }
+
+  case BC_MODVN: case BC_MODVV:
+  recmod:
+    if (tref_isnumber_str(rb) && tref_isnumber_str(rc))
+      rc = lj_opt_narrow_mod(J, rb, rc);
+    else
+      rc = rec_mm_arith(J, &ix, MM_mod);
+    break;
+
+  case BC_POW:
+    if (tref_isnumber_str(rb) && tref_isnumber_str(rc))
+      rc = lj_opt_narrow_pow(J, lj_ir_tonum(J, rb), rc, rcv);
+    else
+      rc = rec_mm_arith(J, &ix, MM_pow);
+    break;
+
+  /* -- Constant and move ops --------------------------------------------- */
+
+  case BC_KSTR: case BC_KNUM: case BC_KPRI: case BC_MOV:
+    break;  /* rc already holds the operand; stored by the BCMdst code below. */
+  case BC_KSHORT:
+    rc = lj_ir_kint(J, (int32_t)(int16_t)rc);
+    break;
+  case BC_KNIL:
+    while (ra <= rc)  /* Set the whole slot range ra..rc to nil. */
+      J->base[ra++] = TREF_NIL;
+    if (rc >= J->maxslot) J->maxslot = rc+1;
+    break;
+
+  /* -- Upvalue and function ops ------------------------------------------ */
+
+  case BC_UGET:
+    rc = rec_upvalue(J, rc, 0);
+    break;
+  case BC_USETV: case BC_USETS: case BC_USETN: case BC_USETP:
+    rec_upvalue(J, ra, rc);
+    break;
+
+  /* -- Table ops --------------------------------------------------------- */
+
+  case BC_GGET: case BC_GSET:
+    /* Globals are indexed loads/stores on the function's environment table. */
+    settabV(J->L, &ix.tabv, tabref(J->fn->l.env));
+    ix.tab = emitir(IRT(IR_FLOAD, IRT_TAB), getcurrf(J), IRFL_FUNC_ENV);
+    ix.idxchain = LJ_MAX_IDXCHAIN;
+    rc = rec_idx(J, &ix);
+    break;
+
+  case BC_TGETB: case BC_TSETB:
+    setintV(&ix.keyv, (int32_t)rc);
+    ix.key = lj_ir_kint(J, (int32_t)rc);
+    /* fallthrough */
+  case BC_TGETV: case BC_TGETS: case BC_TSETV: case BC_TSETS:
+    ix.idxchain = LJ_MAX_IDXCHAIN;
+    rc = rec_idx(J, &ix);
+    break;
+
+  case BC_TNEW:
+    rc = rec_tnew(J, rc);
+    break;
+  case BC_TDUP:
+    rc = emitir(IRT(IR_TDUP, IRT_TAB),
+		lj_ir_ktab(J, tabref(J->pt->k.gc[~rc])), 0);
+    break;
+
+  /* -- Calls and vararg handling ----------------------------------------- */
+
+  case BC_ITERC:
+    J->base[ra] = getslot(J, ra-3);
+    J->base[ra+1] = getslot(J, ra-2);
+    J->base[ra+2] = getslot(J, ra-1);
+    { /* Have to do the actual copy now because rec_call needs the values. */
+      TValue *b = &J->L->base[ra];
+      copyTV(J->L, b, b-3);
+      copyTV(J->L, b+1, b-2);
+      copyTV(J->L, b+2, b-1);
+    }
+    goto callop;
+
+  case BC_CALLMT:
+    rb = (TRef)(CALLRES_TAILCALL+1);
+    /* fallthrough */
+  case BC_CALLM:
+    /* L->top is set to L->base+ra+rc+NRESULTS-1+1, see lj_dispatch_ins(). */
+    rc = (BCReg)(J->L->top - J->L->base) - ra;
+    goto callop;
+
+  case BC_CALLT:
+    rb = (TRef)(CALLRES_TAILCALL+1);
+    /* fallthrough */
+  case BC_CALL:
+  callop:
+    if (rb == (TRef)(CALLRES_TAILCALL+1)) {  /* Tail call. */
+      /* NOTE(review): empty branch — presumably a placeholder; the tail-call
+      ** mode is already passed to rec_call via rb below. Confirm intent. */
+    }
+    rec_call(J, ra, (int)(rb-1), (int)(rc-1));
+    break;
+
+  /* -- Returns ----------------------------------------------------------- */
+
+  case BC_RETM:
+    /* L->top is set to L->base+ra+rc+NRESULTS-1, see lj_dispatch_ins(). */
+    rc = (BCReg)(J->L->top - J->L->base) - ra + 1;
+    /* fallthrough */
+  case BC_RET: case BC_RET0: case BC_RET1:
+    rec_ret(J, ra, (int)(rc-1));
+    break;
+
+  /* -- Loops and branches ------------------------------------------------ */
+
+  case BC_FORI:
+    if (rec_for(J, pc, 0) != LOOPEV_LEAVE)
+      J->loopref = J->cur.nins;
+    break;
+  case BC_JFORI:
+    lua_assert(bc_op(pc[(ptrdiff_t)rc-BCBIAS_J]) == BC_JFORL);
+    if (rec_for(J, pc, 0) != LOOPEV_LEAVE)  /* Link to existing loop. */
+      rec_stop(J, bc_d(pc[(ptrdiff_t)rc-BCBIAS_J]));
+    /* Continue tracing if the loop is not entered. */
+    break;
+
+  case BC_FORL:
+    rec_loop_interp(J, pc, rec_for(J, pc+((ptrdiff_t)rc-BCBIAS_J), 1));
+    break;
+  case BC_ITERL:
+    rec_loop_interp(J, pc, rec_iterl(J, *pc));
+    break;
+  case BC_LOOP:
+    rec_loop_interp(J, pc, rec_loop(J, ra));
+    break;
+
+  case BC_JFORL:
+    rec_loop_jit(J, rc, rec_for(J, pc+bc_j(J->trace[rc]->startins), 1));
+    break;
+  case BC_JITERL:
+    rec_loop_jit(J, rc, rec_iterl(J, J->trace[rc]->startins));
+    break;
+  case BC_JLOOP:
+    rec_loop_jit(J, rc, rec_loop(J, ra));
+    break;
+
+  case BC_IFORL:
+  case BC_IITERL:
+  case BC_ILOOP:
+    lj_trace_err_info(J, LJ_TRERR_LBLACKL);  /* Blacklisted loop variants. */
+    break;
+
+  case BC_JMP:
+    if (ra < J->maxslot)
+      J->maxslot = ra;  /* Shrink used slots. */
+    break;
+
+  case BC_CAT:
+  case BC_UCLO:
+  case BC_FNEW:
+  case BC_TSETM:
+  case BC_VARG:
+  default:
+    /* Not-yet-implemented bytecodes abort the trace. */
+    setintV(&J->errinfo, (int32_t)op);
+    lj_trace_err_info(J, LJ_TRERR_NYIBC);
+    break;
+  }
+
+  /* rc == 0 if we have no result yet, e.g. pending __index metamethod call. */
+  if (bcmode_a(op) == BCMdst && rc) {
+    J->base[ra] = rc;
+    if (ra >= J->maxslot) J->maxslot = ra+1;
+  }
+
+#undef rav
+#undef rbv
+#undef rcv
+
+  /* Limit the number of recorded IR instructions. */
+  if (J->cur.nins > REF_FIRST+(IRRef)J->param[JIT_P_maxrecord])
+    lj_trace_err(J, LJ_TRERR_TRACEOV);
+}
+
+/* -- Recording setup ----------------------------------------------------- */
+
+/* Setup recording for a FORL loop.
+** Loads stop/step as typed slots, guards the step direction when it is not
+** constant, and emits hoistable overflow checks when the index has been
+** narrowed to an integer.
+*/
+static void rec_setup_forl(jit_State *J, const BCIns *fori)
+{
+  BCReg ra = bc_a(*fori);
+  cTValue *forbase = &J->L->base[ra];
+  IRType t = (J->flags & JIT_F_OPT_NARROW) ? lj_opt_narrow_forl(forbase)
+					   : IRT_NUM;
+  TRef stop = fori_arg(J, fori-2, ra+FORL_STOP, t);
+  TRef step = fori_arg(J, fori-1, ra+FORL_STEP, t);
+  int dir = (0 <= numV(&forbase[FORL_STEP]));  /* 1 for an upward loop. */
+  lua_assert(bc_op(*fori) == BC_FORI || bc_op(*fori) == BC_JFORI);
+  if (!tref_isk(step)) {
+    /* Non-constant step: need a guard for the direction. */
+    TRef zero = (t == IRT_INT) ? lj_ir_kint(J, 0) : lj_ir_knum_zero(J);
+    emitir(IRTG(dir ? IR_GE : IR_LT, t), step, zero);
+    /* Add hoistable overflow checks for a narrowed FORL index. */
+    if (t == IRT_INT) {
+      if (tref_isk(stop)) {
+	/* Constant stop: optimize check away or to a range check for step. */
+	int32_t k = IR(tref_ref(stop))->i;
+	if (dir) {
+	  if (k > 0)
+	    emitir(IRTGI(IR_LE), step, lj_ir_kint(J, (int32_t)0x7fffffff-k));
+	} else {
+	  if (k < 0)
+	    emitir(IRTGI(IR_GE), step, lj_ir_kint(J, (int32_t)0x80000000-k));
+	}
+      } else {
+	/* Stop+step variable: need full overflow check (with dead result). */
+	emitir(IRTGI(IR_ADDOV), step, stop);
+      }
+    }
+  } else if (t == IRT_INT && !tref_isk(stop)) {
+    /* Constant step: optimize overflow check to a range check for stop. */
+    int32_t k = IR(tref_ref(step))->i;
+    k = (int32_t)(dir ? 0x7fffffff : 0x80000000) - k;
+    emitir(IRTGI(dir ? IR_LE : IR_GE), stop, lj_ir_kint(J, k));
+  }
+  J->base[ra+FORL_EXT] = sloadt(J, (int32_t)(ra+FORL_IDX), t, IRSLOAD_INHERIT);
+  J->maxslot = ra+FORL_EXT+1;
+}
+
+/* Setup recording for a root trace started by a hot loop.
+** Sets J->bc_min/J->bc_extent to the bytecode range of the loop body and
+** returns the PC at which recording starts (the loop body head).
+*/
+static const BCIns *rec_setup_root(jit_State *J)
+{
+  /* Determine the next PC and the bytecode range for the loop. */
+  const BCIns *pcj, *pc = J->pc;
+  BCIns ins = *pc;
+  BCReg ra = bc_a(ins);
+  switch (bc_op(ins)) {
+  case BC_FORL:
+    J->bc_extent = (MSize)(-bc_j(ins))*sizeof(BCIns);
+    pc += 1+bc_j(ins);  /* Backward jump target = loop body start. */
+    J->bc_min = pc;
+    break;
+  case BC_ITERL:
+    lua_assert(bc_op(pc[-1]) == BC_ITERC);
+    J->maxslot = ra + bc_b(pc[-1]) - 1;
+    J->bc_extent = (MSize)(-bc_j(ins))*sizeof(BCIns);
+    pc += 1+bc_j(ins);
+    lua_assert(bc_op(pc[-1]) == BC_JMP);
+    J->bc_min = pc;
+    break;
+  case BC_LOOP:
+    /* Only check BC range for real loops, but not for "repeat until true". */
+    pcj = pc + bc_j(ins);
+    ins = *pcj;
+    if (bc_op(ins) == BC_JMP && bc_j(ins) < 0) {
+      J->bc_min = pcj+1 + bc_j(ins);
+      J->bc_extent = (MSize)(-bc_j(ins))*sizeof(BCIns);
+    }
+    J->maxslot = ra;
+    pc++;
+    break;
+  default:
+    lua_assert(0);  /* Only loop bytecodes can start a hot-loop root trace. */
+    break;
+  }
+  return pc;
+}
+
+/* Setup recording for a side trace.
+** Reconstructs the recorder's slot state from the parent trace's snapshot
+** at the exit: constants are re-materialized, FRAMEs are re-established
+** (placeholders, no guard needed) and all other parent refs become
+** inherited SLOADs. De-dupes repeated refs via a Bloom filter.
+*/
+static void rec_setup_side(jit_State *J, Trace *T)
+{
+  SnapShot *snap = &T->snap[J->exitno];
+  IRRef2 *map = &T->snapmap[snap->mapofs];
+  BCReg s, nslots = snap->nslots;
+  BloomFilter seen = 0;
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = snap_ref(map[s]);
+    if (ref) {
+      IRIns *ir = &T->ir[ref];
+      TRef tr = 0;
+      /* The bloom filter avoids O(nslots^2) overhead for de-duping slots. */
+      if (bloomtest(seen, ref)) {
+	BCReg j;
+	for (j = 0; j < s; j++)
+	  if (snap_ref(map[j]) == ref) {
+	    if (ir->o == IR_FRAME && irt_isfunc(ir->t))
+	      J->baseslot = s+1;  /* Innermost frame determines the base. */
+	    tr = J->slot[j];
+	    goto dupslot;
+	  }
+      }
+      bloomset(seen, ref);
+      switch ((IROp)ir->o) {
+      case IR_KPRI: tr = TREF_PRI(irt_type(ir->t)); break;
+      case IR_KINT: tr = lj_ir_kint(J, ir->i); break;
+      case IR_KGC:  tr = lj_ir_kgc(J, ir_kgc(ir), irt_t(ir->t)); break;
+      case IR_KNUM: tr = lj_ir_knum_addr(J, ir_knum(ir)); break;
+      case IR_FRAME:  /* Placeholder FRAMEs don't need a guard. */
+	if (irt_isfunc(ir->t)) {
+	  J->baseslot = s+1;
+	  J->framedepth++;
+	  tr = lj_ir_kfunc(J, ir_kfunc(&T->ir[ir->op2]));
+	  tr = emitir_raw(IRT(IR_FRAME, IRT_FUNC), tr, tr);
+	} else {
+	  tr = lj_ir_kptr(J, mref(T->ir[ir->op2].ptr, void));
+	  tr = emitir_raw(IRT(IR_FRAME, IRT_PTR), tr, tr);
+	}
+	break;
+      case IR_SLOAD:  /* Inherited SLOADs don't need a guard. */
+	tr = emitir_raw(ir->ot & ~IRT_GUARD, s,
+	       (ir->op2&IRSLOAD_READONLY) | IRSLOAD_INHERIT|IRSLOAD_PARENT);
+	break;
+      default:  /* Parent refs are already typed and don't need a guard. */
+	tr = emitir_raw(IRT(IR_SLOAD, irt_type(ir->t)), s,
+			IRSLOAD_INHERIT|IRSLOAD_PARENT);
+	break;
+      }
+    dupslot:
+      J->slot[s] = tr;
+    }
+  }
+  J->base = J->slot + J->baseslot;
+  J->maxslot = nslots - J->baseslot;
+  lj_snap_add(J);
+}
+
+/* Setup for recording a new trace.
+** Resets per-trace recorder state, emits the fixed base/primitive IR refs,
+** then branches on root vs. side trace to establish the initial slots and
+** the first snapshot.
+*/
+void lj_record_setup(jit_State *J)
+{
+  uint32_t i;
+
+  /* Initialize state related to current trace. */
+  memset(J->slot, 0, sizeof(J->slot));
+  memset(J->chain, 0, sizeof(J->chain));
+  memset(J->bpropcache, 0, sizeof(J->bpropcache));
+
+  J->baseslot = 1;  /* Invoking function is at base[-1]. */
+  J->base = J->slot + J->baseslot;
+  J->maxslot = 0;
+  J->framedepth = 0;
+
+  J->instunroll = J->param[JIT_P_instunroll];
+  J->loopunroll = J->param[JIT_P_loopunroll];
+  J->tailcalled = 0;
+  J->loopref = 0;
+
+  J->bc_min = NULL;  /* Means no limit. */
+  J->bc_extent = ~(MSize)0;
+
+  /* Emit instructions for fixed references. Also triggers initial IR alloc. */
+  emitir_raw(IRT(IR_BASE, IRT_PTR), J->parent, J->exitno);
+  for (i = 0; i <= 2; i++) {  /* Pre-seed constants for nil/false/true. */
+    IRIns *ir = IR(REF_NIL-i);
+    ir->i = 0;
+    ir->t.irt = (uint8_t)(IRT_NIL+i);
+    ir->o = IR_KPRI;
+    ir->prev = 0;
+  }
+  J->cur.nk = REF_TRUE;
+
+  setgcref(J->cur.startpt, obj2gco(J->pt));
+  J->startpc = J->pc;
+  if (J->parent) {  /* Side trace. */
+    Trace *T = J->trace[J->parent];
+    TraceNo root = T->root ? T->root : J->parent;
+    J->cur.root = (uint16_t)root;
+    J->cur.startins = BCINS_AD(BC_JMP, 0, 0);
+    /* Check whether we could at least potentially form an extra loop. */
+    if (J->exitno == 0 && T->snap[0].nslots == 1 && T->snapmap[0] == 0) {
+      /* We can narrow a FORL for some side traces, too. */
+      if (J->pc > J->pt->bc && bc_op(J->pc[-1]) == BC_JFORI &&
+	  bc_d(J->pc[bc_j(J->pc[-1])-1]) == root) {
+	lj_snap_add(J);
+	rec_setup_forl(J, J->pc-1);
+	goto sidecheck;
+      }
+    } else {
+      J->startpc = NULL;  /* Prevent forming an extra loop. */
+    }
+    rec_setup_side(J, T);
+  sidecheck:
+    /* Stop immediately if the side-trace limits are already exceeded. */
+    if (J->trace[J->cur.root]->nchild >= J->param[JIT_P_maxside] ||
+	T->snap[J->exitno].count >= J->param[JIT_P_hotexit] +
+				    J->param[JIT_P_tryside])
+      rec_stop(J, TRACE_INTERP);
+  } else {  /* Root trace. */
+    J->cur.root = 0;
+    if (J->pc >= J->pt->bc) {  /* Not a hot CALL? */
+      J->cur.startins = *J->pc;
+      J->pc = rec_setup_root(J);
+      /* Note: the loop instruction itself is recorded at the end and not
+      ** at the start! So snapshot #0 needs to point to the *next* instruction.
+      */
+    } else {
+      J->cur.startins = BCINS_ABC(BC_CALL, 0, 0, 0);
+    }
+    lj_snap_add(J);
+    if (bc_op(J->cur.startins) == BC_FORL)
+      rec_setup_forl(J, J->pc-1);
+    if (1 + J->pt->framesize >= LJ_MAX_JSLOTS)
+      lj_trace_err(J, LJ_TRERR_STACKOV);
+  }
+}
+
+#undef IR
+#undef emitir_raw
+#undef emitir
+
+#endif

+ 17 - 0
src/lj_record.h

@@ -0,0 +1,17 @@
+/*
+** Trace recorder (bytecode -> SSA IR).
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_RECORD_H
+#define _LJ_RECORD_H
+
+#include "lj_obj.h"
+#include "lj_jit.h"
+
+#if LJ_HASJIT
+LJ_FUNC void lj_record_ins(jit_State *J);
+LJ_FUNC void lj_record_setup(jit_State *J);
+#endif
+
+#endif

+ 286 - 0
src/lj_snap.c

@@ -0,0 +1,286 @@
+/*
+** Snapshot handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#define lj_snap_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+
+#if LJ_HASJIT
+
+#include "lj_gc.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_ir.h"
+#include "lj_jit.h"
+#include "lj_iropt.h"
+#include "lj_trace.h"
+#include "lj_snap.h"
+#include "lj_target.h"
+
+/* Some local macros to save typing. Undef'd at the end. */
+#define IR(ref)		(&J->cur.ir[(ref)])
+
+/* -- Snapshot generation ------------------------------------------------- */
+
+/* NYI: Snapshots are in need of a redesign. The current storage model for
+** snapshot maps is too wasteful. They could be compressed (1D or 2D) and
+** made more flexible at the same time. Iterators should no longer need to
+** skip unmodified slots. IR_FRAME should be eliminated, too.
+*/
+
+/* Add all modified slots to the snapshot.
+** A slot whose ref is a plain SLOAD of itself (and not inherited) is
+** unmodified and stored as 0, so restores can skip it.
+*/
+static void snapshot_slots(jit_State *J, IRRef2 *map, BCReg nslots)
+{
+  BCReg s;
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = tref_ref(J->slot[s]);
+    if (ref) {
+      IRIns *ir = IR(ref);
+      if (ir->o == IR_SLOAD && ir->op1 == s && !(ir->op2 & IRSLOAD_INHERIT))
+	ref = 0;  /* Slot is unmodified: drop it from the snapshot. */
+    }
+    map[s] = (IRRef2)ref;
+  }
+}
+
+/* Add frame links at the end of the snapshot.
+** Stores the current PC first, then walks the frame chain down to the
+** trace's start base, recording the per-frame-type link data. Returns the
+** number of entries written. Continuation frames take two entries.
+*/
+static MSize snapshot_framelinks(jit_State *J, IRRef2 *map)
+{
+  cTValue *frame = J->L->base - 1;
+  cTValue *lim = J->L->base - J->baseslot;
+  MSize f = 0;
+  map[f++] = u32ptr(J->pc);  /* The current PC is always the first entry. */
+  while (frame > lim) {
+    if (frame_islua(frame)) {
+      map[f++] = u32ptr(frame_pc(frame));
+      frame = frame_prevl(frame);
+    } else if (frame_ispcall(frame)) {
+      map[f++] = (uint32_t)frame_ftsz(frame);
+      frame = frame_prevd(frame);
+    } else if (frame_iscont(frame)) {
+      map[f++] = (uint32_t)frame_ftsz(frame);
+      map[f++] = u32ptr(frame_contpc(frame));
+      frame = frame_prevd(frame);
+    } else {
+      lua_assert(0);  /* No other frame types expected on the chain. */
+    }
+  }
+  return f;
+}
+
+/* Take a snapshot of the current stack.
+** Grows the shared snapshot map buffer if needed, then writes the slot
+** refs followed by the frame links starting at map offset nsnapmap and
+** fills in the SnapShot header fields.
+*/
+static void snapshot_stack(jit_State *J, SnapShot *snap, MSize nsnapmap)
+{
+  BCReg nslots = J->baseslot + J->maxslot;
+  MSize nsm, nframelinks;
+  IRRef2 *p;
+  /* Conservative estimate. Continuation frames need 2 slots. */
+  nsm = nsnapmap + nslots + (uint32_t)J->framedepth*2+1;
+  if (LJ_UNLIKELY(nsm > J->sizesnapmap)) {  /* Need to grow snapshot map? */
+    if (nsm < 2*J->sizesnapmap)
+      nsm = 2*J->sizesnapmap;
+    else if (nsm < 64)
+      nsm = 64;  /* Minimum initial size. */
+    J->snapmapbuf = (IRRef2 *)lj_mem_realloc(J->L, J->snapmapbuf,
+		      J->sizesnapmap*sizeof(IRRef2), nsm*sizeof(IRRef2));
+    J->cur.snapmap = J->snapmapbuf;
+    J->sizesnapmap = nsm;
+  }
+  p = &J->cur.snapmap[nsnapmap];
+  snapshot_slots(J, p, nslots);
+  nframelinks = snapshot_framelinks(J, p + nslots);
+  J->cur.nsnapmap = (uint16_t)(nsnapmap + nslots + nframelinks);
+  snap->mapofs = (uint16_t)nsnapmap;
+  snap->ref = (IRRef1)J->cur.nins;  /* Snapshot is valid at this IR ref. */
+  snap->nslots = (uint8_t)nslots;
+  snap->nframelinks = (uint8_t)nframelinks;
+  snap->count = 0;
+}
+
+/* Add or merge a snapshot.
+** Re-uses (overwrites) the previous snapshot when no instruction (or, if
+** a merge was requested, no guard) was emitted since it was taken;
+** otherwise appends a new snapshot, growing the buffer as needed.
+*/
+void lj_snap_add(jit_State *J)
+{
+  MSize nsnap = J->cur.nsnap;
+  MSize nsnapmap = J->cur.nsnapmap;
+  /* Merge if no ins. inbetween or if requested and no guard inbetween. */
+  if (J->mergesnap ? !irt_isguard(J->guardemit) :
+      (nsnap > 0 && J->cur.snap[nsnap-1].ref == J->cur.nins)) {
+    nsnapmap = J->cur.snap[--nsnap].mapofs;  /* Overwrite previous snapshot. */
+  } else {
+    /* Need to grow snapshot buffer? */
+    if (LJ_UNLIKELY(nsnap >= J->sizesnap)) {
+      MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
+      if (nsnap >= maxsnap)
+	lj_trace_err(J, LJ_TRERR_SNAPOV);
+      lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
+      J->cur.snap = J->snapbuf;
+    }
+    J->cur.nsnap = (uint16_t)(nsnap+1);
+  }
+  J->mergesnap = 0;
+  J->guardemit.irt = 0;
+  snapshot_stack(J, &J->cur.snap[nsnap], nsnapmap);
+}
+
+/* Shrink last snapshot to the current number of used slots.
+** The frame links stored after the slot refs are moved down so the
+** snapshot map stays contiguous.
+*/
+void lj_snap_shrink(jit_State *J)
+{
+  BCReg nslots = J->baseslot + J->maxslot;
+  SnapShot *snap = &J->cur.snap[J->cur.nsnap-1];
+  IRRef2 *oflinks = &J->cur.snapmap[snap->mapofs + snap->nslots];
+  IRRef2 *nflinks = &J->cur.snapmap[snap->mapofs + nslots];
+  uint32_t s, nframelinks = snap->nframelinks;
+  lua_assert(nslots < snap->nslots);  /* Only ever shrinks. */
+  snap->nslots = (uint8_t)nslots;
+  J->cur.nsnapmap = (uint16_t)(snap->mapofs + nslots + nframelinks);
+  for (s = 0; s < nframelinks; s++)  /* Move frame links down. */
+    nflinks[s] = oflinks[s];
+}
+
+/* -- Snapshot access ----------------------------------------------------- */
+
+/* Initialize a Bloom Filter with all renamed refs.
+** There are very few renames (often none), so the filter has
+** very few bits set. This makes it suitable for negative filtering.
+** Only renames at or below snapshot lim are considered. Relies on RENAME
+** instructions being clustered at the end of the IR.
+*/
+static BloomFilter snap_renamefilter(Trace *T, SnapNo lim)
+{
+  BloomFilter rfilt = 0;
+  IRIns *ir;
+  for (ir = &T->ir[T->nins-1]; ir->o == IR_RENAME; ir--)
+    if (ir->op2 <= lim)
+      bloomset(rfilt, ir->op1);
+  return rfilt;
+}
+
+/* Process matching renames to find the original RegSP.
+** Walks the trailing RENAME instructions; a rename applies when it names
+** ref and happened at or below snapshot lim. Returns the (possibly
+** unchanged) RegSP.
+*/
+static RegSP snap_renameref(Trace *T, SnapNo lim, IRRef ref, RegSP rs)
+{
+  IRIns *ir;
+  for (ir = &T->ir[T->nins-1]; ir->o == IR_RENAME; ir--)
+    if (ir->op1 == ref && ir->op2 <= lim)
+      rs = ir->prev;  /* RENAME stashes the old RegSP in prev. */
+  return rs;
+}
+
+/* Convert a snapshot into a linear slot -> RegSP map.
+** Constant refs are skipped (they have no register/spill assignment);
+** renamed refs are resolved back to the RegSP valid at this snapshot.
+*/
+void lj_snap_regspmap(uint16_t *rsmap, Trace *T, SnapNo snapno)
+{
+  SnapShot *snap = &T->snap[snapno];
+  BCReg s, nslots = snap->nslots;
+  IRRef2 *map = &T->snapmap[snap->mapofs];
+  BloomFilter rfilt = snap_renamefilter(T, snapno);
+  for (s = 0; s < nslots; s++) {
+    IRRef ref = snap_ref(map[s]);
+    if (!irref_isk(ref)) {
+      IRIns *ir = &T->ir[ref];
+      uint32_t rs = ir->prev;
+      if (bloomtest(rfilt, ref))  /* Possibly renamed? Resolve exactly. */
+	rs = snap_renameref(T, snapno, ref, rs);
+      rsmap[s] = (uint16_t)rs;
+    }
+  }
+}
+
+/* Restore interpreter state from exit state with the help of a snapshot.
+** exptr points to the machine exit state (registers and spill slots) saved
+** by the exit handler. Reconstructs each stack slot from its snapshot ref:
+** constants directly, others from register or spill slot, FRAME refs by
+** rewriting the frame chain. Finally fixes up L->base/L->top and J->pc.
+*/
+void lj_snap_restore(jit_State *J, void *exptr)
+{
+  ExitState *ex = (ExitState *)exptr;
+  SnapNo snapno = J->exitno;  /* For now, snapno == exitno. */
+  Trace *T = J->trace[J->parent];
+  SnapShot *snap = &T->snap[snapno];
+  BCReg s, nslots = snap->nslots;
+  IRRef2 *map = &T->snapmap[snap->mapofs];
+  IRRef2 *flinks = map + nslots + snap->nframelinks;
+  TValue *o, *newbase, *ntop;
+  BloomFilter rfilt = snap_renamefilter(T, snapno);
+  lua_State *L = J->L;
+
+  /* Make sure the stack is big enough for the slots from the snapshot. */
+  if (L->base + nslots >= L->maxstack) {
+    L->top = curr_topL(L);
+    lj_state_growstack(L, nslots - curr_proto(L)->framesize);
+  }
+
+  /* Fill stack slots with data from the registers and spill slots. */
+  newbase = NULL;
+  ntop = L->base;
+  for (s = 0, o = L->base-1; s < nslots; s++, o++) {
+    IRRef ref = snap_ref(map[s]);
+    if (ref) {
+      IRIns *ir = &T->ir[ref];
+      if (irref_isk(ref)) {  /* Restore constant slot. */
+	lj_ir_kvalue(L, o, ir);
+      } else {
+	IRType1 t = ir->t;
+	RegSP rs = ir->prev;
+	if (LJ_UNLIKELY(bloomtest(rfilt, ref)))  /* Possibly renamed? */
+	  rs = snap_renameref(T, snapno, ref, rs);
+	if (ra_hasspill(regsp_spill(rs))) {  /* Restore from spill slot. */
+	  int32_t *sps = &ex->spill[regsp_spill(rs)];
+	  if (irt_isinteger(t)) {
+	    setintV(o, *sps);
+	  } else if (irt_isnum(t)) {
+	    o->u64 = *(uint64_t *)sps;
+	  } else {
+	    lua_assert(!irt_ispri(t));  /* PRI refs never have a spill slot. */
+	    setgcrefi(o->gcr, *sps);
+	    setitype(o, irt_toitype(t));
+	  }
+	} else if (ra_hasreg(regsp_reg(rs))) {  /* Restore from register. */
+	  Reg r = regsp_reg(rs);
+	  if (irt_isinteger(t)) {
+	    setintV(o, ex->gpr[r-RID_MIN_GPR]);
+	  } else if (irt_isnum(t)) {
+	    setnumV(o, ex->fpr[r-RID_MIN_FPR]);
+	  } else {
+	    if (!irt_ispri(t))
+	      setgcrefi(o->gcr, ex->gpr[r-RID_MIN_GPR]);
+	    setitype(o, irt_toitype(t));
+	  }
+	} else {  /* Restore frame slot. */
+	  lua_assert(ir->o == IR_FRAME);
+	  /* This works for both PTR and FUNC IR_FRAME. */
+	  setgcrefp(o->fr.func, mref(T->ir[ir->op2].ptr, void));
+	  if (s != 0)  /* Do not overwrite link to previous frame. */
+	    o->fr.tp.ftsz = (int32_t)*--flinks;
+	  if (irt_isfunc(ir->t)) {
+	    GCfunc *fn = gco2func(gcref(T->ir[ir->op2].gcr));
+	    if (isluafunc(fn)) {
+	      TValue *fs;
+	      newbase = o+1;  /* Track the innermost new Lua frame base. */
+	      fs = newbase + funcproto(fn)->framesize;
+	      if (fs > ntop) ntop = fs; /* Update top for newly added frames. */
+	    }
+	  }
+	}
+      }
+    } else if (newbase) {
+      setnilV(o);  /* Clear unreferenced slots of newly added frames. */
+    }
+  }
+  if (newbase) {  /* Clear remainder of newly added frames. */
+    L->base = newbase;
+    if (ntop >= L->maxstack) {  /* Need to grow the stack again. */
+      MSize need = (MSize)(ntop - o);
+      L->top = o;
+      lj_state_growstack(L, need);
+      o = L->top;
+      ntop = o + need;
+    }
+    L->top = curr_topL(L);
+    for (; o < ntop; o++)
+      setnilV(o);
+  } else {  /* Must not clear slots of existing frame. */
+    L->top = curr_topL(L);
+  }
+  lua_assert(map + nslots == flinks-1);  /* All frame links consumed but PC. */
+  J->pc = (const BCIns *)(uintptr_t)(*--flinks);
+}
+
+#undef IR
+
+#endif

+ 19 - 0
src/lj_snap.h

@@ -0,0 +1,19 @@
/*
** Snapshot handling.
** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
*/

#ifndef _LJ_SNAP_H
#define _LJ_SNAP_H

#include "lj_obj.h"
#include "lj_jit.h"

#if LJ_HASJIT
/* Record a snapshot of the current slot/frame state for the trace recorder. */
LJ_FUNC void lj_snap_add(jit_State *J);
/* NOTE(review): presumably discards redundant snapshot data -- confirm in lj_snap.c. */
LJ_FUNC void lj_snap_shrink(jit_State *J);
/* Fill rsmap with the register/spill-slot assignments of snapshot snapno of T. */
LJ_FUNC void lj_snap_regspmap(uint16_t *rsmap, Trace *T, SnapNo snapno);
/* Restore the Lua stack from a snapshot at a trace exit (exptr = exit state). */
LJ_FUNC void lj_snap_restore(jit_State *J, void *exptr);
#endif

#endif

+ 255 - 0
src/lj_state.c

@@ -0,0 +1,255 @@
+/*
+** State and stack handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+**
+** Portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lj_state_c
+#define LUA_CORE
+
+#include "lj_obj.h"
+#include "lj_gc.h"
+#include "lj_err.h"
+#include "lj_str.h"
+#include "lj_tab.h"
+#include "lj_func.h"
+#include "lj_meta.h"
+#include "lj_state.h"
+#include "lj_frame.h"
+#include "lj_trace.h"
+#include "lj_dispatch.h"
+#include "lj_vm.h"
+#include "lj_lex.h"
+#include "lj_alloc.h"
+
/* -- Stack handling ------------------------------------------------------ */

/* Stack sizes. */
#define LJ_STACK_MIN	LUA_MINSTACK	/* Min. stack size. */
#define LJ_STACK_MAX	LUAI_MAXSTACK	/* Max. stack size. */
#define LJ_STACK_START	(2*LJ_STACK_MIN)	/* Starting stack size. */
#define LJ_STACK_MAXEX	(LJ_STACK_MAX + 1 + LJ_STACK_EXTRA)	/* Hard limit incl. top slot and extra slots. */

/* Explanation of LJ_STACK_EXTRA:
**
** Calls to metamethods store their arguments beyond the current top
** without checking for the stack limit. This avoids stack resizes which
** would invalidate passed TValue pointers. The stack check is performed
** later by the call gate. This can safely resize the stack or raise an
** error. Thus we need some extra slots beyond the current stack limit.
**
** Most metamethods need 4 slots above top (cont, mobj, arg1, arg2) plus
** one extra slot if mobj is not a function. Only lj_meta_tset needs 5
** slots above top, but then mobj is always a function. So we can get by
** with 5 extra slots.
*/
+
/* Resize stack slots and adjust all pointers into the stack in the state. */
static void resizestack(lua_State *L, MSize n)
{
  TValue *oldst = L->stack;
  ptrdiff_t delta;
  MSize realsize = n + 1 + LJ_STACK_EXTRA;  /* n usable slots + top slot + extra slots. */
  GCobj *up;
  lua_assert((MSize)(L->maxstack-L->stack) == L->stacksize-LJ_STACK_EXTRA-1);
  lj_mem_reallocvec(L, L->stack, L->stacksize, realsize, TValue);
  delta = (char *)L->stack - (char *)oldst;  /* Byte distance the stack moved. */
  L->maxstack = L->stack + n;
  L->stacksize = realsize;
  /* Relocate every pointer into the (possibly moved) stack by delta. */
  L->base = (TValue *)((char *)L->base + delta);
  L->top = (TValue *)((char *)L->top + delta);
  for (up = gcref(L->openupval); up != NULL; up = gcnext(up))
    gco2uv(up)->v = (TValue *)((char *)gco2uv(up)->v + delta);
  /* If this thread is the one the JIT is running, fix its recorded base, too. */
  if (obj2gco(L) == gcref(G(L)->jit_L))
    setmref(G(L)->jit_base, mref(G(L)->jit_base, char) + delta);
}
+
+/* Relimit stack after error, in case the limit was overdrawn. */
+void lj_state_relimitstack(lua_State *L)
+{
+  if (L->stacksize > LJ_STACK_MAXEX && L->top - L->stack < LJ_STACK_MAX-1)
+    resizestack(L, LJ_STACK_MAX);
+}
+
+/* Try to shrink the stack (called from GC). */
+void lj_state_shrinkstack(lua_State *L, MSize used)
+{
+  if (L->stacksize > LJ_STACK_MAXEX)
+    return;  /* Avoid stack shrinking while handling stack overflow. */
+  if (4*used < L->stacksize &&
+      2*(LJ_STACK_START+LJ_STACK_EXTRA) < L->stacksize &&
+      obj2gco(L) != gcref(G(L)->jit_L))  /* Don't shrink stack of live trace. */
+    resizestack(L, L->stacksize >> 1);
+}
+
/* Grow the stack by at least 'need' extra slots (at least doubles it). */
void lj_state_growstack(lua_State *L, MSize need)
{
  if (L->stacksize > LJ_STACK_MAXEX)  /* overflow while handling overflow? */
    lj_err_throw(L, LUA_ERRERR);
  /* Grow by max(need, stacksize), i.e. double unless 'need' is larger. */
  resizestack(L, L->stacksize + (need > L->stacksize ? need : L->stacksize));
  if (L->stacksize > LJ_STACK_MAXEX) {  /* Grew beyond the hard limit? */
    if (curr_funcisL(L)) {  /* Clear slots of incomplete Lua frame. */
      TValue *top = curr_topL(L);
      while (--top >= L->top) setnilV(top);
    }
    lj_err_msg(L, LJ_ERR_STKOV);  /* ... to allow L->top = curr_topL(L). */
  }
}
+
/* Grow the stack by a single slot (out-of-line helper for incr_top()). */
void lj_state_growstack1(lua_State *L)
{
  lj_state_growstack(L, 1);
}
+
/* Allocate basic stack for new state. */
static void stack_init(lua_State *L1, lua_State *L)
{
  /* Allocation is charged to L, which may differ from L1 for new threads. */
  L1->stack = lj_mem_newvec(L, LJ_STACK_START + LJ_STACK_EXTRA, TValue);
  L1->stacksize = LJ_STACK_START + LJ_STACK_EXTRA;
  L1->top = L1->stack;
  L1->maxstack = L1->stack+(L1->stacksize - LJ_STACK_EXTRA)-1;
  setthreadV(L1, L1->top, L1);  /* needed for curr_funcisL() on empty stack */
  setnilV(L1->top);  /* but clear its type */
  L1->base = ++L1->top;  /* Slot 0 is the anchor; base starts at slot 1. */
}
+
+/* -- State handling ------------------------------------------------------ */
+
/* Open parts that may cause memory-allocation errors.
** Runs under lj_vm_cpcall(); returning NULL signals success.
*/
static TValue *cpluaopen(lua_State *L, lua_CFunction dummy, void *ud)
{
  global_State *g = G(L);
  UNUSED(dummy);
  UNUSED(ud);
  stack_init(L, L);
  /* NOBARRIER: State initialization, all objects are white. */
  setgcref(L->env, obj2gco(lj_tab_new(L, 0, LJ_MIN_GLOBAL)));  /* Globals table. */
  settabV(L, registry(L), lj_tab_new(L, 0, LJ_MIN_REGISTRY));  /* Registry table. */
  lj_str_resize(L, LJ_MIN_STRTAB-1);  /* Initial string interning table. */
  lj_meta_init(L);
  lj_lex_init(L);
  fixstring(lj_err_str(L, LJ_ERR_ERRMEM));  /* Preallocate memory error msg. */
  g->gc.threshold = 4*g->gc.total;  /* Don't trigger a GC cycle right away. */
  return NULL;
}
+
/* Free all resources of a state; the GG_State block itself is freed last. */
static void close_state(lua_State *L)
{
  global_State *g = G(L);
#ifndef LUAJIT_USE_SYSMALLOC
  if (g->allocf == lj_alloc_f) {
    /* Builtin allocator: drop the whole arena at once, no per-object frees. */
    lj_alloc_destroy(g->allocd);
  } else
#endif
  {
    lj_func_closeuv(L, L->stack);
    lj_gc_freeall(g);
    lua_assert(gcref(g->gc.root) == obj2gco(L));  /* Only main thread left. */
    lua_assert(g->strnum == 0);  /* All strings have been freed. */
    lj_trace_freestate(g);
    lj_mem_freevec(g, g->strhash, g->strmask+1, GCstr *);
    lj_str_freebuf(g, &g->tmpbuf);
    lj_mem_freevec(g, L->stack, L->stacksize, TValue);
    lua_assert(g->gc.total == sizeof(GG_State));  /* Nothing may leak. */
    g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0);  /* Free GG_State. */
  }
}
+
+LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
+{
+  GG_State *GG = cast(GG_State *, f(ud, NULL, 0, sizeof(GG_State)));
+  lua_State *L = &GG->L;
+  global_State *g = &GG->g;
+  if (GG == NULL) return NULL;
+  memset(GG, 0, sizeof(GG_State));
+  L->gct = ~LJ_TTHREAD;
+  L->marked = LJ_GC_WHITE0 | LJ_GC_FIXED | LJ_GC_SFIXED;  /* Prevent free. */
+  L->dummy_ffid = FF_C;
+  setmref(L->glref, g);
+  g->gc.currentwhite = LJ_GC_WHITE0 | LJ_GC_FIXED;
+  g->allocf = f;
+  g->allocd = ud;
+  setgcref(g->mainthref, obj2gco(L));
+  setgcref(g->uvhead.prev, obj2gco(&g->uvhead));
+  setgcref(g->uvhead.next, obj2gco(&g->uvhead));
+  g->strmask = ~(MSize)0;
+  setnilV(registry(L));
+  setnilV(&g->nilnode.val);
+  setnilV(&g->nilnode.key);
+  lj_str_initbuf(L, &g->tmpbuf);
+  g->gc.state = GCSpause;
+  setgcref(g->gc.root, obj2gco(L));
+  g->gc.sweep = &g->gc.root;
+  g->gc.total = sizeof(GG_State);
+  g->gc.pause = LUAI_GCPAUSE;
+  g->gc.stepmul = LUAI_GCMUL;
+  lj_dispatch_init((GG_State *)L);
+  L->status = LUA_ERRERR+1;  /* Avoid touching the stack upon memory error. */
+  if (lj_vm_cpcall(L, cpluaopen, NULL, NULL) != 0) {
+    /* Memory allocation error: free partial state. */
+    close_state(L);
+    return NULL;
+  }
+  L->status = 0;
+  return L;
+}
+
/* Protected callback for lua_close(): run pending userdata finalizers. */
static TValue *cpfinalize(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(dummy);
  UNUSED(ud);
  lj_gc_finalizeudata(L);
  /* Frame pop omitted. */
  return NULL;
}
+
/* Close the state: run all finalizers, then free everything. */
LUA_API void lua_close(lua_State *L)
{
  global_State *g = G(L);
  L = mainthread(g);  /* Only the main thread can be closed. */
  lj_func_closeuv(L, L->stack);
  lj_gc_separateudata(g, 1);  /* Separate udata which have GC metamethods. */
#if LJ_HASJIT
  G2J(g)->flags &= ~JIT_F_ON;  /* Turn the JIT compiler off ... */
  G2J(g)->state = LJ_TRACE_IDLE;
  lj_dispatch_update(g);  /* ... and reroute dispatch accordingly. */
#endif
  /* Retry finalization until the protected call completes without error. */
  do {
    hook_enter(g);  /* NOTE(review): presumably suppresses hooks during shutdown -- confirm. */
    L->status = 0;
    L->cframe = NULL;
    L->base = L->top = L->stack + 1;  /* Reset to an empty stack. */
  } while (lj_vm_cpcall(L, cpfinalize, NULL, NULL) != 0);
  close_state(L);
}
+
/* Create a new thread (coroutine) sharing the global state of L. */
lua_State *lj_state_new(lua_State *L)
{
  lua_State *L1 = lj_mem_newobj(L, lua_State);
  L1->gct = ~LJ_TTHREAD;
  L1->dummy_ffid = FF_C;
  L1->status = 0;
  L1->stacksize = 0;
  L1->stack = NULL;  /* Cleared first, so a failing stack_init leaves a consistent object. */
  L1->cframe = NULL;
  /* NOBARRIER: The lua_State is new (marked white). */
  setgcrefnull(L1->openupval);
  setmrefr(L1->glref, L->glref);  /* Share the global state. */
  setgcrefr(L1->env, L->env);  /* Inherit the environment table. */
  stack_init(L1, L);  /* init stack */
  lua_assert(iswhite(obj2gco(L1)));
  return L1;
}
+
/* Free a thread object. Must never be called for the main thread. */
void LJ_FASTCALL lj_state_free(global_State *g, lua_State *L)
{
  lua_assert(L != mainthread(g));
  lj_func_closeuv(L, L->stack);  /* Close open upvalues before the stack goes away. */
  lua_assert(gcref(L->openupval) == NULL);
  lj_mem_freevec(g, L->stack, L->stacksize, TValue);
  lj_mem_freet(g, L);
}
+

+ 31 - 0
src/lj_state.h

@@ -0,0 +1,31 @@
+/*
+** State and stack handling.
+** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
+*/
+
+#ifndef _LJ_STATE_H
+#define _LJ_STATE_H
+
+#include "lj_obj.h"
+
+#define incr_top(L) \
+  (++L->top >= L->maxstack && (lj_state_growstack1(L), 0))
+
+#define savestack(L, p)		((char *)(p) - (char *)L->stack)
+#define restorestack(L, n)	((TValue *)((char *)L->stack + (n)))
+
+LJ_FUNC void lj_state_relimitstack(lua_State *L);
+LJ_FUNC void lj_state_shrinkstack(lua_State *L, MSize used);
+LJ_FUNCA void lj_state_growstack(lua_State *L, MSize need);
+LJ_FUNCA void lj_state_growstack1(lua_State *L);
+
+static LJ_AINLINE void lj_state_checkstack(lua_State *L, MSize need)
+{
+  if ((MSize)((char *)L->maxstack-(char *)L->top) <= need*(MSize)sizeof(TValue))
+    lj_state_growstack(L, need);
+}
+
+LJ_FUNC lua_State *lj_state_new(lua_State *L);
+LJ_FUNC void LJ_FASTCALL lj_state_free(global_State *g, lua_State *L);
+
+#endif

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.