Browse Source

Parse Unicode string escape \u{XX...}.

Thanks to drbo.
Mike Pall 10 years ago
parent
commit
55c3b29f7b
3 changed files with 35 additions and 0 deletions
  1. doc/changes.html (+1 -0)
  2. doc/extensions.html (+7 -0)
  3. src/lj_lex.c (+27 -0)

+ 1 - 0
doc/changes.html

@@ -86,6 +86,7 @@ Please take a look at the commit history for more details.
 <li>Add <tt>LJ_GC64</tt> mode: 64 bit GC object references (really: 47 bit). Interpreter-only for now.</li>
 <li>Add <tt>LJ_GC64</tt> mode: 64 bit GC object references (really: 47 bit). Interpreter-only for now.</li>
 <li>Add <tt>LJ_FR2</tt> mode: Two-slot frame info. Required by <tt>LJ_GC64</tt> mode.</li>
 <li>Add <tt>LJ_FR2</tt> mode: Two-slot frame info. Required by <tt>LJ_GC64</tt> mode.</li>
 <li>Add <tt>table.new()</tt> and <tt>table.clear()</tt>.</li>
 <li>Add <tt>table.new()</tt> and <tt>table.clear()</tt>.</li>
+<li>Parse Unicode escape <tt>'\u{XX...}'</tt> in string literals.</li>
 <li>Parse binary number literals (<tt>0bxxx</tt>).</li>
 <li>Parse binary number literals (<tt>0bxxx</tt>).</li>
 </ul></li>
 </ul></li>
 <li>Improvements to the JIT compiler:
 <li>Improvements to the JIT compiler:

+ 7 - 0
doc/extensions.html

@@ -344,6 +344,13 @@ Lua&nbsp;5.1, which prevents implementing features that would otherwise
 break the Lua/C API and ABI (e.g. <tt>_ENV</tt>).
 break the Lua/C API and ABI (e.g. <tt>_ENV</tt>).
 </p>
 </p>
 
 
+<h2 id="lua53">Extensions from Lua 5.3</h2>
+<p>
+LuaJIT supports some extensions from Lua&nbsp;5.3:</p>
+<ul>
+<li>Unicode escape <tt>'\u{XX...}'</tt> embeds the UTF-8 encoding of the given code point in string literals.</li>
+</ul>
+
 <h2 id="exceptions">C++ Exception Interoperability</h2>
 <h2 id="exceptions">C++ Exception Interoperability</h2>
 <p>
 <p>
 LuaJIT has built-in support for interoperating with C++&nbsp;exceptions.
 LuaJIT has built-in support for interoperating with C++&nbsp;exceptions.

+ 27 - 0
src/lj_lex.c

@@ -214,6 +214,33 @@ static void lex_string(LexState *ls, TValue *tv)
 	  c += 9;
 	  c += 9;
 	}
 	}
 	break;
 	break;
+      case 'u':  /* Unicode escape '\u{XX...}'. */
+	if (lex_next(ls) != '{') goto err_xesc;
+	lex_next(ls);
+	c = 0;
+	do {
+	  c = (c << 4) | (ls->c & 15u);
+	  if (!lj_char_isdigit(ls->c)) {
+	    if (!lj_char_isxdigit(ls->c)) goto err_xesc;
+	    c += 9;
+	  }
+	  if (c >= 0x110000) goto err_xesc;  /* Out of Unicode range. */
+	} while (lex_next(ls) != '}');
+	if (c < 0x800) {
+	  if (c < 0x80) break;
+	  lex_save(ls, 0xc0 | (c >> 6));
+	} else {
+	  if (c >= 0x10000) {
+	    lex_save(ls, 0xf0 | (c >> 18));
+	    lex_save(ls, 0x80 | ((c >> 12) & 0x3f));
+	  } else {
+	    if (c >= 0xd800 && c < 0xe000) goto err_xesc;  /* No surrogates. */
+	    lex_save(ls, 0xe0 | (c >> 12));
+	  }
+	  lex_save(ls, 0x80 | ((c >> 6) & 0x3f));
+	}
+	c = 0x80 | (c & 0x3f);
+	break;
       case 'z':  /* Skip whitespace. */
       case 'z':  /* Skip whitespace. */
 	lex_next(ls);
 	lex_next(ls);
 	while (lj_char_isspace(ls->c))
 	while (lj_char_isspace(ls->c))