Przeglądaj źródła

Initial port of gumbo html parser library interface from the lua module.
Also added the minimal subset of the gumbo source code to this repository.

mingodad 12 lat temu
rodzic
commit
bbb026c22e

+ 213 - 0
SquiLu-ext/sq_gumbo.cpp

@@ -0,0 +1,213 @@
+/// @module gumbo
+// Lua bindings for the [Gumbo][] HTML5 parsing library.
+// [Gumbo]: https://github.com/google/gumbo-parser
+// @copyright 2013 Craig Barnes
+// @license ISC
+// Ported to Squilu by Domingo Alvarez Duarte
+
+#include "squirrel.h"
+#include <string.h>
+#include "sqstdblobimpl.h"
+SQ_OPT_STRING_STRLEN();
+#include "gumbo.h"
+#include <stdio.h>
+#include <errno.h>
+
+#define MYNAME		_SC("gumbo")
+
+#define assert(cond) if (!(cond)) goto error
+static SQRESULT build_node(HSQUIRRELVM v, GumboNode* node);
+
+// Raw-set the slot `name` on the object that is on top of the stack when
+// this function is called. The slot value is the string `val`, or null when
+// `val` is a NULL pointer.
+static void reg_string(HSQUIRRELVM v, const char *name, const char *val) {
+  sq_pushstring(v, name, -1);      // key
+  if (val == NULL) {
+    sq_pushnull(v);                // value: null for a missing string
+  } else {
+    sq_pushstring(v, val, -1);     // value
+  }
+  // With key and value pushed, the target object now sits at index -3.
+  sq_rawset(v, -3);
+}
+
+// Raw-set the slot `name` to the boolean `val` on the object that is on top
+// of the stack when this function is called.
+static void reg_boolean(HSQUIRRELVM v, const char *name, SQBool val)
+{
+  sq_pushstring(v, name, -1);   // key
+  sq_pushbool(v, val);          // value
+  // With key and value pushed, the target object now sits at index -3.
+  sq_rawset(v, -3);
+}
+
+// Convert every node in `children` with build_node() and store the results
+// in the array that is on top of the stack on entry.
+//
+// Nodes for which build_node() reports SQFalse (nothing pushed) are skipped,
+// so the array is shrunk at the end to the number of values actually stored.
+// Returns SQ_OK, or SQ_ERROR as soon as build_node() reports an error.
+static inline SQRESULT add_children(HSQUIRRELVM v, GumboVector *children) {
+    const unsigned int total = children->length;
+    unsigned int stored = 0;
+
+    for (unsigned int idx = 0; idx < total; ++idx) {
+        GumboNode *child = (GumboNode*)children->data[idx];
+        const SQRESULT rc = build_node(v, child);
+
+        if (rc == SQ_ERROR) {
+            return SQ_ERROR;
+        }
+        if (rc == SQTrue) {
+            // build_node pushed a value, so the target array is now at -2.
+            sq_arrayset(v, -2, stored++);
+        }
+    }
+
+    if (stored < total) {
+        sq_arrayresize(v, -1, stored);
+    }
+    return SQ_OK;
+}
+
+// Push a new table describing a Gumbo document node.
+//
+// The table gets the string slots "name", "public_identifier" and
+// "system_identifier", the boolean slot "has_doctype", and a "children"
+// array filled by add_children().
+//
+// Returns SQ_OK on success (matching build_element()), or SQ_ERROR if a
+// child node could not be converted.
+static SQRESULT build_document(HSQUIRRELVM v, GumboDocument *document) {
+    // NOTE(review): 6 looks like a capacity hint - the five slots set here
+    // plus the "root" slot that parse() adds afterwards; confirm.
+    sq_newtableex(v, 6);
+    reg_string(v, _SC("name"), document->name);
+    reg_string(v, _SC("public_identifier"), document->public_identifier);
+    reg_string(v, _SC("system_identifier"), document->system_identifier);
+    reg_boolean(v, _SC("has_doctype"), document->has_doctype ? SQTrue : SQFalse);
+    sq_pushliteral(v, _SC("children"));
+    sq_newarray(v, document->children.length);
+    if(add_children(v, &document->children) == SQ_ERROR) return SQ_ERROR;
+    sq_rawset(v, -3);
+    return SQ_OK;
+}
+
+// Push a new table describing a Gumbo element node.
+//
+// Slots written:
+//   "tag"      - the normalized tag name, or for GUMBO_TAG_UNKNOWN the tag
+//                text taken from the original source;
+//   "attr"     - table of attribute name -> value (only when the element
+//                has at least one attribute);
+//   "children" - array filled by add_children().
+//
+// Returns SQ_OK, or SQ_ERROR if a child node could not be converted.
+static SQRESULT build_element(HSQUIRRELVM v, GumboElement *element) {
+    const unsigned int attr_count = element->attributes.length;
+    sq_newtableex(v, attr_count ? 3 : 2);
+
+    // "tag"
+    sq_pushliteral(v, _SC("tag"));
+    if (element->tag != GUMBO_TAG_UNKNOWN) {
+        sq_pushstring(v, gumbo_normalized_tagname(element->tag), -1);
+    } else {
+        // Work on a copy so the element's own original_tag is left untouched.
+        GumboStringPiece piece = element->original_tag;
+        gumbo_tag_from_original_text(&piece);
+        sq_pushstring(v, piece.data, piece.length);
+    }
+    sq_rawset(v, -3);
+
+    // "attr" (omitted entirely for attribute-less elements)
+    if (attr_count) {
+        sq_pushliteral(v, _SC("attr"));
+        sq_newtableex(v, attr_count);
+        for (unsigned int idx = 0; idx < attr_count; ++idx) {
+            GumboAttribute *attr = (GumboAttribute *)element->attributes.data[idx];
+            reg_string(v, attr->name, attr->value);
+        }
+        sq_rawset(v, -3);
+    }
+
+    // "children"
+    sq_pushliteral(v, _SC("children"));
+    sq_newarray(v, element->children.length);
+    if (add_children(v, &element->children) == SQ_ERROR) {
+        return SQ_ERROR;
+    }
+    sq_rawset(v, -3);
+    return SQ_OK;
+}
+
+// Convert one Gumbo node into a Squirrel value.
+//
+// Returns:
+//   SQTrue   - a value was pushed on the stack (a table for document,
+//              element and comment nodes; a string for text and CDATA);
+//   SQFalse  - the node was skipped (whitespace), nothing was pushed;
+//   SQ_ERROR - the node type is not handled, or converting a document or
+//              element failed further down the tree.
+static SQRESULT build_node(HSQUIRRELVM v, GumboNode* node) {
+    switch (node->type) {
+    case GUMBO_NODE_DOCUMENT:
+        // Propagate failures instead of claiming a value was pushed.
+        if (build_document(v, &node->v.document) == SQ_ERROR) return SQ_ERROR;
+        return SQTrue;
+
+    case GUMBO_NODE_ELEMENT:
+        if (build_element(v, &node->v.element) == SQ_ERROR) return SQ_ERROR;
+        return SQTrue;
+
+    case GUMBO_NODE_COMMENT:
+        // Comments become a one-slot table: { comment = "<text>" }.
+        sq_newtableex(v, 1);
+        reg_string(v, _SC("comment"), node->v.text.text);
+        return SQTrue;
+
+    case GUMBO_NODE_TEXT:
+    case GUMBO_NODE_CDATA:
+        sq_pushstring(v, node->v.text.text, -1);
+        return SQTrue;
+
+    case GUMBO_NODE_WHITESPACE:
+        return SQFalse;
+
+    default:
+        return sq_throwerror(v, _SC("Invalid node type"));
+    }
+}
+
+// Parse `len` bytes of HTML from `input` with Gumbo's default options and
+// push the resulting document table.
+//
+// On success the document table additionally gets a "root" slot holding the
+// entry of its "children" array at the root node's index_within_parent.
+// The Gumbo output is destroyed before returning on every path.
+// Returns whatever build_node() returned for the document node.
+static inline SQRESULT parse(HSQUIRRELVM v, const SQChar *input, SQInteger len) {
+    GumboOutput *output = gumbo_parse_with_options(&kGumboDefaultOptions, input, len);
+    const SQRESULT result = build_node(v, output->document);
+
+    if (result != SQ_ERROR) {
+        sq_pushliteral(v, _SC("children"));
+        sq_rawget(v, -2);               // fetch document.children
+        sq_pushliteral(v, _SC("root"));
+        sq_arrayget(v, -2, output->root->index_within_parent);
+        sq_rawset(v, -4);               // set root on main table
+        sq_poptop(v);                   // remove children array from stack
+    }
+
+    gumbo_destroy_output(&kGumboDefaultOptions, output);
+    return result;
+}
+
+/// Parse a string of HTML
+// @function parse
+// @param document String containing HTML
+// @return Abstract syntax tree table
+// @see README.md
+//
+// Native entry point registered as "parse" in gumbo_methods. It reads the
+// string argument at stack index 2 and hands it to parse() together with
+// its length.
+static SQRESULT gumbo_parse(HSQUIRRELVM v)		/** parse(s) */
+{
+    SQ_FUNC_VARS_NO_TOP(v);
+    // SQ_GET_STRING evidently declares both `input` and `input_size`
+    // (neither is declared anywhere else in this function).
+    SQ_GET_STRING(v, 2, input);
+    return parse(v, input, input_size);
+}
+
+/// Read and parse a HTML file
+// @function parse_file
+// @param filename Path to HTML file
+// @return Abstract syntax tree table
+// @throw exception (if opening or reading file fails)
+//
+// Native entry point registered as "parse_file" in gumbo_methods. The whole
+// file is read into a temporary NUL-terminated buffer, passed to parse(),
+// and the buffer is released again before returning.
+static SQRESULT gumbo_parse_file(HSQUIRRELVM v)		/** parse_file(s) */
+{
+    SQRESULT result;
+    SQ_FUNC_VARS_NO_TOP(v);
+    SQ_GET_STRING(v, 2, filename);
+    FILE *file = NULL;
+    char *input = NULL;
+    long len = 0;
+    int saved_errno;
+
+    // Start from a clean errno so that a failure which does not set it
+    // (e.g. a short read) can be told apart from a stale earlier error.
+    errno = 0;
+
+    // NOTE: assert() here is the file-local macro that jumps to `error`.
+    assert(file = fopen(filename, "rb"));
+    assert(fseek(file, 0, SEEK_END) != -1);
+    assert((len = ftell(file)) != -1);
+    rewind(file);
+    assert(input = (char*)sq_malloc(len + 1));
+    assert(fread(input, 1, len, file) == (unsigned long)len);
+    fclose(file);
+    input[len] = '\0';
+    result = parse(v, input, len);
+    sq_free(input, len+1);
+    return result;
+
+  error: // Throw an error describing the failure once resources are released
+    // Capture errno first: fclose()/sq_free() below may overwrite it.
+    saved_errno = errno;
+    if (file) fclose(file);
+    if (input) sq_free(input, len+1);
+    if (saved_errno == 0) return sq_throwerror(v, _SC("could not read file"));
+    return sq_throwerror(v, strerror(saved_errno));
+}
+
+// Functions exported by this module. Each entry is
+// { script-visible name, native function, parameter count, type-check mask }.
+static SQRegFunction gumbo_methods[] =
+{
+    {_SC("parse"),      gumbo_parse,      2, _SC(".s")},
+    {_SC("parse_file"), gumbo_parse_file, 2, _SC(".s")},
+    {0,0}   // presumably the end-of-table sentinel for sq_insert_reg_funcs
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Register the module: build a table populated from gumbo_methods and add
+// it under the key "gumbo" as a new slot of the object that is on top of
+// the stack when this function is called.
+// NOTE(review): the caller in sq.c presumably has the root table there - confirm.
+SQRESULT sqext_register_gumbo(HSQUIRRELVM v)
+{
+    sq_pushstring(v, MYNAME, -1);             // slot key: "gumbo"
+    sq_newtable(v);                           // slot value: the module table
+    sq_insert_reg_funcs(v, gumbo_methods);
+    sq_newslot(v, -3, SQTrue);
+    return SQ_OK;
+}
+
+#ifdef __cplusplus
+}
+#endif

+ 2 - 0
SquiLu/sq/sq.c

@@ -533,6 +533,7 @@ SQRESULT sqext_register_ThreadObjects(HSQUIRRELVM v);
 SQRESULT sqext_register_csv_parser (HSQUIRRELVM v);
 SQRESULT sqext_register_fltklib(HSQUIRRELVM v);
 SQRESULT sqext_register_dad_utils(HSQUIRRELVM v);
+SQRESULT sqext_register_gumbo(HSQUIRRELVM v);
 
 int main(int argc, char* argv[])
 {
@@ -559,6 +560,7 @@ int main(int argc, char* argv[])
 
 #ifdef WITH_DAD_EXTRAS
 #ifndef SQUILU_ALONE
+	sqext_register_gumbo(v);
 	sqext_register_base64(v);
 	sqext_register_Sq_Fpdf(v);
 	sqext_register_SQLite3(v);

+ 65 - 1
SquiLu/squilu.cbp

@@ -76,7 +76,7 @@
 				<Option output="bin/squilu.exe" prefix_auto="1" extension_auto="0" />
 				<Option object_output="obj/Release-win32/" />
 				<Option type="1" />
-				<Option compiler="mingw32_compiler" />
+				<Option compiler="mingw_gnu_gcc_compiler" />
 				<Compiler>
 					<Add option="-O3" />
 					<Add option="-Wall" />
@@ -439,6 +439,34 @@
 					<Add directory="../libharu/src" />
 				</Linker>
 			</Target>
+			<Target title="Debug 64bits">
+				<Option output="bin/squilu-dbg" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Debug/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-O2" />
+					<Add option="-Wall" />
+					<Add option="-g" />
+					<Add option="-DNDEBUG=1" />
+					<Add option="-DWITH_FULL_DAD_EXTRAS=1" />
+					<Add option="-D_SQ64=1" />
+					<Add option="-DCONFIG_64=1" />
+					<Add directory="../../zeromq-3.2.2/include" />
+				</Compiler>
+				<Linker>
+					<Add library="../../zeromq-3.2.2/libzmq3.a" />
+					<Add library="pthread" />
+					<Add library="rt" />
+					<Add library="dl" />
+					<Add library="axtls" />
+					<Add library="mpdecimal" />
+					<Add library="discount" />
+					<Add library="fltk_z" />
+					<Add directory="../../zeromq-3.2.2" />
+					<Add directory="../fltk/lib" />
+				</Linker>
+			</Target>
 		</Build>
 		<Compiler>
 			<Add option="-Wall" />
@@ -493,6 +521,7 @@
 		</Compiler>
 		<Linker>
 			<Add library="m" />
+			<Add library="uuid" />
 			<Add directory="lib" />
 			<Add directory="../myaxtls" />
 			<Add directory="../mpdecimal" />
@@ -525,6 +554,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/dynamic_library.h">
 			<Option target="Debug" />
@@ -538,6 +568,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/fpdf.cpp">
 			<Option target="Debug" />
@@ -551,6 +582,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/fpdf.h">
 			<Option target="Debug" />
@@ -564,6 +596,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/lua_socket.cpp">
 			<Option target="Debug" />
@@ -577,6 +610,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/lua_socket.h">
 			<Option target="Debug" />
@@ -590,6 +624,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/mongoose.c">
 			<Option compilerVar="CC" />
@@ -604,6 +639,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/mongoose.h">
 			<Option target="Debug" />
@@ -617,6 +653,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/pdf-font.cpp">
 			<Option target="Debug" />
@@ -630,6 +667,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/pdf-font.h">
 			<Option target="Debug" />
@@ -643,6 +681,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_axtls.c">
 			<Option compilerVar="CC" />
@@ -657,6 +696,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_base64.cpp">
 			<Option target="Debug" />
@@ -670,6 +710,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_decimal.cpp">
 			<Option target="Debug" />
@@ -682,6 +723,7 @@
 			<Option target="Release FLTK win32 no console" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_fltk.cpp">
 			<Option target="Debug" />
@@ -695,6 +737,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_fpdf.cpp">
 			<Option target="Debug" />
@@ -708,6 +751,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_fs.c">
 			<Option compilerVar="CC" />
@@ -722,6 +766,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_markdown.cpp">
 			<Option target="Debug" />
@@ -734,6 +779,7 @@
 			<Option target="Release FLTK win32 no console" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_mix.cpp">
 			<Option target="Debug" />
@@ -747,6 +793,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_mongoose.cpp">
 			<Option target="Debug" />
@@ -760,6 +807,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_mysql.cpp" />
 		<Unit filename="../SquiLu-ext/sq_parsecsv.cpp">
@@ -773,6 +821,7 @@
 			<Option target="Release FLTK win32 no console" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_postgresql.cpp">
 			<Option target="Debug" />
@@ -785,6 +834,7 @@
 			<Option target="Release FLTK win32 no console" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_rs232.c">
 			<Option compilerVar="CC" />
@@ -799,6 +849,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_slave_vm.cpp">
 			<Option target="Debug" />
@@ -812,6 +863,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_socket.cpp">
 			<Option target="Debug" />
@@ -825,6 +877,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_sqlite3.cpp">
 			<Option target="Debug" />
@@ -838,6 +891,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_tinyxml2.cpp">
 			<Option target="Debug" />
@@ -851,6 +905,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_zlib.cpp">
 			<Option target="Debug" />
@@ -864,6 +919,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sq_zmq3.cpp">
 			<Option target="Debug" />
@@ -876,6 +932,7 @@
 			<Option target="Release FLTK win32 no console" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sqlite3.c">
 			<Option compilerVar="CC" />
@@ -890,6 +947,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sqlite3.h">
 			<Option target="Debug" />
@@ -903,6 +961,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sqmodule.h">
 			<Option target="Debug" />
@@ -916,6 +975,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sqratimport.cpp">
 			<Option target="Debug" />
@@ -929,6 +989,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/sqratimport.h">
 			<Option target="Debug" />
@@ -942,6 +1003,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/tinyxml2.cpp">
 			<Option target="Debug" />
@@ -955,6 +1017,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="../SquiLu-ext/tinyxml2.h">
 			<Option target="Debug" />
@@ -968,6 +1031,7 @@
 			<Option target="Release wince" />
 			<Option target="Release 64bits" />
 			<Option target="Release FLTK 64bits" />
+			<Option target="Debug 64bits" />
 		</Unit>
 		<Unit filename="dadbiz.rc">
 			<Option compilerVar="WINDRES" />

+ 12 - 0
gumbo/CONTRIBUTING.md

@@ -0,0 +1,12 @@
+Contributing
+===========
+Bug reports are very much welcome.  Please use GitHub's issue-tracking feature, as it makes it easier to keep track of bugs and makes it possible for other project watchers to view the existing issues.
+
+Patches and pull requests are also welcome, but before accepting patches, I need you to sign the Google Contributor License Agreement:
+
+https://developers.google.com/open-source/cla/individual
+https://developers.google.com/open-source/cla/corporate
+
+(Electronic signatures are fine for individual contributors.)
+
+If you're unwilling to do this, it would be most helpful if you could file bug reports that include detailed prose about where in the code the error is and how to fix it, but leave out exact source code.

+ 201 - 0
gumbo/COPYING

@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 188 - 0
gumbo/README.md

@@ -0,0 +1,188 @@
+Gumbo - A pure-C HTML5 parser.
+============
+
+Gumbo is an implementation of the [HTML5 parsing algorithm][], written
+as a pure C99 library with no outside dependencies.  It's designed to serve
+as a building block for other tools and libraries such as linters,
+validators, templating languages, and refactoring and analysis tools.
+
+Goals & features:
+
+* Fully conformant with the [HTML5 spec][].
+* Robust and resilient to bad input.
+* Simple API that can be easily wrapped by other languages.
+* Support for source locations and pointers back to the original text.
+* Relatively lightweight, with no outside dependencies.
+* Passes all [html5lib-0.95 tests][].
+* Tested on over 2.5 billion pages from Google's index.
+
+Non-goals:
+
+* Execution speed.  Gumbo gains some of this by virtue of being written in
+  C, but it is not an important consideration for the intended use-case, and
+  was not a major design factor.
+* Support for encodings other than UTF-8.  For the most part, client code
+  can convert the input stream to UTF-8 text using another library before
+  processing.
+* Security.  Gumbo was initially designed for a product that worked with
+  trusted input files only.  We're working to harden this and make sure that it
+  behaves as expected even on malicious input, but for now, Gumbo should only be
+  run on trusted input or within a sandbox.
+* C89 support.  Most major compilers support C99 by now; the major exception
+  (Microsoft Visual Studio) should be able to compile this in C++ mode with
+  relatively few changes.  (Bug reports welcome.)
+
+Wishlist (aka "We couldn't get these into the original release, but are
+hoping to add them soon"):
+
+* Support for recent HTML5 spec changes to support the template tag.
+* Support for fragment parsing.
+* Full-featured error reporting.
+* Bindings in other languages.
+
+Installation
+============
+
+To build and install the library, issue the standard UNIX incantation from
+the root of the distribution:
+
+    $ ./autogen.sh
+    $ ./configure
+    $ make
+    $ sudo make install
+
+Gumbo comes with full pkg-config support, so you can use pkg-config to
+print the flags needed to link your program against it:
+
+    $ pkg-config --cflags gumbo         # print compiler flags
+    $ pkg-config --libs gumbo           # print linker flags
+    $ pkg-config --cflags --libs gumbo  # print both
+
+For example:
+
+    $ gcc my_program.c `pkg-config --cflags --libs gumbo`
+
+See the pkg-config man page for more info.
+
+There are a number of sample programs in the examples/ directory.  They're
+built automatically by 'make', but can also be made individually with
+`make <programname>` (eg. `make clean_text`).
+
+To run the unit tests, you'll need to have [googletest][] downloaded and
+unzipped.  The googletest maintainers recommend against using
+`make install`; instead, symlink the root googletest directory to 'gtest'
+inside gumbo's root directory, and then `make check`:
+
+    $ unzip gtest-1.6.0.zip
+    $ cd gumbo-*
+    $ ln -s ../gtest-1.6.0 gtest
+    $ make check
+
+Gumbo's `make check` has code to automatically configure & build gtest and
+then link in the library.
+
+Debian and Fedora users can install libgtest with:
+
+    $ apt-get install libgtest-dev  # Debian/Ubuntu
+    $ yum install gtest-devel       # CentOS/Fedora
+
+Note for Ubuntu users: the libgtest-dev package only installs source files.
+You have to build the libraries yourself using cmake:
+
+    $ sudo apt-get install cmake
+    $ cd /usr/src/gtest
+    $ sudo cmake CMakeLists.txt
+    $ sudo make
+    $ sudo cp *.a /usr/lib
+
+The configure script will detect the presence of the library and use that
+instead.
+
+Note that you need to have super user privileges to execute these commands.
+On most distros, you can prefix the commands above with `sudo` to execute
+them as the super user.
+
+Debian installs usually don't have `sudo` installed (Ubuntu, however, does).
+Switch users first with `su -`, then run `apt-get`.
+
+Basic Usage
+===========
+
+Within your program, you need to include "gumbo.h" and then issue a call to
+`gumbo_parse`:
+
+```C
+#include "gumbo.h"
+
+int main() {
+  GumboOutput* output = gumbo_parse("<h1>Hello, World!</h1>");
+  // Do stuff with output->root
+  gumbo_destroy_output(&kGumboDefaultOptions, output);
+}
+```
+
+See the API documentation and sample programs for more details.
+
+A note on API/ABI compatibility
+===============================
+
+We'll make a best effort to preserve API compatibility between releases.
+The initial release is a 0.9 (beta) release to solicit comments from early
+adopters, but if no major problems are found with the API, a 1.0 release
+will follow shortly, and the API of that should be considered stable.  If
+changes are necessary, we follow [semantic versioning][].
+
+We make no such guarantees about the ABI, and it's very likely that
+subsequent versions may require a recompile of client code.  For this
+reason, we recommend NOT using Gumbo data structures throughout a program,
+and instead limiting them to a translation layer that picks out whatever
+data is needed from the parse tree and then converts that to persistent
+data structures more appropriate for the application.  The API is
+structured to encourage this use, with a single delete function for the
+whole parse tree, and is not designed with mutation in mind.
+
+Python usage
+============
+
+To install the python bindings, make sure that the
+C library is installed first, and then `sudo python setup.py install` from
+the root of the distro.  This installs a 'gumbo' module; `pydoc gumbo`
+should tell you about it.
+
+Recommended best-practice for Python usage is to use one of the adapters to
+an existing API (personally, I prefer BeautifulSoup) and write your program
+in terms of those.  The raw CTypes bindings should be considered building
+blocks for higher-level libraries and rarely referenced directly.
+
+External Bindings
+=================
+
+The following language bindings are maintained by various contributors in
+other repositories:
+
+* Ruby:
+  * [ruby-gumbo] by Nicolas Martyanoff
+  * [nokogumbo] by Sam Ruby
+* Node.js: [node-gumbo-parser] by Karl Westin
+* D: [gumbo-d] by Christopher Bertels
+* Lua: [lua-gumbo] by Craig Barnes
+* Objective-C:
+  * [ObjectiveGumbo] by Programming Thomas
+  * [OCGumbo] by TracyYih
+* C#: [GumboBindings] by Vladimir Zotov
+* PHP: [GumboPHP] by Paul Preece
+
+[ruby-gumbo]: https://github.com/galdor/ruby-gumbo
+[nokogumbo]: https://github.com/rubys/nokogumbo
+[node-gumbo-parser]: https://github.com/karlwestin/node-gumbo-parser
+[gumbo-d]: https://github.com/bakkdoor/gumbo-d
+[lua-gumbo]: https://github.com/craigbarnes/lua-gumbo
+[OCGumbo]: https://github.com/tracy-e/OCGumbo
+[ObjectiveGumbo]: https://github.com/programmingthomas/ObjectiveGumbo
+[GumboBindings]: https://github.com/rgripper/GumboBindings
+[GumboPHP]: https://github.com/BipSync/gumbo
+
+[HTML5 parsing algorithm]: http://www.whatwg.org/specs/web-apps/current-work/multipage/#auto-toc-12
+[HTML5 spec]: http://www.whatwg.org/specs/web-apps/current-work/multipage/
+[html5lib-0.95 tests]: https://github.com/html5lib/html5lib-tests
+[googletest]: https://code.google.com/p/googletest/
+[semantic versioning]: http://semver.org/

+ 44 - 0
gumbo/attribute.c

@@ -0,0 +1,44 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "attribute.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "util.h"
+
+struct GumboInternalParser;
+
+// Looks up an attribute by name in an attribute vector.
+//
+// attributes: vector whose data[] entries are GumboAttribute pointers.
+// name: NUL-terminated attribute name.  The comparison uses strcasecmp, so
+//     the match is case-insensitive.
+//
+// Returns the first matching attribute in vector order, or NULL when no entry
+// matches (which includes the empty-vector case).
+GumboAttribute* gumbo_get_attribute(
+    const GumboVector* attributes, const char* name) {
+  // The index is unsigned so that it has the same signedness as
+  // attributes->length; a signed counter would be implicitly converted on
+  // every comparison and triggers sign-compare warnings.
+  for (unsigned int i = 0; i < attributes->length; ++i) {
+    // Explicit cast from the vector's void* slot keeps this valid when the
+    // file is compiled in C++ mode.
+    GumboAttribute* attr = (GumboAttribute*) attributes->data[i];
+    if (strcasecmp(attr->name, name) == 0) {
+      return attr;
+    }
+  }
+  return NULL;
+}
+
+// Releases an attribute's name and value buffers, then the attribute struct
+// itself, handing each pointer to gumbo_parser_deallocate.
+void gumbo_destroy_attribute(
+    struct GumboInternalParser* parser, GumboAttribute* attribute) {
+  // Collect the pointers up front, in release order: the two strings first,
+  // the struct that refers to them last.
+  void* const owned[] = {
+    (void*) attribute->name, (void*) attribute->value, (void*) attribute
+  };
+  for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); ++i) {
+    gumbo_parser_deallocate(parser, owned[i]);
+  }
+}

+ 37 - 0
gumbo/attribute.h

@@ -0,0 +1,37 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#ifndef GUMBO_ATTRIBUTE_H_
+#define GUMBO_ATTRIBUTE_H_
+
+#include "gumbo.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParser;
+
+// Release the memory used for a GumboAttribute, including the attribute
+// itself.
+void gumbo_destroy_attribute(
+    struct GumboInternalParser* parser, GumboAttribute* attribute);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_ATTRIBUTE_H_

+ 2561 - 0
gumbo/char_ref.c

@@ -0,0 +1,2561 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "char_ref.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <stddef.h>
+#include <string.h>     // Only for debug assertions at present.
+
+#include "error.h"
+#include "string_piece.h"
+#include "utf8.h"
+#include "util.h"
+
+struct GumboInternalParser;
+
+const int kGumboNoChar = -1;
+
+// Table of named character entities, and functions for looking them up.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
+//
+// TODO(jdtang): I'd thought of using more efficient means of this, eg. binary
+// searching the table (which can only be done if we know for sure that there's
+// enough room in the buffer for our memcmps, otherwise we need to fall back on
+// linear search) or compiling the list of named entities to a Ragel state
+// machine.  But I'll start with the simple approach and optimize only if
+// profiling calls for it.  The one concession to efficiency is to store the
+// length of the entity with it, so that we don't need to run a strlen to detect
+// potential buffer overflows.
+typedef struct {  // One row of the named character reference table.
+  const char* name;  // Entity name literal; some entries end with ';'.
+  size_t length;  // Bytes in name before the NUL (macros store sizeof - 1).
+  OneOrTwoCodepoints codepoints;  // Replacement code point(s); -1 if no 2nd.
+} NamedCharRef;
+
+// Builds a NamedCharRef initializer for an entity that maps to a single code
+// point.  `name` must be a string literal so that sizeof(name) - 1 yields its
+// length without the terminating NUL.  The second code point slot is filled
+// with -1, the same value as kGumboNoChar.
+#define CHAR_REF(name, codepoint) { name, sizeof(name) - 1, { codepoint, -1 } }
+// Same as CHAR_REF, but for an entity that maps to two code points.
+#define MULTI_CHAR_REF(name, code_point, code_point2) \
+    { name, sizeof(name) - 1, { code_point, code_point2 } }
+
+// Versions with the semicolon must come before versions without the semicolon,
+// otherwise they'll match the invalid name first and record a parse error.
+// TODO(jdtang): Replace with a FSM that'll do longest-match-first and probably
+// give better performance besides.
+static const NamedCharRef kNamedEntities[] = {
+  CHAR_REF("AElig", 0xc6),
+  CHAR_REF("AMP;", 0x26),
+  CHAR_REF("AMP", 0x26),
+  CHAR_REF("Aacute;", 0xc1),
+  CHAR_REF("Aacute", 0xc1),
+  CHAR_REF("Abreve;", 0x0102),
+  CHAR_REF("Acirc;", 0xc2),
+  CHAR_REF("Acirc", 0xc2),
+  CHAR_REF("Acy;", 0x0410),
+  CHAR_REF("Afr;", 0x0001d504),
+  CHAR_REF("Agrave", 0xc0),
+  CHAR_REF("Agrave;", 0xc0),
+  CHAR_REF("Alpha;", 0x0391),
+  CHAR_REF("Amacr;", 0x0100),
+  CHAR_REF("And;", 0x2a53),
+  CHAR_REF("Aogon;", 0x0104),
+  CHAR_REF("Aopf;", 0x0001d538),
+  CHAR_REF("ApplyFunction;", 0x2061),
+  CHAR_REF("Aring;", 0xc5),
+  CHAR_REF("Aring", 0xc5),
+  CHAR_REF("Ascr;", 0x0001d49c),
+  CHAR_REF("Assign;", 0x2254),
+  CHAR_REF("Atilde;", 0xc3),
+  CHAR_REF("Atilde", 0xc3),
+  CHAR_REF("Auml;", 0xc4),
+  CHAR_REF("Auml", 0xc4),
+  CHAR_REF("Backslash;", 0x2216),
+  CHAR_REF("Barv;", 0x2ae7),
+  CHAR_REF("Barwed;", 0x2306),
+  CHAR_REF("Bcy;", 0x0411),
+  CHAR_REF("Because;", 0x2235),
+  CHAR_REF("Bernoullis;", 0x212c),
+  CHAR_REF("Beta;", 0x0392),
+  CHAR_REF("Bfr;", 0x0001d505),
+  CHAR_REF("Bopf;", 0x0001d539),
+  CHAR_REF("Breve;", 0x02d8),
+  CHAR_REF("Bscr;", 0x212c),
+  CHAR_REF("Bumpeq;", 0x224e),
+  CHAR_REF("CHcy;", 0x0427),
+  CHAR_REF("COPY;", 0xa9),
+  CHAR_REF("COPY", 0xa9),
+  CHAR_REF("Cacute;", 0x0106),
+  CHAR_REF("Cap;", 0x22d2),
+  CHAR_REF("CapitalDifferentialD;", 0x2145),
+  CHAR_REF("Cayleys;", 0x212d),
+  CHAR_REF("Ccaron;", 0x010c),
+  CHAR_REF("Ccedil;", 0xc7),
+  CHAR_REF("Ccedil", 0xc7),
+  CHAR_REF("Ccirc;", 0x0108),
+  CHAR_REF("Cconint;", 0x2230),
+  CHAR_REF("Cdot;", 0x010a),
+  CHAR_REF("Cedilla;", 0xb8),
+  CHAR_REF("CenterDot;", 0xb7),
+  CHAR_REF("Cfr;", 0x212d),
+  CHAR_REF("Chi;", 0x03a7),
+  CHAR_REF("CircleDot;", 0x2299),
+  CHAR_REF("CircleMinus;", 0x2296),
+  CHAR_REF("CirclePlus;", 0x2295),
+  CHAR_REF("CircleTimes;", 0x2297),
+  CHAR_REF("ClockwiseContourIntegral;", 0x2232),
+  CHAR_REF("CloseCurlyDoubleQuote;", 0x201d),
+  CHAR_REF("CloseCurlyQuote;", 0x2019),
+  CHAR_REF("Colon;", 0x2237),
+  CHAR_REF("Colone;", 0x2a74),
+  CHAR_REF("Congruent;", 0x2261),
+  CHAR_REF("Conint;", 0x222f),
+  CHAR_REF("ContourIntegral;", 0x222e),
+  CHAR_REF("Copf;", 0x2102),
+  CHAR_REF("Coproduct;", 0x2210),
+  CHAR_REF("CounterClockwiseContourIntegral;", 0x2233),
+  CHAR_REF("Cross;", 0x2a2f),
+  CHAR_REF("Cscr;", 0x0001d49e),
+  CHAR_REF("Cup;", 0x22d3),
+  CHAR_REF("CupCap;", 0x224d),
+  CHAR_REF("DD;", 0x2145),
+  CHAR_REF("DDotrahd;", 0x2911),
+  CHAR_REF("DJcy;", 0x0402),
+  CHAR_REF("DScy;", 0x0405),
+  CHAR_REF("DZcy;", 0x040f),
+  CHAR_REF("Dagger;", 0x2021),
+  CHAR_REF("Darr;", 0x21a1),
+  CHAR_REF("Dashv;", 0x2ae4),
+  CHAR_REF("Dcaron;", 0x010e),
+  CHAR_REF("Dcy;", 0x0414),
+  CHAR_REF("Del;", 0x2207),
+  CHAR_REF("Delta;", 0x0394),
+  CHAR_REF("Dfr;", 0x0001d507),
+  CHAR_REF("DiacriticalAcute;", 0xb4),
+  CHAR_REF("DiacriticalDot;", 0x02d9),
+  CHAR_REF("DiacriticalDoubleAcute;", 0x02dd),
+  CHAR_REF("DiacriticalGrave;", 0x60),
+  CHAR_REF("DiacriticalTilde;", 0x02dc),
+  CHAR_REF("Diamond;", 0x22c4),
+  CHAR_REF("DifferentialD;", 0x2146),
+  CHAR_REF("Dopf;", 0x0001d53b),
+  CHAR_REF("Dot;", 0xa8),
+  CHAR_REF("DotDot;", 0x20dc),
+  CHAR_REF("DotEqual;", 0x2250),
+  CHAR_REF("DoubleContourIntegral;", 0x222f),
+  CHAR_REF("DoubleDot;", 0xa8),
+  CHAR_REF("DoubleDownArrow;", 0x21d3),
+  CHAR_REF("DoubleLeftArrow;", 0x21d0),
+  CHAR_REF("DoubleLeftRightArrow;", 0x21d4),
+  CHAR_REF("DoubleLeftTee;", 0x2ae4),
+  CHAR_REF("DoubleLongLeftArrow;", 0x27f8),
+  CHAR_REF("DoubleLongLeftRightArrow;", 0x27fa),
+  CHAR_REF("DoubleLongRightArrow;", 0x27f9),
+  CHAR_REF("DoubleRightArrow;", 0x21d2),
+  CHAR_REF("DoubleRightTee;", 0x22a8),
+  CHAR_REF("DoubleUpArrow;", 0x21d1),
+  CHAR_REF("DoubleUpDownArrow;", 0x21d5),
+  CHAR_REF("DoubleVerticalBar;", 0x2225),
+  CHAR_REF("DownArrow;", 0x2193),
+  CHAR_REF("DownArrowBar;", 0x2913),
+  CHAR_REF("DownArrowUpArrow;", 0x21f5),
+  CHAR_REF("DownBreve;", 0x0311),
+  CHAR_REF("DownLeftRightVector;", 0x2950),
+  CHAR_REF("DownLeftTeeVector;", 0x295e),
+  CHAR_REF("DownLeftVector;", 0x21bd),
+  CHAR_REF("DownLeftVectorBar;", 0x2956),
+  CHAR_REF("DownRightTeeVector;", 0x295f),
+  CHAR_REF("DownRightVector;", 0x21c1),
+  CHAR_REF("DownRightVectorBar;", 0x2957),
+  CHAR_REF("DownTee;", 0x22a4),
+  CHAR_REF("DownTeeArrow;", 0x21a7),
+  CHAR_REF("Downarrow;", 0x21d3),
+  CHAR_REF("Dscr;", 0x0001d49f),
+  CHAR_REF("Dstrok;", 0x0110),
+  CHAR_REF("ENG;", 0x014a),
+  CHAR_REF("ETH;", 0xd0),
+  CHAR_REF("ETH", 0xd0),
+  CHAR_REF("Eacute;", 0xc9),
+  CHAR_REF("Eacute", 0xc9),
+  CHAR_REF("Ecaron;", 0x011a),
+  CHAR_REF("Ecirc;", 0xca),
+  CHAR_REF("Ecirc", 0xca),
+  CHAR_REF("Ecy;", 0x042d),
+  CHAR_REF("Edot;", 0x0116),
+  CHAR_REF("Efr;", 0x0001d508),
+  CHAR_REF("Egrave;", 0xc8),
+  CHAR_REF("Egrave", 0xc8),
+  CHAR_REF("Element;", 0x2208),
+  CHAR_REF("Emacr;", 0x0112),
+  CHAR_REF("EmptySmallSquare;", 0x25fb),
+  CHAR_REF("EmptyVerySmallSquare;", 0x25ab),
+  CHAR_REF("Eogon;", 0x0118),
+  CHAR_REF("Eopf;", 0x0001d53c),
+  CHAR_REF("Epsilon;", 0x0395),
+  CHAR_REF("Equal;", 0x2a75),
+  CHAR_REF("EqualTilde;", 0x2242),
+  CHAR_REF("Equilibrium;", 0x21cc),
+  CHAR_REF("Escr;", 0x2130),
+  CHAR_REF("Esim;", 0x2a73),
+  CHAR_REF("Eta;", 0x0397),
+  CHAR_REF("Euml;", 0xcb),
+  CHAR_REF("Euml", 0xcb),
+  CHAR_REF("Exists;", 0x2203),
+  CHAR_REF("ExponentialE;", 0x2147),
+  CHAR_REF("Fcy;", 0x0424),
+  CHAR_REF("Ffr;", 0x0001d509),
+  CHAR_REF("FilledSmallSquare;", 0x25fc),
+  CHAR_REF("FilledVerySmallSquare;", 0x25aa),
+  CHAR_REF("Fopf;", 0x0001d53d),
+  CHAR_REF("ForAll;", 0x2200),
+  CHAR_REF("Fouriertrf;", 0x2131),
+  CHAR_REF("Fscr;", 0x2131),
+  CHAR_REF("GJcy;", 0x0403),
+  CHAR_REF("GT;", 0x3e),
+  CHAR_REF("GT", 0x3e),
+  CHAR_REF("Gamma;", 0x0393),
+  CHAR_REF("Gammad;", 0x03dc),
+  CHAR_REF("Gbreve;", 0x011e),
+  CHAR_REF("Gcedil;", 0x0122),
+  CHAR_REF("Gcirc;", 0x011c),
+  CHAR_REF("Gcy;", 0x0413),
+  CHAR_REF("Gdot;", 0x0120),
+  CHAR_REF("Gfr;", 0x0001d50a),
+  CHAR_REF("Gg;", 0x22d9),
+  CHAR_REF("Gopf;", 0x0001d53e),
+  CHAR_REF("GreaterEqual;", 0x2265),
+  CHAR_REF("GreaterEqualLess;", 0x22db),
+  CHAR_REF("GreaterFullEqual;", 0x2267),
+  CHAR_REF("GreaterGreater;", 0x2aa2),
+  CHAR_REF("GreaterLess;", 0x2277),
+  CHAR_REF("GreaterSlantEqual;", 0x2a7e),
+  CHAR_REF("GreaterTilde;", 0x2273),
+  CHAR_REF("Gscr;", 0x0001d4a2),
+  CHAR_REF("Gt;", 0x226b),
+  CHAR_REF("HARDcy;", 0x042a),
+  CHAR_REF("Hacek;", 0x02c7),
+  CHAR_REF("Hat;", 0x5e),
+  CHAR_REF("Hcirc;", 0x0124),
+  CHAR_REF("Hfr;", 0x210c),
+  CHAR_REF("HilbertSpace;", 0x210b),
+  CHAR_REF("Hopf;", 0x210d),
+  CHAR_REF("HorizontalLine;", 0x2500),
+  CHAR_REF("Hscr;", 0x210b),
+  CHAR_REF("Hstrok;", 0x0126),
+  CHAR_REF("HumpDownHump;", 0x224e),
+  CHAR_REF("HumpEqual;", 0x224f),
+  CHAR_REF("IEcy;", 0x0415),
+  CHAR_REF("IJlig;", 0x0132),
+  CHAR_REF("IOcy;", 0x0401),
+  CHAR_REF("Iacute;", 0xcd),
+  CHAR_REF("Iacute", 0xcd),
+  CHAR_REF("Icirc;", 0xce),
+  CHAR_REF("Icirc", 0xce),
+  CHAR_REF("Icy;", 0x0418),
+  CHAR_REF("Idot;", 0x0130),
+  CHAR_REF("Ifr;", 0x2111),
+  CHAR_REF("Igrave;", 0xcc),
+  CHAR_REF("Igrave", 0xcc),
+  CHAR_REF("Im;", 0x2111),
+  CHAR_REF("Imacr;", 0x012a),
+  CHAR_REF("ImaginaryI;", 0x2148),
+  CHAR_REF("Implies;", 0x21d2),
+  CHAR_REF("Int;", 0x222c),
+  CHAR_REF("Integral;", 0x222b),
+  CHAR_REF("Intersection;", 0x22c2),
+  CHAR_REF("InvisibleComma;", 0x2063),
+  CHAR_REF("InvisibleTimes;", 0x2062),
+  CHAR_REF("Iogon;", 0x012e),
+  CHAR_REF("Iopf;", 0x0001d540),
+  CHAR_REF("Iota;", 0x0399),
+  CHAR_REF("Iscr;", 0x2110),
+  CHAR_REF("Itilde;", 0x0128),
+  CHAR_REF("Iukcy;", 0x0406),
+  CHAR_REF("Iuml;", 0xcf),
+  CHAR_REF("Iuml", 0xcf),
+  CHAR_REF("Jcirc;", 0x0134),
+  CHAR_REF("Jcy;", 0x0419),
+  CHAR_REF("Jfr;", 0x0001d50d),
+  CHAR_REF("Jopf;", 0x0001d541),
+  CHAR_REF("Jscr;", 0x0001d4a5),
+  CHAR_REF("Jsercy;", 0x0408),
+  CHAR_REF("Jukcy;", 0x0404),
+  CHAR_REF("KHcy;", 0x0425),
+  CHAR_REF("KJcy;", 0x040c),
+  CHAR_REF("Kappa;", 0x039a),
+  CHAR_REF("Kcedil;", 0x0136),
+  CHAR_REF("Kcy;", 0x041a),
+  CHAR_REF("Kfr;", 0x0001d50e),
+  CHAR_REF("Kopf;", 0x0001d542),
+  CHAR_REF("Kscr;", 0x0001d4a6),
+  CHAR_REF("LJcy;", 0x0409),
+  CHAR_REF("LT;", 0x3c),
+  CHAR_REF("LT", 0x3c),
+  CHAR_REF("Lacute;", 0x0139),
+  CHAR_REF("Lambda;", 0x039b),
+  CHAR_REF("Lang;", 0x27ea),
+  CHAR_REF("Laplacetrf;", 0x2112),
+  CHAR_REF("Larr;", 0x219e),
+  CHAR_REF("Lcaron;", 0x013d),
+  CHAR_REF("Lcedil;", 0x013b),
+  CHAR_REF("Lcy;", 0x041b),
+  CHAR_REF("LeftAngleBracket;", 0x27e8),
+  CHAR_REF("LeftArrow;", 0x2190),
+  CHAR_REF("LeftArrowBar;", 0x21e4),
+  CHAR_REF("LeftArrowRightArrow;", 0x21c6),
+  CHAR_REF("LeftCeiling;", 0x2308),
+  CHAR_REF("LeftDoubleBracket;", 0x27e6),
+  CHAR_REF("LeftDownTeeVector;", 0x2961),
+  CHAR_REF("LeftDownVector;", 0x21c3),
+  CHAR_REF("LeftDownVectorBar;", 0x2959),
+  CHAR_REF("LeftFloor;", 0x230a),
+  CHAR_REF("LeftRightArrow;", 0x2194),
+  CHAR_REF("LeftRightVector;", 0x294e),
+  CHAR_REF("LeftTee;", 0x22a3),
+  CHAR_REF("LeftTeeArrow;", 0x21a4),
+  CHAR_REF("LeftTeeVector;", 0x295a),
+  CHAR_REF("LeftTriangle;", 0x22b2),
+  CHAR_REF("LeftTriangleBar;", 0x29cf),
+  CHAR_REF("LeftTriangleEqual;", 0x22b4),
+  CHAR_REF("LeftUpDownVector;", 0x2951),
+  CHAR_REF("LeftUpTeeVector;", 0x2960),
+  CHAR_REF("LeftUpVector;", 0x21bf),
+  CHAR_REF("LeftUpVectorBar;", 0x2958),
+  CHAR_REF("LeftVector;", 0x21bc),
+  CHAR_REF("LeftVectorBar;", 0x2952),
+  CHAR_REF("Leftarrow;", 0x21d0),
+  CHAR_REF("Leftrightarrow;", 0x21d4),
+  CHAR_REF("LessEqualGreater;", 0x22da),
+  CHAR_REF("LessFullEqual;", 0x2266),
+  CHAR_REF("LessGreater;", 0x2276),
+  CHAR_REF("LessLess;", 0x2aa1),
+  CHAR_REF("LessSlantEqual;", 0x2a7d),
+  CHAR_REF("LessTilde;", 0x2272),
+  CHAR_REF("Lfr;", 0x0001d50f),
+  CHAR_REF("Ll;", 0x22d8),
+  CHAR_REF("Lleftarrow;", 0x21da),
+  CHAR_REF("Lmidot;", 0x013f),
+  CHAR_REF("LongLeftArrow;", 0x27f5),
+  CHAR_REF("LongLeftRightArrow;", 0x27f7),
+  CHAR_REF("LongRightArrow;", 0x27f6),
+  CHAR_REF("Longleftarrow;", 0x27f8),
+  CHAR_REF("Longleftrightarrow;", 0x27fa),
+  CHAR_REF("Longrightarrow;", 0x27f9),
+  CHAR_REF("Lopf;", 0x0001d543),
+  CHAR_REF("LowerLeftArrow;", 0x2199),
+  CHAR_REF("LowerRightArrow;", 0x2198),
+  CHAR_REF("Lscr;", 0x2112),
+  CHAR_REF("Lsh;", 0x21b0),
+  CHAR_REF("Lstrok;", 0x0141),
+  CHAR_REF("Lt;", 0x226a),
+  CHAR_REF("Map;", 0x2905),
+  CHAR_REF("Mcy;", 0x041c),
+  CHAR_REF("MediumSpace;", 0x205f),
+  CHAR_REF("Mellintrf;", 0x2133),
+  CHAR_REF("Mfr;", 0x0001d510),
+  CHAR_REF("MinusPlus;", 0x2213),
+  CHAR_REF("Mopf;", 0x0001d544),
+  CHAR_REF("Mscr;", 0x2133),
+  CHAR_REF("Mu;", 0x039c),
+  CHAR_REF("NJcy;", 0x040a),
+  CHAR_REF("Nacute;", 0x0143),
+  CHAR_REF("Ncaron;", 0x0147),
+  CHAR_REF("Ncedil;", 0x0145),
+  CHAR_REF("Ncy;", 0x041d),
+  CHAR_REF("NegativeMediumSpace;", 0x200b),
+  CHAR_REF("NegativeThickSpace;", 0x200b),
+  CHAR_REF("NegativeThinSpace;", 0x200b),
+  CHAR_REF("NegativeVeryThinSpace;", 0x200b),
+  CHAR_REF("NestedGreaterGreater;", 0x226b),
+  CHAR_REF("NestedLessLess;", 0x226a),
+  CHAR_REF("NewLine;", 0x0a),
+  CHAR_REF("Nfr;", 0x0001d511),
+  CHAR_REF("NoBreak;", 0x2060),
+  CHAR_REF("NonBreakingSpace;", 0xa0),
+  CHAR_REF("Nopf;", 0x2115),
+  CHAR_REF("Not;", 0x2aec),
+  CHAR_REF("NotCongruent;", 0x2262),
+  CHAR_REF("NotCupCap;", 0x226d),
+  CHAR_REF("NotDoubleVerticalBar;", 0x2226),
+  CHAR_REF("NotElement;", 0x2209),
+  CHAR_REF("NotEqual;", 0x2260),
+  MULTI_CHAR_REF("NotEqualTilde;", 0x2242, 0x0338),
+  CHAR_REF("NotExists;", 0x2204),
+  CHAR_REF("NotGreater;", 0x226f),
+  CHAR_REF("NotGreaterEqual;", 0x2271),
+  MULTI_CHAR_REF("NotGreaterFullEqual;", 0x2267, 0x0338),
+  MULTI_CHAR_REF("NotGreaterGreater;", 0x226b, 0x0338),
+  CHAR_REF("NotGreaterLess;", 0x2279),
+  MULTI_CHAR_REF("NotGreaterSlantEqual;", 0x2a7e, 0x0338),
+  CHAR_REF("NotGreaterTilde;", 0x2275),
+  MULTI_CHAR_REF("NotHumpDownHump;", 0x224e, 0x0338),
+  MULTI_CHAR_REF("NotHumpEqual;", 0x224f, 0x0338),
+  CHAR_REF("NotLeftTriangle;", 0x22ea),
+  MULTI_CHAR_REF("NotLeftTriangleBar;", 0x29cf, 0x0338),
+  CHAR_REF("NotLeftTriangleEqual;", 0x22ec),
+  CHAR_REF("NotLess;", 0x226e),
+  CHAR_REF("NotLessEqual;", 0x2270),
+  CHAR_REF("NotLessGreater;", 0x2278),
+  MULTI_CHAR_REF("NotLessLess;", 0x226a, 0x0338),
+  MULTI_CHAR_REF("NotLessSlantEqual;", 0x2a7d, 0x0338),
+  CHAR_REF("NotLessTilde;", 0x2274),
+  MULTI_CHAR_REF("NotNestedGreaterGreater;", 0x2aa2, 0x0338),
+  MULTI_CHAR_REF("NotNestedLessLess;", 0x2aa1, 0x0338),
+  CHAR_REF("NotPrecedes;", 0x2280),
+  MULTI_CHAR_REF("NotPrecedesEqual;", 0x2aaf, 0x0338),
+  CHAR_REF("NotPrecedesSlantEqual;", 0x22e0),
+  CHAR_REF("NotReverseElement;", 0x220c),
+  CHAR_REF("NotRightTriangle;", 0x22eb),
+  MULTI_CHAR_REF("NotRightTriangleBar;", 0x29d0, 0x0338),
+  CHAR_REF("NotRightTriangleEqual;", 0x22ed),
+  MULTI_CHAR_REF("NotSquareSubset;", 0x228f, 0x0338),
+  CHAR_REF("NotSquareSubsetEqual;", 0x22e2),
+  MULTI_CHAR_REF("NotSquareSuperset;", 0x2290, 0x0338),
+  CHAR_REF("NotSquareSupersetEqual;", 0x22e3),
+  MULTI_CHAR_REF("NotSubset;", 0x2282, 0x20d2),
+  CHAR_REF("NotSubsetEqual;", 0x2288),
+  CHAR_REF("NotSucceeds;", 0x2281),
+  MULTI_CHAR_REF("NotSucceedsEqual;", 0x2ab0, 0x0338),
+  CHAR_REF("NotSucceedsSlantEqual;", 0x22e1),
+  MULTI_CHAR_REF("NotSucceedsTilde;", 0x227f, 0x0338),
+  MULTI_CHAR_REF("NotSuperset;", 0x2283, 0x20d2),
+  CHAR_REF("NotSupersetEqual;", 0x2289),
+  CHAR_REF("NotTilde;", 0x2241),
+  CHAR_REF("NotTildeEqual;", 0x2244),
+  CHAR_REF("NotTildeFullEqual;", 0x2247),
+  CHAR_REF("NotTildeTilde;", 0x2249),
+  CHAR_REF("NotVerticalBar;", 0x2224),
+  CHAR_REF("Nscr;", 0x0001d4a9),
+  CHAR_REF("Ntilde;", 0xd1),
+  CHAR_REF("Ntilde", 0xd1),
+  CHAR_REF("Nu;", 0x039d),
+  CHAR_REF("OElig;", 0x0152),
+  CHAR_REF("Oacute;", 0xd3),
+  CHAR_REF("Oacute", 0xd3),
+  CHAR_REF("Ocirc;", 0xd4),
+  CHAR_REF("Ocirc", 0xd4),
+  CHAR_REF("Ocy;", 0x041e),
+  CHAR_REF("Odblac;", 0x0150),
+  CHAR_REF("Ofr;", 0x0001d512),
+  CHAR_REF("Ograve;", 0xd2),
+  CHAR_REF("Ograve", 0xd2),
+  CHAR_REF("Omacr;", 0x014c),
+  CHAR_REF("Omega;", 0x03a9),
+  CHAR_REF("Omicron;", 0x039f),
+  CHAR_REF("Oopf;", 0x0001d546),
+  CHAR_REF("OpenCurlyDoubleQuote;", 0x201c),
+  CHAR_REF("OpenCurlyQuote;", 0x2018),
+  CHAR_REF("Or;", 0x2a54),
+  CHAR_REF("Oscr;", 0x0001d4aa),
+  CHAR_REF("Oslash;", 0xd8),
+  CHAR_REF("Oslash", 0xd8),
+  CHAR_REF("Otilde;", 0xd5),
+  CHAR_REF("Otilde", 0xd5),
+  CHAR_REF("Otimes;", 0x2a37),
+  CHAR_REF("Ouml", 0xd6),
+  CHAR_REF("Ouml;", 0xd6),
+  CHAR_REF("OverBar;", 0x203e),
+  CHAR_REF("OverBrace;", 0x23de),
+  CHAR_REF("OverBracket;", 0x23b4),
+  CHAR_REF("OverParenthesis;", 0x23dc),
+  CHAR_REF("PartialD;", 0x2202),
+  CHAR_REF("Pcy;", 0x041f),
+  CHAR_REF("Pfr;", 0x0001d513),
+  CHAR_REF("Phi;", 0x03a6),
+  CHAR_REF("Pi;", 0x03a0),
+  CHAR_REF("PlusMinus;", 0xb1),
+  CHAR_REF("Poincareplane;", 0x210c),
+  CHAR_REF("Popf;", 0x2119),
+  CHAR_REF("Pr;", 0x2abb),
+  CHAR_REF("Precedes;", 0x227a),
+  CHAR_REF("PrecedesEqual;", 0x2aaf),
+  CHAR_REF("PrecedesSlantEqual;", 0x227c),
+  CHAR_REF("PrecedesTilde;", 0x227e),
+  CHAR_REF("Prime;", 0x2033),
+  CHAR_REF("Product;", 0x220f),
+  CHAR_REF("Proportion;", 0x2237),
+  CHAR_REF("Proportional;", 0x221d),
+  CHAR_REF("Pscr;", 0x0001d4ab),
+  CHAR_REF("Psi;", 0x03a8),
+  CHAR_REF("QUOT;", 0x22),
+  CHAR_REF("QUOT", 0x22),
+  CHAR_REF("Qfr;", 0x0001d514),
+  CHAR_REF("Qopf;", 0x211a),
+  CHAR_REF("Qscr;", 0x0001d4ac),
+  CHAR_REF("RBarr;", 0x2910),
+  CHAR_REF("REG;", 0xae),
+  CHAR_REF("REG", 0xae),
+  CHAR_REF("Racute;", 0x0154),
+  CHAR_REF("Rang;", 0x27eb),
+  CHAR_REF("Rarr;", 0x21a0),
+  CHAR_REF("Rarrtl;", 0x2916),
+  CHAR_REF("Rcaron;", 0x0158),
+  CHAR_REF("Rcedil;", 0x0156),
+  CHAR_REF("Rcy;", 0x0420),
+  CHAR_REF("Re;", 0x211c),
+  CHAR_REF("ReverseElement;", 0x220b),
+  CHAR_REF("ReverseEquilibrium;", 0x21cb),
+  CHAR_REF("ReverseUpEquilibrium;", 0x296f),
+  CHAR_REF("Rfr;", 0x211c),
+  CHAR_REF("Rho;", 0x03a1),
+  CHAR_REF("RightAngleBracket;", 0x27e9),
+  CHAR_REF("RightArrow;", 0x2192),
+  CHAR_REF("RightArrowBar;", 0x21e5),
+  CHAR_REF("RightArrowLeftArrow;", 0x21c4),
+  CHAR_REF("RightCeiling;", 0x2309),
+  CHAR_REF("RightDoubleBracket;", 0x27e7),
+  CHAR_REF("RightDownTeeVector;", 0x295d),
+  CHAR_REF("RightDownVector;", 0x21c2),
+  CHAR_REF("RightDownVectorBar;", 0x2955),
+  CHAR_REF("RightFloor;", 0x230b),
+  CHAR_REF("RightTee;", 0x22a2),
+  CHAR_REF("RightTeeArrow;", 0x21a6),
+  CHAR_REF("RightTeeVector;", 0x295b),
+  CHAR_REF("RightTriangle;", 0x22b3),
+  CHAR_REF("RightTriangleBar;", 0x29d0),
+  CHAR_REF("RightTriangleEqual;", 0x22b5),
+  CHAR_REF("RightUpDownVector;", 0x294f),
+  CHAR_REF("RightUpTeeVector;", 0x295c),
+  CHAR_REF("RightUpVector;", 0x21be),
+  CHAR_REF("RightUpVectorBar;", 0x2954),
+  CHAR_REF("RightVector;", 0x21c0),
+  CHAR_REF("RightVectorBar;", 0x2953),
+  CHAR_REF("Rightarrow;", 0x21d2),
+  CHAR_REF("Ropf;", 0x211d),
+  CHAR_REF("RoundImplies;", 0x2970),
+  CHAR_REF("Rrightarrow;", 0x21db),
+  CHAR_REF("Rscr;", 0x211b),
+  CHAR_REF("Rsh;", 0x21b1),
+  CHAR_REF("RuleDelayed;", 0x29f4),
+  CHAR_REF("SHCHcy;", 0x0429),
+  CHAR_REF("SHcy;", 0x0428),
+  CHAR_REF("SOFTcy;", 0x042c),
+  CHAR_REF("Sacute;", 0x015a),
+  CHAR_REF("Sc;", 0x2abc),
+  CHAR_REF("Scaron;", 0x0160),
+  CHAR_REF("Scedil;", 0x015e),
+  CHAR_REF("Scirc;", 0x015c),
+  CHAR_REF("Scy;", 0x0421),
+  CHAR_REF("Sfr;", 0x0001d516),
+  CHAR_REF("ShortDownArrow;", 0x2193),
+  CHAR_REF("ShortLeftArrow;", 0x2190),
+  CHAR_REF("ShortRightArrow;", 0x2192),
+  CHAR_REF("ShortUpArrow;", 0x2191),
+  CHAR_REF("Sigma;", 0x03a3),
+  CHAR_REF("SmallCircle;", 0x2218),
+  CHAR_REF("Sopf;", 0x0001d54a),
+  CHAR_REF("Sqrt;", 0x221a),
+  CHAR_REF("Square;", 0x25a1),
+  CHAR_REF("SquareIntersection;", 0x2293),
+  CHAR_REF("SquareSubset;", 0x228f),
+  CHAR_REF("SquareSubsetEqual;", 0x2291),
+  CHAR_REF("SquareSuperset;", 0x2290),
+  CHAR_REF("SquareSupersetEqual;", 0x2292),
+  CHAR_REF("SquareUnion;", 0x2294),
+  CHAR_REF("Sscr;", 0x0001d4ae),
+  CHAR_REF("Star;", 0x22c6),
+  CHAR_REF("Sub;", 0x22d0),
+  CHAR_REF("Subset;", 0x22d0),
+  CHAR_REF("SubsetEqual;", 0x2286),
+  CHAR_REF("Succeeds;", 0x227b),
+  CHAR_REF("SucceedsEqual;", 0x2ab0),
+  CHAR_REF("SucceedsSlantEqual;", 0x227d),
+  CHAR_REF("SucceedsTilde;", 0x227f),
+  CHAR_REF("SuchThat;", 0x220b),
+  CHAR_REF("Sum;", 0x2211),
+  CHAR_REF("Sup;", 0x22d1),
+  CHAR_REF("Superset;", 0x2283),
+  CHAR_REF("SupersetEqual;", 0x2287),
+  CHAR_REF("Supset;", 0x22d1),
+  CHAR_REF("THORN;", 0xde),
+  CHAR_REF("THORN", 0xde),
+  CHAR_REF("TRADE;", 0x2122),
+  CHAR_REF("TSHcy;", 0x040b),
+  CHAR_REF("TScy;", 0x0426),
+  CHAR_REF("Tab;", 0x09),
+  CHAR_REF("Tau;", 0x03a4),
+  CHAR_REF("Tcaron;", 0x0164),
+  CHAR_REF("Tcedil;", 0x0162),
+  CHAR_REF("Tcy;", 0x0422),
+  CHAR_REF("Tfr;", 0x0001d517),
+  CHAR_REF("Therefore;", 0x2234),
+  CHAR_REF("Theta;", 0x0398),
+  MULTI_CHAR_REF("ThickSpace;", 0x205f, 0x200a),
+  CHAR_REF("ThinSpace;", 0x2009),
+  CHAR_REF("Tilde;", 0x223c),
+  CHAR_REF("TildeEqual;", 0x2243),
+  CHAR_REF("TildeFullEqual;", 0x2245),
+  CHAR_REF("TildeTilde;", 0x2248),
+  CHAR_REF("Topf;", 0x0001d54b),
+  CHAR_REF("TripleDot;", 0x20db),
+  CHAR_REF("Tscr;", 0x0001d4af),
+  CHAR_REF("Tstrok;", 0x0166),
+  CHAR_REF("Uacute;", 0xda),
+  CHAR_REF("Uacute", 0xda),
+  CHAR_REF("Uarr;", 0x219f),
+  CHAR_REF("Uarrocir;", 0x2949),
+  CHAR_REF("Ubrcy;", 0x040e),
+  CHAR_REF("Ubreve;", 0x016c),
+  CHAR_REF("Ucirc;", 0xdb),
+  CHAR_REF("Ucirc", 0xdb),
+  CHAR_REF("Ucy;", 0x0423),
+  CHAR_REF("Udblac;", 0x0170),
+  CHAR_REF("Ufr;", 0x0001d518),
+  CHAR_REF("Ugrave;", 0xd9),
+  CHAR_REF("Ugrave", 0xd9),
+  CHAR_REF("Umacr;", 0x016a),
+  CHAR_REF("UnderBar;", 0x5f),
+  CHAR_REF("UnderBrace;", 0x23df),
+  CHAR_REF("UnderBracket;", 0x23b5),
+  CHAR_REF("UnderParenthesis;", 0x23dd),
+  CHAR_REF("Union;", 0x22c3),
+  CHAR_REF("UnionPlus;", 0x228e),
+  CHAR_REF("Uogon;", 0x0172),
+  CHAR_REF("Uopf;", 0x0001d54c),
+  CHAR_REF("UpArrow;", 0x2191),
+  CHAR_REF("UpArrowBar;", 0x2912),
+  CHAR_REF("UpArrowDownArrow;", 0x21c5),
+  CHAR_REF("UpDownArrow;", 0x2195),
+  CHAR_REF("UpEquilibrium;", 0x296e),
+  CHAR_REF("UpTee;", 0x22a5),
+  CHAR_REF("UpTeeArrow;", 0x21a5),
+  CHAR_REF("Uparrow;", 0x21d1),
+  CHAR_REF("Updownarrow;", 0x21d5),
+  CHAR_REF("UpperLeftArrow;", 0x2196),
+  CHAR_REF("UpperRightArrow;", 0x2197),
+  CHAR_REF("Upsi;", 0x03d2),
+  CHAR_REF("Upsilon;", 0x03a5),
+  CHAR_REF("Uring;", 0x016e),
+  CHAR_REF("Uscr;", 0x0001d4b0),
+  CHAR_REF("Utilde;", 0x0168),
+  CHAR_REF("Uuml;", 0xdc),
+  CHAR_REF("Uuml", 0xdc),
+  CHAR_REF("VDash;", 0x22ab),
+  CHAR_REF("Vbar;", 0x2aeb),
+  CHAR_REF("Vcy;", 0x0412),
+  CHAR_REF("Vdash;", 0x22a9),
+  CHAR_REF("Vdashl;", 0x2ae6),
+  CHAR_REF("Vee;", 0x22c1),
+  CHAR_REF("Verbar;", 0x2016),
+  CHAR_REF("Vert;", 0x2016),
+  CHAR_REF("VerticalBar;", 0x2223),
+  CHAR_REF("VerticalLine;", 0x7c),
+  CHAR_REF("VerticalSeparator;", 0x2758),
+  CHAR_REF("VerticalTilde;", 0x2240),
+  CHAR_REF("VeryThinSpace;", 0x200a),
+  CHAR_REF("Vfr;", 0x0001d519),
+  CHAR_REF("Vopf;", 0x0001d54d),
+  CHAR_REF("Vscr;", 0x0001d4b1),
+  CHAR_REF("Vvdash;", 0x22aa),
+  CHAR_REF("Wcirc;", 0x0174),
+  CHAR_REF("Wedge;", 0x22c0),
+  CHAR_REF("Wfr;", 0x0001d51a),
+  CHAR_REF("Wopf;", 0x0001d54e),
+  CHAR_REF("Wscr;", 0x0001d4b2),
+  CHAR_REF("Xfr;", 0x0001d51b),
+  CHAR_REF("Xi;", 0x039e),
+  CHAR_REF("Xopf;", 0x0001d54f),
+  CHAR_REF("Xscr;", 0x0001d4b3),
+  CHAR_REF("YAcy;", 0x042f),
+  CHAR_REF("YIcy;", 0x0407),
+  CHAR_REF("YUcy;", 0x042e),
+  CHAR_REF("Yacute", 0xdd),
+  CHAR_REF("Yacute;", 0xdd),
+  CHAR_REF("Ycirc;", 0x0176),
+  CHAR_REF("Ycy;", 0x042b),
+  CHAR_REF("Yfr;", 0x0001d51c),
+  CHAR_REF("Yopf;", 0x0001d550),
+  CHAR_REF("Yscr;", 0x0001d4b4),
+  CHAR_REF("Yuml;", 0x0178),
+  CHAR_REF("ZHcy;", 0x0416),
+  CHAR_REF("Zacute;", 0x0179),
+  CHAR_REF("Zcaron;", 0x017d),
+  CHAR_REF("Zcy;", 0x0417),
+  CHAR_REF("Zdot;", 0x017b),
+  CHAR_REF("ZeroWidthSpace;", 0x200b),
+  CHAR_REF("Zeta;", 0x0396),
+  CHAR_REF("Zfr;", 0x2128),
+  CHAR_REF("Zopf;", 0x2124),
+  CHAR_REF("Zscr;", 0x0001d4b5),
+  CHAR_REF("aacute;", 0xe1),
+  CHAR_REF("aacute", 0xe1),
+  CHAR_REF("abreve;", 0x0103),
+  CHAR_REF("ac;", 0x223e),
+  MULTI_CHAR_REF("acE;", 0x223e, 0x0333),
+  CHAR_REF("acd;", 0x223f),
+  CHAR_REF("acirc;", 0xe2),
+  CHAR_REF("acirc", 0xe2),
+  CHAR_REF("acute;", 0xb4),
+  CHAR_REF("acute", 0xb4),
+  CHAR_REF("acy;", 0x0430),
+  CHAR_REF("aelig;", 0xe6),
+  CHAR_REF("aelig", 0xe6),
+  CHAR_REF("af;", 0x2061),
+  CHAR_REF("afr;", 0x0001d51e),
+  CHAR_REF("agrave;", 0xe0),
+  CHAR_REF("agrave", 0xe0),
+  CHAR_REF("alefsym;", 0x2135),
+  CHAR_REF("aleph;", 0x2135),
+  CHAR_REF("alpha;", 0x03b1),
+  CHAR_REF("amacr;", 0x0101),
+  CHAR_REF("amalg;", 0x2a3f),
+  CHAR_REF("amp;", 0x26),
+  CHAR_REF("amp", 0x26),
+  CHAR_REF("and;", 0x2227),
+  CHAR_REF("andand;", 0x2a55),
+  CHAR_REF("andd;", 0x2a5c),
+  CHAR_REF("andslope;", 0x2a58),
+  CHAR_REF("andv;", 0x2a5a),
+  CHAR_REF("ang;", 0x2220),
+  CHAR_REF("ange;", 0x29a4),
+  CHAR_REF("angle;", 0x2220),
+  CHAR_REF("angmsd;", 0x2221),
+  CHAR_REF("angmsdaa;", 0x29a8),
+  CHAR_REF("angmsdab;", 0x29a9),
+  CHAR_REF("angmsdac;", 0x29aa),
+  CHAR_REF("angmsdad;", 0x29ab),
+  CHAR_REF("angmsdae;", 0x29ac),
+  CHAR_REF("angmsdaf;", 0x29ad),
+  CHAR_REF("angmsdag;", 0x29ae),
+  CHAR_REF("angmsdah;", 0x29af),
+  CHAR_REF("angrt;", 0x221f),
+  CHAR_REF("angrtvb;", 0x22be),
+  CHAR_REF("angrtvbd;", 0x299d),
+  CHAR_REF("angsph;", 0x2222),
+  CHAR_REF("angst;", 0xc5),
+  CHAR_REF("angzarr;", 0x237c),
+  CHAR_REF("aogon;", 0x0105),
+  CHAR_REF("aopf;", 0x0001d552),
+  CHAR_REF("ap;", 0x2248),
+  CHAR_REF("apE;", 0x2a70),
+  CHAR_REF("apacir;", 0x2a6f),
+  CHAR_REF("ape;", 0x224a),
+  CHAR_REF("apid;", 0x224b),
+  CHAR_REF("apos;", 0x27),
+  CHAR_REF("approx;", 0x2248),
+  CHAR_REF("approxeq;", 0x224a),
+  CHAR_REF("aring;", 0xe5),
+  CHAR_REF("aring", 0xe5),
+  CHAR_REF("ascr;", 0x0001d4b6),
+  CHAR_REF("ast;", 0x2a),
+  CHAR_REF("asymp;", 0x2248),
+  CHAR_REF("asympeq;", 0x224d),
+  CHAR_REF("atilde;", 0xe3),
+  CHAR_REF("atilde", 0xe3),
+  CHAR_REF("auml;", 0xe4),
+  CHAR_REF("auml", 0xe4),
+  CHAR_REF("awconint;", 0x2233),
+  CHAR_REF("awint;", 0x2a11),
+  CHAR_REF("bNot;", 0x2aed),
+  CHAR_REF("backcong;", 0x224c),
+  CHAR_REF("backepsilon;", 0x03f6),
+  CHAR_REF("backprime;", 0x2035),
+  CHAR_REF("backsim;", 0x223d),
+  CHAR_REF("backsimeq;", 0x22cd),
+  CHAR_REF("barvee;", 0x22bd),
+  CHAR_REF("barwed;", 0x2305),
+  CHAR_REF("barwedge;", 0x2305),
+  CHAR_REF("bbrk;", 0x23b5),
+  CHAR_REF("bbrktbrk;", 0x23b6),
+  CHAR_REF("bcong;", 0x224c),
+  CHAR_REF("bcy;", 0x0431),
+  CHAR_REF("bdquo;", 0x201e),
+  CHAR_REF("becaus;", 0x2235),
+  CHAR_REF("because;", 0x2235),
+  CHAR_REF("bemptyv;", 0x29b0),
+  CHAR_REF("bepsi;", 0x03f6),
+  CHAR_REF("bernou;", 0x212c),
+  CHAR_REF("beta;", 0x03b2),
+  CHAR_REF("beth;", 0x2136),
+  CHAR_REF("between;", 0x226c),
+  CHAR_REF("bfr;", 0x0001d51f),
+  CHAR_REF("bigcap;", 0x22c2),
+  CHAR_REF("bigcirc;", 0x25ef),
+  CHAR_REF("bigcup;", 0x22c3),
+  CHAR_REF("bigodot;", 0x2a00),
+  CHAR_REF("bigoplus;", 0x2a01),
+  CHAR_REF("bigotimes;", 0x2a02),
+  CHAR_REF("bigsqcup;", 0x2a06),
+  CHAR_REF("bigstar;", 0x2605),
+  CHAR_REF("bigtriangledown;", 0x25bd),
+  CHAR_REF("bigtriangleup;", 0x25b3),
+  CHAR_REF("biguplus;", 0x2a04),
+  CHAR_REF("bigvee;", 0x22c1),
+  CHAR_REF("bigwedge;", 0x22c0),
+  CHAR_REF("bkarow;", 0x290d),
+  CHAR_REF("blacklozenge;", 0x29eb),
+  CHAR_REF("blacksquare;", 0x25aa),
+  CHAR_REF("blacktriangle;", 0x25b4),
+  CHAR_REF("blacktriangledown;", 0x25be),
+  CHAR_REF("blacktriangleleft;", 0x25c2),
+  CHAR_REF("blacktriangleright;", 0x25b8),
+  CHAR_REF("blank;", 0x2423),
+  CHAR_REF("blk12;", 0x2592),
+  CHAR_REF("blk14;", 0x2591),
+  CHAR_REF("blk34;", 0x2593),
+  CHAR_REF("block;", 0x2588),
+  MULTI_CHAR_REF("bne;", 0x3d, 0x20e5),
+  MULTI_CHAR_REF("bnequiv;", 0x2261, 0x20e5),
+  CHAR_REF("bnot;", 0x2310),
+  CHAR_REF("bopf;", 0x0001d553),
+  CHAR_REF("bot;", 0x22a5),
+  CHAR_REF("bottom;", 0x22a5),
+  CHAR_REF("bowtie;", 0x22c8),
+  CHAR_REF("boxDL;", 0x2557),
+  CHAR_REF("boxDR;", 0x2554),
+  CHAR_REF("boxDl;", 0x2556),
+  CHAR_REF("boxDr;", 0x2553),
+  CHAR_REF("boxH;", 0x2550),
+  CHAR_REF("boxHD;", 0x2566),
+  CHAR_REF("boxHU;", 0x2569),
+  CHAR_REF("boxHd;", 0x2564),
+  CHAR_REF("boxHu;", 0x2567),
+  CHAR_REF("boxUL;", 0x255d),
+  CHAR_REF("boxUR;", 0x255a),
+  CHAR_REF("boxUl;", 0x255c),
+  CHAR_REF("boxUr;", 0x2559),
+  CHAR_REF("boxV;", 0x2551),
+  CHAR_REF("boxVH;", 0x256c),
+  CHAR_REF("boxVL;", 0x2563),
+  CHAR_REF("boxVR;", 0x2560),
+  CHAR_REF("boxVh;", 0x256b),
+  CHAR_REF("boxVl;", 0x2562),
+  CHAR_REF("boxVr;", 0x255f),
+  CHAR_REF("boxbox;", 0x29c9),
+  CHAR_REF("boxdL;", 0x2555),
+  CHAR_REF("boxdR;", 0x2552),
+  CHAR_REF("boxdl;", 0x2510),
+  CHAR_REF("boxdr;", 0x250c),
+  CHAR_REF("boxh;", 0x2500),
+  CHAR_REF("boxhD;", 0x2565),
+  CHAR_REF("boxhU;", 0x2568),
+  CHAR_REF("boxhd;", 0x252c),
+  CHAR_REF("boxhu;", 0x2534),
+  CHAR_REF("boxminus;", 0x229f),
+  CHAR_REF("boxplus;", 0x229e),
+  CHAR_REF("boxtimes;", 0x22a0),
+  CHAR_REF("boxuL;", 0x255b),
+  CHAR_REF("boxuR;", 0x2558),
+  CHAR_REF("boxul;", 0x2518),
+  CHAR_REF("boxur;", 0x2514),
+  CHAR_REF("boxv;", 0x2502),
+  CHAR_REF("boxvH;", 0x256a),
+  CHAR_REF("boxvL;", 0x2561),
+  CHAR_REF("boxvR;", 0x255e),
+  CHAR_REF("boxvh;", 0x253c),
+  CHAR_REF("boxvl;", 0x2524),
+  CHAR_REF("boxvr;", 0x251c),
+  CHAR_REF("bprime;", 0x2035),
+  CHAR_REF("breve;", 0x02d8),
+  CHAR_REF("brvbar;", 0xa6),
+  CHAR_REF("brvbar", 0xa6),
+  CHAR_REF("bscr;", 0x0001d4b7),
+  CHAR_REF("bsemi;", 0x204f),
+  CHAR_REF("bsim;", 0x223d),
+  CHAR_REF("bsime;", 0x22cd),
+  CHAR_REF("bsol;", 0x5c),
+  CHAR_REF("bsolb;", 0x29c5),
+  CHAR_REF("bsolhsub;", 0x27c8),
+  CHAR_REF("bull;", 0x2022),
+  CHAR_REF("bullet;", 0x2022),
+  CHAR_REF("bump;", 0x224e),
+  CHAR_REF("bumpE;", 0x2aae),
+  CHAR_REF("bumpe;", 0x224f),
+  CHAR_REF("bumpeq;", 0x224f),
+  CHAR_REF("cacute;", 0x0107),
+  CHAR_REF("cap;", 0x2229),
+  CHAR_REF("capand;", 0x2a44),
+  CHAR_REF("capbrcup;", 0x2a49),
+  CHAR_REF("capcap;", 0x2a4b),
+  CHAR_REF("capcup;", 0x2a47),
+  CHAR_REF("capdot;", 0x2a40),
+  MULTI_CHAR_REF("caps;", 0x2229, 0xfe00),
+  CHAR_REF("caret;", 0x2041),
+  CHAR_REF("caron;", 0x02c7),
+  CHAR_REF("ccaps;", 0x2a4d),
+  CHAR_REF("ccaron;", 0x010d),
+  CHAR_REF("ccedil;", 0xe7),
+  CHAR_REF("ccedil", 0xe7),
+  CHAR_REF("ccirc;", 0x0109),
+  CHAR_REF("ccups;", 0x2a4c),
+  CHAR_REF("ccupssm;", 0x2a50),
+  CHAR_REF("cdot;", 0x010b),
+  CHAR_REF("cedil;", 0xb8),
+  CHAR_REF("cedil", 0xb8),
+  CHAR_REF("cemptyv;", 0x29b2),
+  CHAR_REF("cent;", 0xa2),
+  CHAR_REF("cent", 0xa2),
+  CHAR_REF("centerdot;", 0xb7),
+  CHAR_REF("cfr;", 0x0001d520),
+  CHAR_REF("chcy;", 0x0447),
+  CHAR_REF("check;", 0x2713),
+  CHAR_REF("checkmark;", 0x2713),
+  CHAR_REF("chi;", 0x03c7),
+  CHAR_REF("cir;", 0x25cb),
+  CHAR_REF("cirE;", 0x29c3),
+  CHAR_REF("circ;", 0x02c6),
+  CHAR_REF("circeq;", 0x2257),
+  CHAR_REF("circlearrowleft;", 0x21ba),
+  CHAR_REF("circlearrowright;", 0x21bb),
+  CHAR_REF("circledR;", 0xae),
+  CHAR_REF("circledS;", 0x24c8),
+  CHAR_REF("circledast;", 0x229b),
+  CHAR_REF("circledcirc;", 0x229a),
+  CHAR_REF("circleddash;", 0x229d),
+  CHAR_REF("cire;", 0x2257),
+  CHAR_REF("cirfnint;", 0x2a10),
+  CHAR_REF("cirmid;", 0x2aef),
+  CHAR_REF("cirscir;", 0x29c2),
+  CHAR_REF("clubs;", 0x2663),
+  CHAR_REF("clubsuit;", 0x2663),
+  CHAR_REF("colon;", 0x3a),
+  CHAR_REF("colone;", 0x2254),
+  CHAR_REF("coloneq;", 0x2254),
+  CHAR_REF("comma;", 0x2c),
+  CHAR_REF("commat;", 0x40),
+  CHAR_REF("comp;", 0x2201),
+  CHAR_REF("compfn;", 0x2218),
+  CHAR_REF("complement;", 0x2201),
+  CHAR_REF("complexes;", 0x2102),
+  CHAR_REF("cong;", 0x2245),
+  CHAR_REF("congdot;", 0x2a6d),
+  CHAR_REF("conint;", 0x222e),
+  CHAR_REF("copf;", 0x0001d554),
+  CHAR_REF("coprod;", 0x2210),
+  CHAR_REF("copy;", 0xa9),
+  CHAR_REF("copy", 0xa9),
+  CHAR_REF("copysr;", 0x2117),
+  CHAR_REF("crarr;", 0x21b5),
+  CHAR_REF("cross;", 0x2717),
+  CHAR_REF("cscr;", 0x0001d4b8),
+  CHAR_REF("csub;", 0x2acf),
+  CHAR_REF("csube;", 0x2ad1),
+  CHAR_REF("csup;", 0x2ad0),
+  CHAR_REF("csupe;", 0x2ad2),
+  CHAR_REF("ctdot;", 0x22ef),
+  CHAR_REF("cudarrl;", 0x2938),
+  CHAR_REF("cudarrr;", 0x2935),
+  CHAR_REF("cuepr;", 0x22de),
+  CHAR_REF("cuesc;", 0x22df),
+  CHAR_REF("cularr;", 0x21b6),
+  CHAR_REF("cularrp;", 0x293d),
+  CHAR_REF("cup;", 0x222a),
+  CHAR_REF("cupbrcap;", 0x2a48),
+  CHAR_REF("cupcap;", 0x2a46),
+  CHAR_REF("cupcup;", 0x2a4a),
+  CHAR_REF("cupdot;", 0x228d),
+  CHAR_REF("cupor;", 0x2a45),
+  MULTI_CHAR_REF("cups;", 0x222a, 0xfe00),
+  CHAR_REF("curarr;", 0x21b7),
+  CHAR_REF("curarrm;", 0x293c),
+  CHAR_REF("curlyeqprec;", 0x22de),
+  CHAR_REF("curlyeqsucc;", 0x22df),
+  CHAR_REF("curlyvee;", 0x22ce),
+  CHAR_REF("curlywedge;", 0x22cf),
+  CHAR_REF("curren;", 0xa4),
+  CHAR_REF("curren", 0xa4),
+  CHAR_REF("curvearrowleft;", 0x21b6),
+  CHAR_REF("curvearrowright;", 0x21b7),
+  CHAR_REF("cuvee;", 0x22ce),
+  CHAR_REF("cuwed;", 0x22cf),
+  CHAR_REF("cwconint;", 0x2232),
+  CHAR_REF("cwint;", 0x2231),
+  CHAR_REF("cylcty;", 0x232d),
+  CHAR_REF("dArr;", 0x21d3),
+  CHAR_REF("dHar;", 0x2965),
+  CHAR_REF("dagger;", 0x2020),
+  CHAR_REF("daleth;", 0x2138),
+  CHAR_REF("darr;", 0x2193),
+  CHAR_REF("dash;", 0x2010),
+  CHAR_REF("dashv;", 0x22a3),
+  CHAR_REF("dbkarow;", 0x290f),
+  CHAR_REF("dblac;", 0x02dd),
+  CHAR_REF("dcaron;", 0x010f),
+  CHAR_REF("dcy;", 0x0434),
+  CHAR_REF("dd;", 0x2146),
+  CHAR_REF("ddagger;", 0x2021),
+  CHAR_REF("ddarr;", 0x21ca),
+  CHAR_REF("ddotseq;", 0x2a77),
+  CHAR_REF("deg;", 0xb0),
+  CHAR_REF("deg", 0xb0),
+  CHAR_REF("delta;", 0x03b4),
+  CHAR_REF("demptyv;", 0x29b1),
+  CHAR_REF("dfisht;", 0x297f),
+  CHAR_REF("dfr;", 0x0001d521),
+  CHAR_REF("dharl;", 0x21c3),
+  CHAR_REF("dharr;", 0x21c2),
+  CHAR_REF("diam;", 0x22c4),
+  CHAR_REF("diamond;", 0x22c4),
+  CHAR_REF("diamondsuit;", 0x2666),
+  CHAR_REF("diams;", 0x2666),
+  CHAR_REF("die;", 0xa8),
+  CHAR_REF("digamma;", 0x03dd),
+  CHAR_REF("disin;", 0x22f2),
+  CHAR_REF("div;", 0xf7),
+  CHAR_REF("divide;", 0xf7),
+  CHAR_REF("divide", 0xf7),
+  CHAR_REF("divideontimes;", 0x22c7),
+  CHAR_REF("divonx;", 0x22c7),
+  CHAR_REF("djcy;", 0x0452),
+  CHAR_REF("dlcorn;", 0x231e),
+  CHAR_REF("dlcrop;", 0x230d),
+  CHAR_REF("dollar;", 0x24),
+  CHAR_REF("dopf;", 0x0001d555),
+  CHAR_REF("dot;", 0x02d9),
+  CHAR_REF("doteq;", 0x2250),
+  CHAR_REF("doteqdot;", 0x2251),
+  CHAR_REF("dotminus;", 0x2238),
+  CHAR_REF("dotplus;", 0x2214),
+  CHAR_REF("dotsquare;", 0x22a1),
+  CHAR_REF("doublebarwedge;", 0x2306),
+  CHAR_REF("downarrow;", 0x2193),
+  CHAR_REF("downdownarrows;", 0x21ca),
+  CHAR_REF("downharpoonleft;", 0x21c3),
+  CHAR_REF("downharpoonright;", 0x21c2),
+  CHAR_REF("drbkarow;", 0x2910),
+  CHAR_REF("drcorn;", 0x231f),
+  CHAR_REF("drcrop;", 0x230c),
+  CHAR_REF("dscr;", 0x0001d4b9),
+  CHAR_REF("dscy;", 0x0455),
+  CHAR_REF("dsol;", 0x29f6),
+  CHAR_REF("dstrok;", 0x0111),
+  CHAR_REF("dtdot;", 0x22f1),
+  CHAR_REF("dtri;", 0x25bf),
+  CHAR_REF("dtrif;", 0x25be),
+  CHAR_REF("duarr;", 0x21f5),
+  CHAR_REF("duhar;", 0x296f),
+  CHAR_REF("dwangle;", 0x29a6),
+  CHAR_REF("dzcy;", 0x045f),
+  CHAR_REF("dzigrarr;", 0x27ff),
+  CHAR_REF("eDDot;", 0x2a77),
+  CHAR_REF("eDot;", 0x2251),
+  CHAR_REF("eacute;", 0xe9),
+  CHAR_REF("eacute", 0xe9),
+  CHAR_REF("easter;", 0x2a6e),
+  CHAR_REF("ecaron;", 0x011b),
+  CHAR_REF("ecir;", 0x2256),
+  CHAR_REF("ecirc;", 0xea),
+  CHAR_REF("ecirc", 0xea),
+  CHAR_REF("ecolon;", 0x2255),
+  CHAR_REF("ecy;", 0x044d),
+  CHAR_REF("edot;", 0x0117),
+  CHAR_REF("ee;", 0x2147),
+  CHAR_REF("efDot;", 0x2252),
+  CHAR_REF("efr;", 0x0001d522),
+  CHAR_REF("eg;", 0x2a9a),
+  CHAR_REF("egrave;", 0xe8),
+  CHAR_REF("egrave", 0xe8),
+  CHAR_REF("egs;", 0x2a96),
+  CHAR_REF("egsdot;", 0x2a98),
+  CHAR_REF("el;", 0x2a99),
+  CHAR_REF("elinters;", 0x23e7),
+  CHAR_REF("ell;", 0x2113),
+  CHAR_REF("els;", 0x2a95),
+  CHAR_REF("elsdot;", 0x2a97),
+  CHAR_REF("emacr;", 0x0113),
+  CHAR_REF("empty;", 0x2205),
+  CHAR_REF("emptyset;", 0x2205),
+  CHAR_REF("emptyv;", 0x2205),
+  CHAR_REF("emsp13;", 0x2004),
+  CHAR_REF("emsp14;", 0x2005),
+  CHAR_REF("emsp;", 0x2003),
+  CHAR_REF("eng;", 0x014b),
+  CHAR_REF("ensp;", 0x2002),
+  CHAR_REF("eogon;", 0x0119),
+  CHAR_REF("eopf;", 0x0001d556),
+  CHAR_REF("epar;", 0x22d5),
+  CHAR_REF("eparsl;", 0x29e3),
+  CHAR_REF("eplus;", 0x2a71),
+  CHAR_REF("epsi;", 0x03b5),
+  CHAR_REF("epsilon;", 0x03b5),
+  CHAR_REF("epsiv;", 0x03f5),
+  CHAR_REF("eqcirc;", 0x2256),
+  CHAR_REF("eqcolon;", 0x2255),
+  CHAR_REF("eqsim;", 0x2242),
+  CHAR_REF("eqslantgtr;", 0x2a96),
+  CHAR_REF("eqslantless;", 0x2a95),
+  CHAR_REF("equals;", 0x3d),
+  CHAR_REF("equest;", 0x225f),
+  CHAR_REF("equiv;", 0x2261),
+  CHAR_REF("equivDD;", 0x2a78),
+  CHAR_REF("eqvparsl;", 0x29e5),
+  CHAR_REF("erDot;", 0x2253),
+  CHAR_REF("erarr;", 0x2971),
+  CHAR_REF("escr;", 0x212f),
+  CHAR_REF("esdot;", 0x2250),
+  CHAR_REF("esim;", 0x2242),
+  CHAR_REF("eta;", 0x03b7),
+  CHAR_REF("eth;", 0xf0),
+  CHAR_REF("eth", 0xf0),
+  CHAR_REF("euml;", 0xeb),
+  CHAR_REF("euml", 0xeb),
+  CHAR_REF("euro;", 0x20ac),
+  CHAR_REF("excl;", 0x21),
+  CHAR_REF("exist;", 0x2203),
+  CHAR_REF("expectation;", 0x2130),
+  CHAR_REF("exponentiale;", 0x2147),
+  CHAR_REF("fallingdotseq;", 0x2252),
+  CHAR_REF("fcy;", 0x0444),
+  CHAR_REF("female;", 0x2640),
+  CHAR_REF("ffilig;", 0xfb03),
+  CHAR_REF("fflig;", 0xfb00),
+  CHAR_REF("ffllig;", 0xfb04),
+  CHAR_REF("ffr;", 0x0001d523),
+  CHAR_REF("filig;", 0xfb01),
+  MULTI_CHAR_REF("fjlig;", 0x66, 0x6a),
+  CHAR_REF("flat;", 0x266d),
+  CHAR_REF("fllig;", 0xfb02),
+  CHAR_REF("fltns;", 0x25b1),
+  CHAR_REF("fnof;", 0x0192),
+  CHAR_REF("fopf;", 0x0001d557),
+  CHAR_REF("forall;", 0x2200),
+  CHAR_REF("fork;", 0x22d4),
+  CHAR_REF("forkv;", 0x2ad9),
+  CHAR_REF("fpartint;", 0x2a0d),
+  CHAR_REF("frac12", 0xbd),
+  CHAR_REF("frac12;", 0xbd),
+  CHAR_REF("frac13;", 0x2153),
+  CHAR_REF("frac14", 0xbc),
+  CHAR_REF("frac14;", 0xbc),
+  CHAR_REF("frac15;", 0x2155),
+  CHAR_REF("frac16;", 0x2159),
+  CHAR_REF("frac18;", 0x215b),
+  CHAR_REF("frac23;", 0x2154),
+  CHAR_REF("frac25;", 0x2156),
+  CHAR_REF("frac34", 0xbe),
+  CHAR_REF("frac34;", 0xbe),
+  CHAR_REF("frac35;", 0x2157),
+  CHAR_REF("frac38;", 0x215c),
+  CHAR_REF("frac45;", 0x2158),
+  CHAR_REF("frac56;", 0x215a),
+  CHAR_REF("frac58;", 0x215d),
+  CHAR_REF("frac78;", 0x215e),
+  CHAR_REF("frasl;", 0x2044),
+  CHAR_REF("frown;", 0x2322),
+  CHAR_REF("fscr;", 0x0001d4bb),
+  CHAR_REF("gE;", 0x2267),
+  CHAR_REF("gEl;", 0x2a8c),
+  CHAR_REF("gacute;", 0x01f5),
+  CHAR_REF("gamma;", 0x03b3),
+  CHAR_REF("gammad;", 0x03dd),
+  CHAR_REF("gap;", 0x2a86),
+  CHAR_REF("gbreve;", 0x011f),
+  CHAR_REF("gcirc;", 0x011d),
+  CHAR_REF("gcy;", 0x0433),
+  CHAR_REF("gdot;", 0x0121),
+  CHAR_REF("ge;", 0x2265),
+  CHAR_REF("gel;", 0x22db),
+  CHAR_REF("geq;", 0x2265),
+  CHAR_REF("geqq;", 0x2267),
+  CHAR_REF("geqslant;", 0x2a7e),
+  CHAR_REF("ges;", 0x2a7e),
+  CHAR_REF("gescc;", 0x2aa9),
+  CHAR_REF("gesdot;", 0x2a80),
+  CHAR_REF("gesdoto;", 0x2a82),
+  CHAR_REF("gesdotol;", 0x2a84),
+  MULTI_CHAR_REF("gesl;", 0x22db, 0xfe00),
+  CHAR_REF("gesles;", 0x2a94),
+  CHAR_REF("gfr;", 0x0001d524),
+  CHAR_REF("gg;", 0x226b),
+  CHAR_REF("ggg;", 0x22d9),
+  CHAR_REF("gimel;", 0x2137),
+  CHAR_REF("gjcy;", 0x0453),
+  CHAR_REF("gl;", 0x2277),
+  CHAR_REF("glE;", 0x2a92),
+  CHAR_REF("gla;", 0x2aa5),
+  CHAR_REF("glj;", 0x2aa4),
+  CHAR_REF("gnE;", 0x2269),
+  CHAR_REF("gnap;", 0x2a8a),
+  CHAR_REF("gnapprox;", 0x2a8a),
+  CHAR_REF("gne;", 0x2a88),
+  CHAR_REF("gneq;", 0x2a88),
+  CHAR_REF("gneqq;", 0x2269),
+  CHAR_REF("gnsim;", 0x22e7),
+  CHAR_REF("gopf;", 0x0001d558),
+  CHAR_REF("grave;", 0x60),
+  CHAR_REF("gscr;", 0x210a),
+  CHAR_REF("gsim;", 0x2273),
+  CHAR_REF("gsime;", 0x2a8e),
+  CHAR_REF("gsiml;", 0x2a90),
+  CHAR_REF("gt;", 0x3e),
+  CHAR_REF("gt", 0x3e),
+  CHAR_REF("gtcc;", 0x2aa7),
+  CHAR_REF("gtcir;", 0x2a7a),
+  CHAR_REF("gtdot;", 0x22d7),
+  CHAR_REF("gtlPar;", 0x2995),
+  CHAR_REF("gtquest;", 0x2a7c),
+  CHAR_REF("gtrapprox;", 0x2a86),
+  CHAR_REF("gtrarr;", 0x2978),
+  CHAR_REF("gtrdot;", 0x22d7),
+  CHAR_REF("gtreqless;", 0x22db),
+  CHAR_REF("gtreqqless;", 0x2a8c),
+  CHAR_REF("gtrless;", 0x2277),
+  CHAR_REF("gtrsim;", 0x2273),
+  MULTI_CHAR_REF("gvertneqq;", 0x2269, 0xfe00),
+  MULTI_CHAR_REF("gvnE;", 0x2269, 0xfe00),
+  CHAR_REF("hArr;", 0x21d4),
+  CHAR_REF("hairsp;", 0x200a),
+  CHAR_REF("half;", 0xbd),
+  CHAR_REF("hamilt;", 0x210b),
+  CHAR_REF("hardcy;", 0x044a),
+  CHAR_REF("harr;", 0x2194),
+  CHAR_REF("harrcir;", 0x2948),
+  CHAR_REF("harrw;", 0x21ad),
+  CHAR_REF("hbar;", 0x210f),
+  CHAR_REF("hcirc;", 0x0125),
+  CHAR_REF("hearts;", 0x2665),
+  CHAR_REF("heartsuit;", 0x2665),
+  CHAR_REF("hellip;", 0x2026),
+  CHAR_REF("hercon;", 0x22b9),
+  CHAR_REF("hfr;", 0x0001d525),
+  CHAR_REF("hksearow;", 0x2925),
+  CHAR_REF("hkswarow;", 0x2926),
+  CHAR_REF("hoarr;", 0x21ff),
+  CHAR_REF("homtht;", 0x223b),
+  CHAR_REF("hookleftarrow;", 0x21a9),
+  CHAR_REF("hookrightarrow;", 0x21aa),
+  CHAR_REF("hopf;", 0x0001d559),
+  CHAR_REF("horbar;", 0x2015),
+  CHAR_REF("hscr;", 0x0001d4bd),
+  CHAR_REF("hslash;", 0x210f),
+  CHAR_REF("hstrok;", 0x0127),
+  CHAR_REF("hybull;", 0x2043),
+  CHAR_REF("hyphen;", 0x2010),
+  CHAR_REF("iacute;", 0xed),
+  CHAR_REF("iacute", 0xed),
+  CHAR_REF("ic;", 0x2063),
+  CHAR_REF("icirc;", 0xee),
+  CHAR_REF("icirc", 0xee),
+  CHAR_REF("icy;", 0x0438),
+  CHAR_REF("iecy;", 0x0435),
+  CHAR_REF("iexcl;", 0xa1),
+  CHAR_REF("iexcl", 0xa1),
+  CHAR_REF("iff;", 0x21d4),
+  CHAR_REF("ifr;", 0x0001d526),
+  CHAR_REF("igrave;", 0xec),
+  CHAR_REF("igrave", 0xec),
+  CHAR_REF("ii;", 0x2148),
+  CHAR_REF("iiiint;", 0x2a0c),
+  CHAR_REF("iiint;", 0x222d),
+  CHAR_REF("iinfin;", 0x29dc),
+  CHAR_REF("iiota;", 0x2129),
+  CHAR_REF("ijlig;", 0x0133),
+  CHAR_REF("imacr;", 0x012b),
+  CHAR_REF("image;", 0x2111),
+  CHAR_REF("imagline;", 0x2110),
+  CHAR_REF("imagpart;", 0x2111),
+  CHAR_REF("imath;", 0x0131),
+  CHAR_REF("imof;", 0x22b7),
+  CHAR_REF("imped;", 0x01b5),
+  CHAR_REF("in;", 0x2208),
+  CHAR_REF("incare;", 0x2105),
+  CHAR_REF("infin;", 0x221e),
+  CHAR_REF("infintie;", 0x29dd),
+  CHAR_REF("inodot;", 0x0131),
+  CHAR_REF("int;", 0x222b),
+  CHAR_REF("intcal;", 0x22ba),
+  CHAR_REF("integers;", 0x2124),
+  CHAR_REF("intercal;", 0x22ba),
+  CHAR_REF("intlarhk;", 0x2a17),
+  CHAR_REF("intprod;", 0x2a3c),
+  CHAR_REF("iocy;", 0x0451),
+  CHAR_REF("iogon;", 0x012f),
+  CHAR_REF("iopf;", 0x0001d55a),
+  CHAR_REF("iota;", 0x03b9),
+  CHAR_REF("iprod;", 0x2a3c),
+  CHAR_REF("iquest;", 0xbf),
+  CHAR_REF("iquest", 0xbf),
+  CHAR_REF("iscr;", 0x0001d4be),
+  CHAR_REF("isin;", 0x2208),
+  CHAR_REF("isinE;", 0x22f9),
+  CHAR_REF("isindot;", 0x22f5),
+  CHAR_REF("isins;", 0x22f4),
+  CHAR_REF("isinsv;", 0x22f3),
+  CHAR_REF("isinv;", 0x2208),
+  CHAR_REF("it;", 0x2062),
+  CHAR_REF("itilde;", 0x0129),
+  CHAR_REF("iukcy;", 0x0456),
+  CHAR_REF("iuml;", 0xef),
+  CHAR_REF("iuml", 0xef),
+  CHAR_REF("jcirc;", 0x0135),
+  CHAR_REF("jcy;", 0x0439),
+  CHAR_REF("jfr;", 0x0001d527),
+  CHAR_REF("jmath;", 0x0237),
+  CHAR_REF("jopf;", 0x0001d55b),
+  CHAR_REF("jscr;", 0x0001d4bf),
+  CHAR_REF("jsercy;", 0x0458),
+  CHAR_REF("jukcy;", 0x0454),
+  CHAR_REF("kappa;", 0x03ba),
+  CHAR_REF("kappav;", 0x03f0),
+  CHAR_REF("kcedil;", 0x0137),
+  CHAR_REF("kcy;", 0x043a),
+  CHAR_REF("kfr;", 0x0001d528),
+  CHAR_REF("kgreen;", 0x0138),
+  CHAR_REF("khcy;", 0x0445),
+  CHAR_REF("kjcy;", 0x045c),
+  CHAR_REF("kopf;", 0x0001d55c),
+  CHAR_REF("kscr;", 0x0001d4c0),
+  CHAR_REF("lAarr;", 0x21da),
+  CHAR_REF("lArr;", 0x21d0),
+  CHAR_REF("lAtail;", 0x291b),
+  CHAR_REF("lBarr;", 0x290e),
+  CHAR_REF("lE;", 0x2266),
+  CHAR_REF("lEg;", 0x2a8b),
+  CHAR_REF("lHar;", 0x2962),
+  CHAR_REF("lacute;", 0x013a),
+  CHAR_REF("laemptyv;", 0x29b4),
+  CHAR_REF("lagran;", 0x2112),
+  CHAR_REF("lambda;", 0x03bb),
+  CHAR_REF("lang;", 0x27e8),
+  CHAR_REF("langd;", 0x2991),
+  CHAR_REF("langle;", 0x27e8),
+  CHAR_REF("lap;", 0x2a85),
+  CHAR_REF("laquo;", 0xab),
+  CHAR_REF("laquo", 0xab),
+  CHAR_REF("larr;", 0x2190),
+  CHAR_REF("larrb;", 0x21e4),
+  CHAR_REF("larrbfs;", 0x291f),
+  CHAR_REF("larrfs;", 0x291d),
+  CHAR_REF("larrhk;", 0x21a9),
+  CHAR_REF("larrlp;", 0x21ab),
+  CHAR_REF("larrpl;", 0x2939),
+  CHAR_REF("larrsim;", 0x2973),
+  CHAR_REF("larrtl;", 0x21a2),
+  CHAR_REF("lat;", 0x2aab),
+  CHAR_REF("latail;", 0x2919),
+  CHAR_REF("late;", 0x2aad),
+  MULTI_CHAR_REF("lates;", 0x2aad, 0xfe00),
+  CHAR_REF("lbarr;", 0x290c),
+  CHAR_REF("lbbrk;", 0x2772),
+  CHAR_REF("lbrace;", 0x7b),
+  CHAR_REF("lbrack;", 0x5b),
+  CHAR_REF("lbrke;", 0x298b),
+  CHAR_REF("lbrksld;", 0x298f),
+  CHAR_REF("lbrkslu;", 0x298d),
+  CHAR_REF("lcaron;", 0x013e),
+  CHAR_REF("lcedil;", 0x013c),
+  CHAR_REF("lceil;", 0x2308),
+  CHAR_REF("lcub;", 0x7b),
+  CHAR_REF("lcy;", 0x043b),
+  CHAR_REF("ldca;", 0x2936),
+  CHAR_REF("ldquo;", 0x201c),
+  CHAR_REF("ldquor;", 0x201e),
+  CHAR_REF("ldrdhar;", 0x2967),
+  CHAR_REF("ldrushar;", 0x294b),
+  CHAR_REF("ldsh;", 0x21b2),
+  CHAR_REF("le;", 0x2264),
+  CHAR_REF("leftarrow;", 0x2190),
+  CHAR_REF("leftarrowtail;", 0x21a2),
+  CHAR_REF("leftharpoondown;", 0x21bd),
+  CHAR_REF("leftharpoonup;", 0x21bc),
+  CHAR_REF("leftleftarrows;", 0x21c7),
+  CHAR_REF("leftrightarrow;", 0x2194),
+  CHAR_REF("leftrightarrows;", 0x21c6),
+  CHAR_REF("leftrightharpoons;", 0x21cb),
+  CHAR_REF("leftrightsquigarrow;", 0x21ad),
+  CHAR_REF("leftthreetimes;", 0x22cb),
+  CHAR_REF("leg;", 0x22da),
+  CHAR_REF("leq;", 0x2264),
+  CHAR_REF("leqq;", 0x2266),
+  CHAR_REF("leqslant;", 0x2a7d),
+  CHAR_REF("les;", 0x2a7d),
+  CHAR_REF("lescc;", 0x2aa8),
+  CHAR_REF("lesdot;", 0x2a7f),
+  CHAR_REF("lesdoto;", 0x2a81),
+  CHAR_REF("lesdotor;", 0x2a83),
+  MULTI_CHAR_REF("lesg;", 0x22da, 0xfe00),
+  CHAR_REF("lesges;", 0x2a93),
+  CHAR_REF("lessapprox;", 0x2a85),
+  CHAR_REF("lessdot;", 0x22d6),
+  CHAR_REF("lesseqgtr;", 0x22da),
+  CHAR_REF("lesseqqgtr;", 0x2a8b),
+  CHAR_REF("lessgtr;", 0x2276),
+  CHAR_REF("lesssim;", 0x2272),
+  CHAR_REF("lfisht;", 0x297c),
+  CHAR_REF("lfloor;", 0x230a),
+  CHAR_REF("lfr;", 0x0001d529),
+  CHAR_REF("lg;", 0x2276),
+  CHAR_REF("lgE;", 0x2a91),
+  CHAR_REF("lhard;", 0x21bd),
+  CHAR_REF("lharu;", 0x21bc),
+  CHAR_REF("lharul;", 0x296a),
+  CHAR_REF("lhblk;", 0x2584),
+  CHAR_REF("ljcy;", 0x0459),
+  CHAR_REF("ll;", 0x226a),
+  CHAR_REF("llarr;", 0x21c7),
+  CHAR_REF("llcorner;", 0x231e),
+  CHAR_REF("llhard;", 0x296b),
+  CHAR_REF("lltri;", 0x25fa),
+  CHAR_REF("lmidot;", 0x0140),
+  CHAR_REF("lmoust;", 0x23b0),
+  CHAR_REF("lmoustache;", 0x23b0),
+  CHAR_REF("lnE;", 0x2268),
+  CHAR_REF("lnap;", 0x2a89),
+  CHAR_REF("lnapprox;", 0x2a89),
+  CHAR_REF("lne;", 0x2a87),
+  CHAR_REF("lneq;", 0x2a87),
+  CHAR_REF("lneqq;", 0x2268),
+  CHAR_REF("lnsim;", 0x22e6),
+  CHAR_REF("loang;", 0x27ec),
+  CHAR_REF("loarr;", 0x21fd),
+  CHAR_REF("lobrk;", 0x27e6),
+  CHAR_REF("longleftarrow;", 0x27f5),
+  CHAR_REF("longleftrightarrow;", 0x27f7),
+  CHAR_REF("longmapsto;", 0x27fc),
+  CHAR_REF("longrightarrow;", 0x27f6),
+  CHAR_REF("looparrowleft;", 0x21ab),
+  CHAR_REF("looparrowright;", 0x21ac),
+  CHAR_REF("lopar;", 0x2985),
+  CHAR_REF("lopf;", 0x0001d55d),
+  CHAR_REF("loplus;", 0x2a2d),
+  CHAR_REF("lotimes;", 0x2a34),
+  CHAR_REF("lowast;", 0x2217),
+  CHAR_REF("lowbar;", 0x5f),
+  CHAR_REF("loz;", 0x25ca),
+  CHAR_REF("lozenge;", 0x25ca),
+  CHAR_REF("lozf;", 0x29eb),
+  CHAR_REF("lpar;", 0x28),
+  CHAR_REF("lparlt;", 0x2993),
+  CHAR_REF("lrarr;", 0x21c6),
+  CHAR_REF("lrcorner;", 0x231f),
+  CHAR_REF("lrhar;", 0x21cb),
+  CHAR_REF("lrhard;", 0x296d),
+  CHAR_REF("lrm;", 0x200e),
+  CHAR_REF("lrtri;", 0x22bf),
+  CHAR_REF("lsaquo;", 0x2039),
+  CHAR_REF("lscr;", 0x0001d4c1),
+  CHAR_REF("lsh;", 0x21b0),
+  CHAR_REF("lsim;", 0x2272),
+  CHAR_REF("lsime;", 0x2a8d),
+  CHAR_REF("lsimg;", 0x2a8f),
+  CHAR_REF("lsqb;", 0x5b),
+  CHAR_REF("lsquo;", 0x2018),
+  CHAR_REF("lsquor;", 0x201a),
+  CHAR_REF("lstrok;", 0x0142),
+  CHAR_REF("lt;", 0x3c),
+  CHAR_REF("lt", 0x3c),
+  CHAR_REF("ltcc;", 0x2aa6),
+  CHAR_REF("ltcir;", 0x2a79),
+  CHAR_REF("ltdot;", 0x22d6),
+  CHAR_REF("lthree;", 0x22cb),
+  CHAR_REF("ltimes;", 0x22c9),
+  CHAR_REF("ltlarr;", 0x2976),
+  CHAR_REF("ltquest;", 0x2a7b),
+  CHAR_REF("ltrPar;", 0x2996),
+  CHAR_REF("ltri;", 0x25c3),
+  CHAR_REF("ltrie;", 0x22b4),
+  CHAR_REF("ltrif;", 0x25c2),
+  CHAR_REF("lurdshar;", 0x294a),
+  CHAR_REF("luruhar;", 0x2966),
+  MULTI_CHAR_REF("lvertneqq;", 0x2268, 0xfe00),
+  MULTI_CHAR_REF("lvnE;", 0x2268, 0xfe00),
+  CHAR_REF("mDDot;", 0x223a),
+  CHAR_REF("macr;", 0xaf),
+  CHAR_REF("macr", 0xaf),
+  CHAR_REF("male;", 0x2642),
+  CHAR_REF("malt;", 0x2720),
+  CHAR_REF("maltese;", 0x2720),
+  CHAR_REF("map;", 0x21a6),
+  CHAR_REF("mapsto;", 0x21a6),
+  CHAR_REF("mapstodown;", 0x21a7),
+  CHAR_REF("mapstoleft;", 0x21a4),
+  CHAR_REF("mapstoup;", 0x21a5),
+  CHAR_REF("marker;", 0x25ae),
+  CHAR_REF("mcomma;", 0x2a29),
+  CHAR_REF("mcy;", 0x043c),
+  CHAR_REF("mdash;", 0x2014),
+  CHAR_REF("measuredangle;", 0x2221),
+  CHAR_REF("mfr;", 0x0001d52a),
+  CHAR_REF("mho;", 0x2127),
+  CHAR_REF("micro;", 0xb5),
+  CHAR_REF("micro", 0xb5),
+  CHAR_REF("mid;", 0x2223),
+  CHAR_REF("midast;", 0x2a),
+  CHAR_REF("midcir;", 0x2af0),
+  CHAR_REF("middot;", 0xb7),
+  CHAR_REF("middot", 0xb7),
+  CHAR_REF("minus;", 0x2212),
+  CHAR_REF("minusb;", 0x229f),
+  CHAR_REF("minusd;", 0x2238),
+  CHAR_REF("minusdu;", 0x2a2a),
+  CHAR_REF("mlcp;", 0x2adb),
+  CHAR_REF("mldr;", 0x2026),
+  CHAR_REF("mnplus;", 0x2213),
+  CHAR_REF("models;", 0x22a7),
+  CHAR_REF("mopf;", 0x0001d55e),
+  CHAR_REF("mp;", 0x2213),
+  CHAR_REF("mscr;", 0x0001d4c2),
+  CHAR_REF("mstpos;", 0x223e),
+  CHAR_REF("mu;", 0x03bc),
+  CHAR_REF("multimap;", 0x22b8),
+  CHAR_REF("mumap;", 0x22b8),
+  MULTI_CHAR_REF("nGg;", 0x22d9, 0x0338),
+  MULTI_CHAR_REF("nGt;", 0x226b, 0x20d2),
+  MULTI_CHAR_REF("nGtv;", 0x226b, 0x0338),
+  CHAR_REF("nLeftarrow;", 0x21cd),
+  CHAR_REF("nLeftrightarrow;", 0x21ce),
+  MULTI_CHAR_REF("nLl;", 0x22d8, 0x0338),
+  MULTI_CHAR_REF("nLt;", 0x226a, 0x20d2),
+  MULTI_CHAR_REF("nLtv;", 0x226a, 0x0338),
+  CHAR_REF("nRightarrow;", 0x21cf),
+  CHAR_REF("nVDash;", 0x22af),
+  CHAR_REF("nVdash;", 0x22ae),
+  CHAR_REF("nabla;", 0x2207),
+  CHAR_REF("nacute;", 0x0144),
+  MULTI_CHAR_REF("nang;", 0x2220, 0x20d2),
+  CHAR_REF("nap;", 0x2249),
+  MULTI_CHAR_REF("napE;", 0x2a70, 0x0338),
+  MULTI_CHAR_REF("napid;", 0x224b, 0x0338),
+  CHAR_REF("napos;", 0x0149),
+  CHAR_REF("napprox;", 0x2249),
+  CHAR_REF("natur;", 0x266e),
+  CHAR_REF("natural;", 0x266e),
+  CHAR_REF("naturals;", 0x2115),
+  CHAR_REF("nbsp;", 0xa0),
+  CHAR_REF("nbsp", 0xa0),
+  MULTI_CHAR_REF("nbump;", 0x224e, 0x0338),
+  MULTI_CHAR_REF("nbumpe;", 0x224f, 0x0338),
+  CHAR_REF("ncap;", 0x2a43),
+  CHAR_REF("ncaron;", 0x0148),
+  CHAR_REF("ncedil;", 0x0146),
+  CHAR_REF("ncong;", 0x2247),
+  MULTI_CHAR_REF("ncongdot;", 0x2a6d, 0x0338),
+  CHAR_REF("ncup;", 0x2a42),
+  CHAR_REF("ncy;", 0x043d),
+  CHAR_REF("ndash;", 0x2013),
+  CHAR_REF("ne;", 0x2260),
+  CHAR_REF("neArr;", 0x21d7),
+  CHAR_REF("nearhk;", 0x2924),
+  CHAR_REF("nearr;", 0x2197),
+  CHAR_REF("nearrow;", 0x2197),
+  MULTI_CHAR_REF("nedot;", 0x2250, 0x0338),
+  CHAR_REF("nequiv;", 0x2262),
+  CHAR_REF("nesear;", 0x2928),
+  MULTI_CHAR_REF("nesim;", 0x2242, 0x0338),
+  CHAR_REF("nexist;", 0x2204),
+  CHAR_REF("nexists;", 0x2204),
+  CHAR_REF("nfr;", 0x0001d52b),
+  MULTI_CHAR_REF("ngE;", 0x2267, 0x0338),
+  CHAR_REF("nge;", 0x2271),
+  CHAR_REF("ngeq;", 0x2271),
+  MULTI_CHAR_REF("ngeqq;", 0x2267, 0x0338),
+  MULTI_CHAR_REF("ngeqslant;", 0x2a7e, 0x0338),
+  MULTI_CHAR_REF("nges;", 0x2a7e, 0x0338),
+  CHAR_REF("ngsim;", 0x2275),
+  CHAR_REF("ngt;", 0x226f),
+  CHAR_REF("ngtr;", 0x226f),
+  CHAR_REF("nhArr;", 0x21ce),
+  CHAR_REF("nharr;", 0x21ae),
+  CHAR_REF("nhpar;", 0x2af2),
+  CHAR_REF("ni;", 0x220b),
+  CHAR_REF("nis;", 0x22fc),
+  CHAR_REF("nisd;", 0x22fa),
+  CHAR_REF("niv;", 0x220b),
+  CHAR_REF("njcy;", 0x045a),
+  CHAR_REF("nlArr;", 0x21cd),
+  MULTI_CHAR_REF("nlE;", 0x2266, 0x0338),
+  CHAR_REF("nlarr;", 0x219a),
+  CHAR_REF("nldr;", 0x2025),
+  CHAR_REF("nle;", 0x2270),
+  CHAR_REF("nleftarrow;", 0x219a),
+  CHAR_REF("nleftrightarrow;", 0x21ae),
+  CHAR_REF("nleq;", 0x2270),
+  MULTI_CHAR_REF("nleqq;", 0x2266, 0x0338),
+  MULTI_CHAR_REF("nleqslant;", 0x2a7d, 0x0338),
+  MULTI_CHAR_REF("nles;", 0x2a7d, 0x0338),
+  CHAR_REF("nless;", 0x226e),
+  CHAR_REF("nlsim;", 0x2274),
+  CHAR_REF("nlt;", 0x226e),
+  CHAR_REF("nltri;", 0x22ea),
+  CHAR_REF("nltrie;", 0x22ec),
+  CHAR_REF("nmid;", 0x2224),
+  CHAR_REF("nopf;", 0x0001d55f),
+  CHAR_REF("not;", 0xac),
+  CHAR_REF("notin;", 0x2209),
+  MULTI_CHAR_REF("notinE;", 0x22f9, 0x0338),
+  MULTI_CHAR_REF("notindot;", 0x22f5, 0x0338),
+  CHAR_REF("notinva;", 0x2209),
+  CHAR_REF("notinvb;", 0x22f7),
+  CHAR_REF("notinvc;", 0x22f6),
+  CHAR_REF("notni;", 0x220c),
+  CHAR_REF("notniva;", 0x220c),
+  CHAR_REF("notnivb;", 0x22fe),
+  CHAR_REF("notnivc;", 0x22fd),
+  CHAR_REF("not", 0xac),
+  CHAR_REF("npar;", 0x2226),
+  CHAR_REF("nparallel;", 0x2226),
+  MULTI_CHAR_REF("nparsl;", 0x2afd, 0x20e5),
+  MULTI_CHAR_REF("npart;", 0x2202, 0x0338),
+  CHAR_REF("npolint;", 0x2a14),
+  CHAR_REF("npr;", 0x2280),
+  CHAR_REF("nprcue;", 0x22e0),
+  MULTI_CHAR_REF("npre;", 0x2aaf, 0x0338),
+  CHAR_REF("nprec;", 0x2280),
+  MULTI_CHAR_REF("npreceq;", 0x2aaf, 0x0338),
+  CHAR_REF("nrArr;", 0x21cf),
+  CHAR_REF("nrarr;", 0x219b),
+  MULTI_CHAR_REF("nrarrc;", 0x2933, 0x0338),
+  MULTI_CHAR_REF("nrarrw;", 0x219d, 0x0338),
+  CHAR_REF("nrightarrow;", 0x219b),
+  CHAR_REF("nrtri;", 0x22eb),
+  CHAR_REF("nrtrie;", 0x22ed),
+  CHAR_REF("nsc;", 0x2281),
+  CHAR_REF("nsccue;", 0x22e1),
+  MULTI_CHAR_REF("nsce;", 0x2ab0, 0x0338),
+  CHAR_REF("nscr;", 0x0001d4c3),
+  CHAR_REF("nshortmid;", 0x2224),
+  CHAR_REF("nshortparallel;", 0x2226),
+  CHAR_REF("nsim;", 0x2241),
+  CHAR_REF("nsime;", 0x2244),
+  CHAR_REF("nsimeq;", 0x2244),
+  CHAR_REF("nsmid;", 0x2224),
+  CHAR_REF("nspar;", 0x2226),
+  CHAR_REF("nsqsube;", 0x22e2),
+  CHAR_REF("nsqsupe;", 0x22e3),
+  CHAR_REF("nsub;", 0x2284),
+  MULTI_CHAR_REF("nsubE;", 0x2ac5, 0x0338),
+  CHAR_REF("nsube;", 0x2288),
+  MULTI_CHAR_REF("nsubset;", 0x2282, 0x20d2),
+  CHAR_REF("nsubseteq;", 0x2288),
+  MULTI_CHAR_REF("nsubseteqq;", 0x2ac5, 0x0338),
+  CHAR_REF("nsucc;", 0x2281),
+  MULTI_CHAR_REF("nsucceq;", 0x2ab0, 0x0338),
+  CHAR_REF("nsup;", 0x2285),
+  MULTI_CHAR_REF("nsupE;", 0x2ac6, 0x0338),
+  CHAR_REF("nsupe;", 0x2289),
+  MULTI_CHAR_REF("nsupset;", 0x2283, 0x20d2),
+  CHAR_REF("nsupseteq;", 0x2289),
+  MULTI_CHAR_REF("nsupseteqq;", 0x2ac6, 0x0338),
+  CHAR_REF("ntgl;", 0x2279),
+  CHAR_REF("ntilde;", 0xf1),
+  CHAR_REF("ntilde", 0xf1),
+  CHAR_REF("ntlg;", 0x2278),
+  CHAR_REF("ntriangleleft;", 0x22ea),
+  CHAR_REF("ntrianglelefteq;", 0x22ec),
+  CHAR_REF("ntriangleright;", 0x22eb),
+  CHAR_REF("ntrianglerighteq;", 0x22ed),
+  CHAR_REF("nu;", 0x03bd),
+  CHAR_REF("num;", 0x23),
+  CHAR_REF("numero;", 0x2116),
+  CHAR_REF("numsp;", 0x2007),
+  CHAR_REF("nvDash;", 0x22ad),
+  CHAR_REF("nvHarr;", 0x2904),
+  MULTI_CHAR_REF("nvap;", 0x224d, 0x20d2),
+  CHAR_REF("nvdash;", 0x22ac),
+  MULTI_CHAR_REF("nvge;", 0x2265, 0x20d2),
+  MULTI_CHAR_REF("nvgt;", 0x3e, 0x20d2),
+  CHAR_REF("nvinfin;", 0x29de),
+  CHAR_REF("nvlArr;", 0x2902),
+  MULTI_CHAR_REF("nvle;", 0x2264, 0x20d2),
+  MULTI_CHAR_REF("nvlt;", 0x3c, 0x20d2),
+  MULTI_CHAR_REF("nvltrie;", 0x22b4, 0x20d2),
+  CHAR_REF("nvrArr;", 0x2903),
+  MULTI_CHAR_REF("nvrtrie;", 0x22b5, 0x20d2),
+  MULTI_CHAR_REF("nvsim;", 0x223c, 0x20d2),
+  CHAR_REF("nwArr;", 0x21d6),
+  CHAR_REF("nwarhk;", 0x2923),
+  CHAR_REF("nwarr;", 0x2196),
+  CHAR_REF("nwarrow;", 0x2196),
+  CHAR_REF("nwnear;", 0x2927),
+  CHAR_REF("oS;", 0x24c8),
+  CHAR_REF("oacute;", 0xf3),
+  CHAR_REF("oacute", 0xf3),
+  CHAR_REF("oast;", 0x229b),
+  CHAR_REF("ocir;", 0x229a),
+  CHAR_REF("ocirc;", 0xf4),
+  CHAR_REF("ocirc", 0xf4),
+  CHAR_REF("ocy;", 0x043e),
+  CHAR_REF("odash;", 0x229d),
+  CHAR_REF("odblac;", 0x0151),
+  CHAR_REF("odiv;", 0x2a38),
+  CHAR_REF("odot;", 0x2299),
+  CHAR_REF("odsold;", 0x29bc),
+  CHAR_REF("oelig;", 0x0153),
+  CHAR_REF("ofcir;", 0x29bf),
+  CHAR_REF("ofr;", 0x0001d52c),
+  CHAR_REF("ogon;", 0x02db),
+  CHAR_REF("ograve;", 0xf2),
+  CHAR_REF("ograve", 0xf2),
+  CHAR_REF("ogt;", 0x29c1),
+  CHAR_REF("ohbar;", 0x29b5),
+  CHAR_REF("ohm;", 0x03a9),
+  CHAR_REF("oint;", 0x222e),
+  CHAR_REF("olarr;", 0x21ba),
+  CHAR_REF("olcir;", 0x29be),
+  CHAR_REF("olcross;", 0x29bb),
+  CHAR_REF("oline;", 0x203e),
+  CHAR_REF("olt;", 0x29c0),
+  CHAR_REF("omacr;", 0x014d),
+  CHAR_REF("omega;", 0x03c9),
+  CHAR_REF("omicron;", 0x03bf),
+  CHAR_REF("omid;", 0x29b6),
+  CHAR_REF("ominus;", 0x2296),
+  CHAR_REF("oopf;", 0x0001d560),
+  CHAR_REF("opar;", 0x29b7),
+  CHAR_REF("operp;", 0x29b9),
+  CHAR_REF("oplus;", 0x2295),
+  CHAR_REF("or;", 0x2228),
+  CHAR_REF("orarr;", 0x21bb),
+  CHAR_REF("ord;", 0x2a5d),
+  CHAR_REF("order;", 0x2134),
+  CHAR_REF("orderof;", 0x2134),
+  CHAR_REF("ordf;", 0xaa),
+  CHAR_REF("ordf", 0xaa),
+  CHAR_REF("ordm;", 0xba),
+  CHAR_REF("ordm", 0xba),
+  CHAR_REF("origof;", 0x22b6),
+  CHAR_REF("oror;", 0x2a56),
+  CHAR_REF("orslope;", 0x2a57),
+  CHAR_REF("orv;", 0x2a5b),
+  CHAR_REF("oscr;", 0x2134),
+  CHAR_REF("oslash;", 0xf8),
+  CHAR_REF("oslash", 0xf8),
+  CHAR_REF("osol;", 0x2298),
+  CHAR_REF("otilde;", 0xf5),
+  CHAR_REF("otilde", 0xf5),
+  CHAR_REF("otimes;", 0x2297),
+  CHAR_REF("otimesas;", 0x2a36),
+  CHAR_REF("ouml;", 0xf6),
+  CHAR_REF("ouml", 0xf6),
+  CHAR_REF("ovbar;", 0x233d),
+  CHAR_REF("par;", 0x2225),
+  CHAR_REF("para;", 0xb6),
+  CHAR_REF("para", 0xb6),
+  CHAR_REF("parallel;", 0x2225),
+  CHAR_REF("parsim;", 0x2af3),
+  CHAR_REF("parsl;", 0x2afd),
+  CHAR_REF("part;", 0x2202),
+  CHAR_REF("pcy;", 0x043f),
+  CHAR_REF("percnt;", 0x25),
+  CHAR_REF("period;", 0x2e),
+  CHAR_REF("permil;", 0x2030),
+  CHAR_REF("perp;", 0x22a5),
+  CHAR_REF("pertenk;", 0x2031),
+  CHAR_REF("pfr;", 0x0001d52d),
+  CHAR_REF("phi;", 0x03c6),
+  CHAR_REF("phiv;", 0x03d5),
+  CHAR_REF("phmmat;", 0x2133),
+  CHAR_REF("phone;", 0x260e),
+  CHAR_REF("pi;", 0x03c0),
+  CHAR_REF("pitchfork;", 0x22d4),
+  CHAR_REF("piv;", 0x03d6),
+  CHAR_REF("planck;", 0x210f),
+  CHAR_REF("planckh;", 0x210e),
+  CHAR_REF("plankv;", 0x210f),
+  CHAR_REF("plus;", 0x2b),
+  CHAR_REF("plusacir;", 0x2a23),
+  CHAR_REF("plusb;", 0x229e),
+  CHAR_REF("pluscir;", 0x2a22),
+  CHAR_REF("plusdo;", 0x2214),
+  CHAR_REF("plusdu;", 0x2a25),
+  CHAR_REF("pluse;", 0x2a72),
+  CHAR_REF("plusmn;", 0xb1),
+  CHAR_REF("plusmn", 0xb1),
+  CHAR_REF("plussim;", 0x2a26),
+  CHAR_REF("plustwo;", 0x2a27),
+  CHAR_REF("pm;", 0xb1),
+  CHAR_REF("pointint;", 0x2a15),
+  CHAR_REF("popf;", 0x0001d561),
+  CHAR_REF("pound;", 0xa3),
+  CHAR_REF("pound", 0xa3),
+  CHAR_REF("pr;", 0x227a),
+  CHAR_REF("prE;", 0x2ab3),
+  CHAR_REF("prap;", 0x2ab7),
+  CHAR_REF("prcue;", 0x227c),
+  CHAR_REF("pre;", 0x2aaf),
+  CHAR_REF("prec;", 0x227a),
+  CHAR_REF("precapprox;", 0x2ab7),
+  CHAR_REF("preccurlyeq;", 0x227c),
+  CHAR_REF("preceq;", 0x2aaf),
+  CHAR_REF("precnapprox;", 0x2ab9),
+  CHAR_REF("precneqq;", 0x2ab5),
+  CHAR_REF("precnsim;", 0x22e8),
+  CHAR_REF("precsim;", 0x227e),
+  CHAR_REF("prime;", 0x2032),
+  CHAR_REF("primes;", 0x2119),
+  CHAR_REF("prnE;", 0x2ab5),
+  CHAR_REF("prnap;", 0x2ab9),
+  CHAR_REF("prnsim;", 0x22e8),
+  CHAR_REF("prod;", 0x220f),
+  CHAR_REF("profalar;", 0x232e),
+  CHAR_REF("profline;", 0x2312),
+  CHAR_REF("profsurf;", 0x2313),
+  CHAR_REF("prop;", 0x221d),
+  CHAR_REF("propto;", 0x221d),
+  CHAR_REF("prsim;", 0x227e),
+  CHAR_REF("prurel;", 0x22b0),
+  CHAR_REF("pscr;", 0x0001d4c5),
+  CHAR_REF("psi;", 0x03c8),
+  CHAR_REF("puncsp;", 0x2008),
+  CHAR_REF("qfr;", 0x0001d52e),
+  CHAR_REF("qint;", 0x2a0c),
+  CHAR_REF("qopf;", 0x0001d562),
+  CHAR_REF("qprime;", 0x2057),
+  CHAR_REF("qscr;", 0x0001d4c6),
+  CHAR_REF("quaternions;", 0x210d),
+  CHAR_REF("quatint;", 0x2a16),
+  CHAR_REF("quest;", 0x3f),
+  CHAR_REF("questeq;", 0x225f),
+  CHAR_REF("quot;", 0x22),
+  CHAR_REF("quot", 0x22),
+  CHAR_REF("rAarr;", 0x21db),
+  CHAR_REF("rArr;", 0x21d2),
+  CHAR_REF("rAtail;", 0x291c),
+  CHAR_REF("rBarr;", 0x290f),
+  CHAR_REF("rHar;", 0x2964),
+  MULTI_CHAR_REF("race;", 0x223d, 0x0331),
+  CHAR_REF("racute;", 0x0155),
+  CHAR_REF("radic;", 0x221a),
+  CHAR_REF("raemptyv;", 0x29b3),
+  CHAR_REF("rang;", 0x27e9),
+  CHAR_REF("rangd;", 0x2992),
+  CHAR_REF("range;", 0x29a5),
+  CHAR_REF("rangle;", 0x27e9),
+  CHAR_REF("raquo;", 0xbb),
+  CHAR_REF("raquo", 0xbb),
+  CHAR_REF("rarr;", 0x2192),
+  CHAR_REF("rarrap;", 0x2975),
+  CHAR_REF("rarrb;", 0x21e5),
+  CHAR_REF("rarrbfs;", 0x2920),
+  CHAR_REF("rarrc;", 0x2933),
+  CHAR_REF("rarrfs;", 0x291e),
+  CHAR_REF("rarrhk;", 0x21aa),
+  CHAR_REF("rarrlp;", 0x21ac),
+  CHAR_REF("rarrpl;", 0x2945),
+  CHAR_REF("rarrsim;", 0x2974),
+  CHAR_REF("rarrtl;", 0x21a3),
+  CHAR_REF("rarrw;", 0x219d),
+  CHAR_REF("ratail;", 0x291a),
+  CHAR_REF("ratio;", 0x2236),
+  CHAR_REF("rationals;", 0x211a),
+  CHAR_REF("rbarr;", 0x290d),
+  CHAR_REF("rbbrk;", 0x2773),
+  CHAR_REF("rbrace;", 0x7d),
+  CHAR_REF("rbrack;", 0x5d),
+  CHAR_REF("rbrke;", 0x298c),
+  CHAR_REF("rbrksld;", 0x298e),
+  CHAR_REF("rbrkslu;", 0x2990),
+  CHAR_REF("rcaron;", 0x0159),
+  CHAR_REF("rcedil;", 0x0157),
+  CHAR_REF("rceil;", 0x2309),
+  CHAR_REF("rcub;", 0x7d),
+  CHAR_REF("rcy;", 0x0440),
+  CHAR_REF("rdca;", 0x2937),
+  CHAR_REF("rdldhar;", 0x2969),
+  CHAR_REF("rdquo;", 0x201d),
+  CHAR_REF("rdquor;", 0x201d),
+  CHAR_REF("rdsh;", 0x21b3),
+  CHAR_REF("real;", 0x211c),
+  CHAR_REF("realine;", 0x211b),
+  CHAR_REF("realpart;", 0x211c),
+  CHAR_REF("reals;", 0x211d),
+  CHAR_REF("rect;", 0x25ad),
+  CHAR_REF("reg;", 0xae),
+  CHAR_REF("reg", 0xae),
+  CHAR_REF("rfisht;", 0x297d),
+  CHAR_REF("rfloor;", 0x230b),
+  CHAR_REF("rfr;", 0x0001d52f),
+  CHAR_REF("rhard;", 0x21c1),
+  CHAR_REF("rharu;", 0x21c0),
+  CHAR_REF("rharul;", 0x296c),
+  CHAR_REF("rho;", 0x03c1),
+  CHAR_REF("rhov;", 0x03f1),
+  CHAR_REF("rightarrow;", 0x2192),
+  CHAR_REF("rightarrowtail;", 0x21a3),
+  CHAR_REF("rightharpoondown;", 0x21c1),
+  CHAR_REF("rightharpoonup;", 0x21c0),
+  CHAR_REF("rightleftarrows;", 0x21c4),
+  CHAR_REF("rightleftharpoons;", 0x21cc),
+  CHAR_REF("rightrightarrows;", 0x21c9),
+  CHAR_REF("rightsquigarrow;", 0x219d),
+  CHAR_REF("rightthreetimes;", 0x22cc),
+  CHAR_REF("ring;", 0x02da),
+  CHAR_REF("risingdotseq;", 0x2253),
+  CHAR_REF("rlarr;", 0x21c4),
+  CHAR_REF("rlhar;", 0x21cc),
+  CHAR_REF("rlm;", 0x200f),
+  CHAR_REF("rmoust;", 0x23b1),
+  CHAR_REF("rmoustache;", 0x23b1),
+  CHAR_REF("rnmid;", 0x2aee),
+  CHAR_REF("roang;", 0x27ed),
+  CHAR_REF("roarr;", 0x21fe),
+  CHAR_REF("robrk;", 0x27e7),
+  CHAR_REF("ropar;", 0x2986),
+  CHAR_REF("ropf;", 0x0001d563),
+  CHAR_REF("roplus;", 0x2a2e),
+  CHAR_REF("rotimes;", 0x2a35),
+  CHAR_REF("rpar;", 0x29),
+  CHAR_REF("rpargt;", 0x2994),
+  CHAR_REF("rppolint;", 0x2a12),
+  CHAR_REF("rrarr;", 0x21c9),
+  CHAR_REF("rsaquo;", 0x203a),
+  CHAR_REF("rscr;", 0x0001d4c7),
+  CHAR_REF("rsh;", 0x21b1),
+  CHAR_REF("rsqb;", 0x5d),
+  CHAR_REF("rsquo;", 0x2019),
+  CHAR_REF("rsquor;", 0x2019),
+  CHAR_REF("rthree;", 0x22cc),
+  CHAR_REF("rtimes;", 0x22ca),
+  CHAR_REF("rtri;", 0x25b9),
+  CHAR_REF("rtrie;", 0x22b5),
+  CHAR_REF("rtrif;", 0x25b8),
+  CHAR_REF("rtriltri;", 0x29ce),
+  CHAR_REF("ruluhar;", 0x2968),
+  CHAR_REF("rx;", 0x211e),
+  CHAR_REF("sacute;", 0x015b),
+  CHAR_REF("sbquo;", 0x201a),
+  CHAR_REF("sc;", 0x227b),
+  CHAR_REF("scE;", 0x2ab4),
+  CHAR_REF("scap;", 0x2ab8),
+  CHAR_REF("scaron;", 0x0161),
+  CHAR_REF("sccue;", 0x227d),
+  CHAR_REF("sce;", 0x2ab0),
+  CHAR_REF("scedil;", 0x015f),
+  CHAR_REF("scirc;", 0x015d),
+  CHAR_REF("scnE;", 0x2ab6),
+  CHAR_REF("scnap;", 0x2aba),
+  CHAR_REF("scnsim;", 0x22e9),
+  CHAR_REF("scpolint;", 0x2a13),
+  CHAR_REF("scsim;", 0x227f),
+  CHAR_REF("scy;", 0x0441),
+  CHAR_REF("sdot;", 0x22c5),
+  CHAR_REF("sdotb;", 0x22a1),
+  CHAR_REF("sdote;", 0x2a66),
+  CHAR_REF("seArr;", 0x21d8),
+  CHAR_REF("searhk;", 0x2925),
+  CHAR_REF("searr;", 0x2198),
+  CHAR_REF("searrow;", 0x2198),
+  CHAR_REF("sect;", 0xa7),
+  CHAR_REF("sect", 0xa7),
+  CHAR_REF("semi;", 0x3b),
+  CHAR_REF("seswar;", 0x2929),
+  CHAR_REF("setminus;", 0x2216),
+  CHAR_REF("setmn;", 0x2216),
+  CHAR_REF("sext;", 0x2736),
+  CHAR_REF("sfr;", 0x0001d530),
+  CHAR_REF("sfrown;", 0x2322),
+  CHAR_REF("sharp;", 0x266f),
+  CHAR_REF("shchcy;", 0x0449),
+  CHAR_REF("shcy;", 0x0448),
+  CHAR_REF("shortmid;", 0x2223),
+  CHAR_REF("shortparallel;", 0x2225),
+  CHAR_REF("shy;", 0xad),
+  CHAR_REF("shy", 0xad),
+  CHAR_REF("sigma;", 0x03c3),
+  CHAR_REF("sigmaf;", 0x03c2),
+  CHAR_REF("sigmav;", 0x03c2),
+  CHAR_REF("sim;", 0x223c),
+  CHAR_REF("simdot;", 0x2a6a),
+  CHAR_REF("sime;", 0x2243),
+  CHAR_REF("simeq;", 0x2243),
+  CHAR_REF("simg;", 0x2a9e),
+  CHAR_REF("simgE;", 0x2aa0),
+  CHAR_REF("siml;", 0x2a9d),
+  CHAR_REF("simlE;", 0x2a9f),
+  CHAR_REF("simne;", 0x2246),
+  CHAR_REF("simplus;", 0x2a24),
+  CHAR_REF("simrarr;", 0x2972),
+  CHAR_REF("slarr;", 0x2190),
+  CHAR_REF("smallsetminus;", 0x2216),
+  CHAR_REF("smashp;", 0x2a33),
+  CHAR_REF("smeparsl;", 0x29e4),
+  CHAR_REF("smid;", 0x2223),
+  CHAR_REF("smile;", 0x2323),
+  CHAR_REF("smt;", 0x2aaa),
+  CHAR_REF("smte;", 0x2aac),
+  MULTI_CHAR_REF("smtes;", 0x2aac, 0xfe00),
+  CHAR_REF("softcy;", 0x044c),
+  CHAR_REF("sol;", 0x2f),
+  CHAR_REF("solb;", 0x29c4),
+  CHAR_REF("solbar;", 0x233f),
+  CHAR_REF("sopf;", 0x0001d564),
+  CHAR_REF("spades;", 0x2660),
+  CHAR_REF("spadesuit;", 0x2660),
+  CHAR_REF("spar;", 0x2225),
+  CHAR_REF("sqcap;", 0x2293),
+  MULTI_CHAR_REF("sqcaps;", 0x2293, 0xfe00),
+  CHAR_REF("sqcup;", 0x2294),
+  MULTI_CHAR_REF("sqcups;", 0x2294, 0xfe00),
+  CHAR_REF("sqsub;", 0x228f),
+  CHAR_REF("sqsube;", 0x2291),
+  CHAR_REF("sqsubset;", 0x228f),
+  CHAR_REF("sqsubseteq;", 0x2291),
+  CHAR_REF("sqsup;", 0x2290),
+  CHAR_REF("sqsupe;", 0x2292),
+  CHAR_REF("sqsupset;", 0x2290),
+  CHAR_REF("sqsupseteq;", 0x2292),
+  CHAR_REF("squ;", 0x25a1),
+  CHAR_REF("square;", 0x25a1),
+  CHAR_REF("squarf;", 0x25aa),
+  CHAR_REF("squf;", 0x25aa),
+  CHAR_REF("srarr;", 0x2192),
+  CHAR_REF("sscr;", 0x0001d4c8),
+  CHAR_REF("ssetmn;", 0x2216),
+  CHAR_REF("ssmile;", 0x2323),
+  CHAR_REF("sstarf;", 0x22c6),
+  CHAR_REF("star;", 0x2606),
+  CHAR_REF("starf;", 0x2605),
+  CHAR_REF("straightepsilon;", 0x03f5),
+  CHAR_REF("straightphi;", 0x03d5),
+  CHAR_REF("strns;", 0xaf),
+  CHAR_REF("sub;", 0x2282),
+  CHAR_REF("subE;", 0x2ac5),
+  CHAR_REF("subdot;", 0x2abd),
+  CHAR_REF("sube;", 0x2286),
+  CHAR_REF("subedot;", 0x2ac3),
+  CHAR_REF("submult;", 0x2ac1),
+  CHAR_REF("subnE;", 0x2acb),
+  CHAR_REF("subne;", 0x228a),
+  CHAR_REF("subplus;", 0x2abf),
+  CHAR_REF("subrarr;", 0x2979),
+  CHAR_REF("subset;", 0x2282),
+  CHAR_REF("subseteq;", 0x2286),
+  CHAR_REF("subseteqq;", 0x2ac5),
+  CHAR_REF("subsetneq;", 0x228a),
+  CHAR_REF("subsetneqq;", 0x2acb),
+  CHAR_REF("subsim;", 0x2ac7),
+  CHAR_REF("subsub;", 0x2ad5),
+  CHAR_REF("subsup;", 0x2ad3),
+  CHAR_REF("succ;", 0x227b),
+  CHAR_REF("succapprox;", 0x2ab8),
+  CHAR_REF("succcurlyeq;", 0x227d),
+  CHAR_REF("succeq;", 0x2ab0),
+  CHAR_REF("succnapprox;", 0x2aba),
+  CHAR_REF("succneqq;", 0x2ab6),
+  CHAR_REF("succnsim;", 0x22e9),
+  CHAR_REF("succsim;", 0x227f),
+  CHAR_REF("sum;", 0x2211),
+  CHAR_REF("sung;", 0x266a),
+  CHAR_REF("sup1;", 0xb9),
+  CHAR_REF("sup1", 0xb9),
+  CHAR_REF("sup2;", 0xb2),
+  CHAR_REF("sup2", 0xb2),
+  CHAR_REF("sup3;", 0xb3),
+  CHAR_REF("sup3", 0xb3),
+  CHAR_REF("sup;", 0x2283),
+  CHAR_REF("supE;", 0x2ac6),
+  CHAR_REF("supdot;", 0x2abe),
+  CHAR_REF("supdsub;", 0x2ad8),
+  CHAR_REF("supe;", 0x2287),
+  CHAR_REF("supedot;", 0x2ac4),
+  CHAR_REF("suphsol;", 0x27c9),
+  CHAR_REF("suphsub;", 0x2ad7),
+  CHAR_REF("suplarr;", 0x297b),
+  CHAR_REF("supmult;", 0x2ac2),
+  CHAR_REF("supnE;", 0x2acc),
+  CHAR_REF("supne;", 0x228b),
+  CHAR_REF("supplus;", 0x2ac0),
+  CHAR_REF("supset;", 0x2283),
+  CHAR_REF("supseteq;", 0x2287),
+  CHAR_REF("supseteqq;", 0x2ac6),
+  CHAR_REF("supsetneq;", 0x228b),
+  CHAR_REF("supsetneqq;", 0x2acc),
+  CHAR_REF("supsim;", 0x2ac8),
+  CHAR_REF("supsub;", 0x2ad4),
+  CHAR_REF("supsup;", 0x2ad6),
+  CHAR_REF("swArr;", 0x21d9),
+  CHAR_REF("swarhk;", 0x2926),
+  CHAR_REF("swarr;", 0x2199),
+  CHAR_REF("swarrow;", 0x2199),
+  CHAR_REF("swnwar;", 0x292a),
+  CHAR_REF("szlig;", 0xdf),
+  CHAR_REF("szlig", 0xdf),
+  CHAR_REF("target;", 0x2316),
+  CHAR_REF("tau;", 0x03c4),
+  CHAR_REF("tbrk;", 0x23b4),
+  CHAR_REF("tcaron;", 0x0165),
+  CHAR_REF("tcedil;", 0x0163),
+  CHAR_REF("tcy;", 0x0442),
+  CHAR_REF("tdot;", 0x20db),
+  CHAR_REF("telrec;", 0x2315),
+  CHAR_REF("tfr;", 0x0001d531),
+  CHAR_REF("there4;", 0x2234),
+  CHAR_REF("therefore;", 0x2234),
+  CHAR_REF("theta;", 0x03b8),
+  CHAR_REF("thetasym;", 0x03d1),
+  CHAR_REF("thetav;", 0x03d1),
+  CHAR_REF("thickapprox;", 0x2248),
+  CHAR_REF("thicksim;", 0x223c),
+  CHAR_REF("thinsp;", 0x2009),
+  CHAR_REF("thkap;", 0x2248),
+  CHAR_REF("thksim;", 0x223c),
+  CHAR_REF("thorn;", 0xfe),
+  CHAR_REF("thorn", 0xfe),
+  CHAR_REF("tilde;", 0x02dc),
+  CHAR_REF("times;", 0xd7),
+  CHAR_REF("times", 0xd7),
+  CHAR_REF("timesb;", 0x22a0),
+  CHAR_REF("timesbar;", 0x2a31),
+  CHAR_REF("timesd;", 0x2a30),
+  CHAR_REF("tint;", 0x222d),
+  CHAR_REF("toea;", 0x2928),
+  CHAR_REF("top;", 0x22a4),
+  CHAR_REF("topbot;", 0x2336),
+  CHAR_REF("topcir;", 0x2af1),
+  CHAR_REF("topf;", 0x0001d565),
+  CHAR_REF("topfork;", 0x2ada),
+  CHAR_REF("tosa;", 0x2929),
+  CHAR_REF("tprime;", 0x2034),
+  CHAR_REF("trade;", 0x2122),
+  CHAR_REF("triangle;", 0x25b5),
+  CHAR_REF("triangledown;", 0x25bf),
+  CHAR_REF("triangleleft;", 0x25c3),
+  CHAR_REF("trianglelefteq;", 0x22b4),
+  CHAR_REF("triangleq;", 0x225c),
+  CHAR_REF("triangleright;", 0x25b9),
+  CHAR_REF("trianglerighteq;", 0x22b5),
+  CHAR_REF("tridot;", 0x25ec),
+  CHAR_REF("trie;", 0x225c),
+  CHAR_REF("triminus;", 0x2a3a),
+  CHAR_REF("triplus;", 0x2a39),
+  CHAR_REF("trisb;", 0x29cd),
+  CHAR_REF("tritime;", 0x2a3b),
+  CHAR_REF("trpezium;", 0x23e2),
+  CHAR_REF("tscr;", 0x0001d4c9),
+  CHAR_REF("tscy;", 0x0446),
+  CHAR_REF("tshcy;", 0x045b),
+  CHAR_REF("tstrok;", 0x0167),
+  CHAR_REF("twixt;", 0x226c),
+  CHAR_REF("twoheadleftarrow;", 0x219e),
+  CHAR_REF("twoheadrightarrow;", 0x21a0),
+  CHAR_REF("uArr;", 0x21d1),
+  CHAR_REF("uHar;", 0x2963),
+  CHAR_REF("uacute;", 0xfa),
+  CHAR_REF("uacute", 0xfa),
+  CHAR_REF("uarr;", 0x2191),
+  CHAR_REF("ubrcy;", 0x045e),
+  CHAR_REF("ubreve;", 0x016d),
+  CHAR_REF("ucirc;", 0xfb),
+  CHAR_REF("ucirc", 0xfb),
+  CHAR_REF("ucy;", 0x0443),
+  CHAR_REF("udarr;", 0x21c5),
+  CHAR_REF("udblac;", 0x0171),
+  CHAR_REF("udhar;", 0x296e),
+  CHAR_REF("ufisht;", 0x297e),
+  CHAR_REF("ufr;", 0x0001d532),
+  CHAR_REF("ugrave;", 0xf9),
+  CHAR_REF("ugrave", 0xf9),
+  CHAR_REF("uharl;", 0x21bf),
+  CHAR_REF("uharr;", 0x21be),
+  CHAR_REF("uhblk;", 0x2580),
+  CHAR_REF("ulcorn;", 0x231c),
+  CHAR_REF("ulcorner;", 0x231c),
+  CHAR_REF("ulcrop;", 0x230f),
+  CHAR_REF("ultri;", 0x25f8),
+  CHAR_REF("umacr;", 0x016b),
+  CHAR_REF("uml;", 0xa8),
+  CHAR_REF("uml", 0xa8),
+  CHAR_REF("uogon;", 0x0173),
+  CHAR_REF("uopf;", 0x0001d566),
+  CHAR_REF("uparrow;", 0x2191),
+  CHAR_REF("updownarrow;", 0x2195),
+  CHAR_REF("upharpoonleft;", 0x21bf),
+  CHAR_REF("upharpoonright;", 0x21be),
+  CHAR_REF("uplus;", 0x228e),
+  CHAR_REF("upsi;", 0x03c5),
+  CHAR_REF("upsih;", 0x03d2),
+  CHAR_REF("upsilon;", 0x03c5),
+  CHAR_REF("upuparrows;", 0x21c8),
+  CHAR_REF("urcorn;", 0x231d),
+  CHAR_REF("urcorner;", 0x231d),
+  CHAR_REF("urcrop;", 0x230e),
+  CHAR_REF("uring;", 0x016f),
+  CHAR_REF("urtri;", 0x25f9),
+  CHAR_REF("uscr;", 0x0001d4ca),
+  CHAR_REF("utdot;", 0x22f0),
+  CHAR_REF("utilde;", 0x0169),
+  CHAR_REF("utri;", 0x25b5),
+  CHAR_REF("utrif;", 0x25b4),
+  CHAR_REF("uuarr;", 0x21c8),
+  CHAR_REF("uuml;", 0xfc),
+  CHAR_REF("uuml", 0xfc),
+  CHAR_REF("uwangle;", 0x29a7),
+  CHAR_REF("vArr;", 0x21d5),
+  CHAR_REF("vBar;", 0x2ae8),
+  CHAR_REF("vBarv;", 0x2ae9),
+  CHAR_REF("vDash;", 0x22a8),
+  CHAR_REF("vangrt;", 0x299c),
+  CHAR_REF("varepsilon;", 0x03f5),
+  CHAR_REF("varkappa;", 0x03f0),
+  CHAR_REF("varnothing;", 0x2205),
+  CHAR_REF("varphi;", 0x03d5),
+  CHAR_REF("varpi;", 0x03d6),
+  CHAR_REF("varpropto;", 0x221d),
+  CHAR_REF("varr;", 0x2195),
+  CHAR_REF("varrho;", 0x03f1),
+  CHAR_REF("varsigma;", 0x03c2),
+  MULTI_CHAR_REF("varsubsetneq;", 0x228a, 0xfe00),
+  MULTI_CHAR_REF("varsubsetneqq;", 0x2acb, 0xfe00),
+  MULTI_CHAR_REF("varsupsetneq;", 0x228b, 0xfe00),
+  MULTI_CHAR_REF("varsupsetneqq;", 0x2acc, 0xfe00),
+  CHAR_REF("vartheta;", 0x03d1),
+  CHAR_REF("vartriangleleft;", 0x22b2),
+  CHAR_REF("vartriangleright;", 0x22b3),
+  CHAR_REF("vcy;", 0x0432),
+  CHAR_REF("vdash;", 0x22a2),
+  CHAR_REF("vee;", 0x2228),
+  CHAR_REF("veebar;", 0x22bb),
+  CHAR_REF("veeeq;", 0x225a),
+  CHAR_REF("vellip;", 0x22ee),
+  CHAR_REF("verbar;", 0x7c),
+  CHAR_REF("vert;", 0x7c),
+  CHAR_REF("vfr;", 0x0001d533),
+  CHAR_REF("vltri;", 0x22b2),
+  MULTI_CHAR_REF("vnsub;", 0x2282, 0x20d2),
+  MULTI_CHAR_REF("vnsup;", 0x2283, 0x20d2),
+  CHAR_REF("vopf;", 0x0001d567),
+  CHAR_REF("vprop;", 0x221d),
+  CHAR_REF("vrtri;", 0x22b3),
+  CHAR_REF("vscr;", 0x0001d4cb),
+  MULTI_CHAR_REF("vsubnE;", 0x2acb, 0xfe00),
+  MULTI_CHAR_REF("vsubne;", 0x228a, 0xfe00),
+  MULTI_CHAR_REF("vsupnE;", 0x2acc, 0xfe00),
+  MULTI_CHAR_REF("vsupne;", 0x228b, 0xfe00),
+  CHAR_REF("vzigzag;", 0x299a),
+  CHAR_REF("wcirc;", 0x0175),
+  CHAR_REF("wedbar;", 0x2a5f),
+  CHAR_REF("wedge;", 0x2227),
+  CHAR_REF("wedgeq;", 0x2259),
+  CHAR_REF("weierp;", 0x2118),
+  CHAR_REF("wfr;", 0x0001d534),
+  CHAR_REF("wopf;", 0x0001d568),
+  CHAR_REF("wp;", 0x2118),
+  CHAR_REF("wr;", 0x2240),
+  CHAR_REF("wreath;", 0x2240),
+  CHAR_REF("wscr;", 0x0001d4cc),
+  CHAR_REF("xcap;", 0x22c2),
+  CHAR_REF("xcirc;", 0x25ef),
+  CHAR_REF("xcup;", 0x22c3),
+  CHAR_REF("xdtri;", 0x25bd),
+  CHAR_REF("xfr;", 0x0001d535),
+  CHAR_REF("xhArr;", 0x27fa),
+  CHAR_REF("xharr;", 0x27f7),
+  CHAR_REF("xi;", 0x03be),
+  CHAR_REF("xlArr;", 0x27f8),
+  CHAR_REF("xlarr;", 0x27f5),
+  CHAR_REF("xmap;", 0x27fc),
+  CHAR_REF("xnis;", 0x22fb),
+  CHAR_REF("xodot;", 0x2a00),
+  CHAR_REF("xopf;", 0x0001d569),
+  CHAR_REF("xoplus;", 0x2a01),
+  CHAR_REF("xotime;", 0x2a02),
+  CHAR_REF("xrArr;", 0x27f9),
+  CHAR_REF("xrarr;", 0x27f6),
+  CHAR_REF("xscr;", 0x0001d4cd),
+  CHAR_REF("xsqcup;", 0x2a06),
+  CHAR_REF("xuplus;", 0x2a04),
+  CHAR_REF("xutri;", 0x25b3),
+  CHAR_REF("xvee;", 0x22c1),
+  CHAR_REF("xwedge;", 0x22c0),
+  CHAR_REF("yacute;", 0xfd),
+  CHAR_REF("yacute", 0xfd),
+  CHAR_REF("yacy;", 0x044f),
+  CHAR_REF("ycirc;", 0x0177),
+  CHAR_REF("ycy;", 0x044b),
+  CHAR_REF("yen;", 0xa5),
+  CHAR_REF("yen", 0xa5),
+  CHAR_REF("yfr;", 0x0001d536),
+  CHAR_REF("yicy;", 0x0457),
+  CHAR_REF("yopf;", 0x0001d56a),
+  CHAR_REF("yscr;", 0x0001d4ce),
+  CHAR_REF("yucy;", 0x044e),
+  CHAR_REF("yuml;", 0xff),
+  CHAR_REF("yuml", 0xff),
+  CHAR_REF("zacute;", 0x017a),
+  CHAR_REF("zcaron;", 0x017e),
+  CHAR_REF("zcy;", 0x0437),
+  CHAR_REF("zdot;", 0x017c),
+  CHAR_REF("zeetrf;", 0x2128),
+  CHAR_REF("zeta;", 0x03b6),
+  CHAR_REF("zfr;", 0x0001d537),
+  CHAR_REF("zhcy;", 0x0436),
+  CHAR_REF("zigrarr;", 0x21dd),
+  CHAR_REF("zopf;", 0x0001d56b),
+  CHAR_REF("zscr;", 0x0001d4cf),
+  CHAR_REF("zwj;", 0x200d),
+  CHAR_REF("zwnj;", 0x200c),
+  // Terminator.
+  CHAR_REF("", -1)
+};
+
+// Table of replacement characters.  The spec specifies that any occurrence of
+// the first character should be replaced by the second character, and a parse
+// error recorded.
+typedef struct {
+  // Code point as written in the numeric character reference.
+  int from_char;
+  // Code point that is emitted in its place.
+  int to_char;
+} CharReplacement;
+
+// Replacement table scanned linearly by maybe_replace_codepoint().  The list
+// is terminated by an entry whose from_char is -1.  Some entries map a code
+// point to itself (0x0d, 0x81, 0x8D, 0x8F, 0x90, 0x9D); a hit on one of those
+// still counts as a replacement for the caller, so the reference is reported
+// as invalid even though the emitted code point is unchanged.
+static const CharReplacement kCharReplacements[] = {
+  { 0x00, 0xfffd },
+  { 0x0d, 0x000d },
+  { 0x80, 0x20ac },
+  { 0x81, 0x0081 },
+  { 0x82, 0x201A },
+  { 0x83, 0x0192 },
+  { 0x84, 0x201E },
+  { 0x85, 0x2026 },
+  { 0x86, 0x2020 },
+  { 0x87, 0x2021 },
+  { 0x88, 0x02C6 },
+  { 0x89, 0x2030 },
+  { 0x8A, 0x0160 },
+  { 0x8B, 0x2039 },
+  { 0x8C, 0x0152 },
+  { 0x8D, 0x008D },
+  { 0x8E, 0x017D },
+  { 0x8F, 0x008F },
+  { 0x90, 0x0090 },
+  { 0x91, 0x2018 },
+  { 0x92, 0x2019 },
+  { 0x93, 0x201C },
+  { 0x94, 0x201D },
+  { 0x95, 0x2022 },
+  { 0x96, 0x2013 },
+  { 0x97, 0x2014 },
+  { 0x98, 0x02DC },
+  { 0x99, 0x2122 },
+  { 0x9A, 0x0161 },
+  { 0x9B, 0x203A },
+  { 0x9C, 0x0153 },
+  { 0x9D, 0x009D },
+  { 0x9E, 0x017E },
+  { 0x9F, 0x0178 },
+  // Terminator.
+  { -1, -1 }
+};
+
+// Converts a single character into its digit value.  Decimal digits are
+// always accepted; the letters a-f / A-F are accepted only when allow_hex is
+// true.  Returns -1 when c is not a digit in the requested base.
+static int parse_digit(int c, bool allow_hex) {
+  if ('0' <= c && c <= '9') {
+    return c - '0';
+  }
+  if (!allow_hex) {
+    return -1;
+  }
+  if ('a' <= c && c <= 'f') {
+    return 10 + (c - 'a');
+  }
+  if ('A' <= c && c <= 'F') {
+    return 10 + (c - 'A');
+  }
+  return -1;
+}
+
+static void add_no_digit_error(
+    struct GumboInternalParser* parser, Utf8Iterator* input) {
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  utf8iterator_fill_error_at_mark(input, error);
+  error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS;
+}
+
+static void add_codepoint_error(
+    struct GumboInternalParser* parser, Utf8Iterator* input,
+    GumboErrorType type, int codepoint) {
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  utf8iterator_fill_error_at_mark(input, error);
+  error->type = type;
+  error->v.codepoint = codepoint;
+}
+
+static void add_named_reference_error(
+    struct GumboInternalParser* parser, Utf8Iterator* input,
+    GumboErrorType type, GumboStringPiece text) {
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  utf8iterator_fill_error_at_mark(input, error);
+  error->type = type;
+  error->v.text = text;
+}
+
+// Looks codepoint up in kCharReplacements.  Returns the mapped code point when
+// an entry exists, or -1 when the table has no entry for it.
+static int maybe_replace_codepoint(int codepoint) {
+  const CharReplacement* entry = kCharReplacements;
+  // The table ends with a sentinel whose from_char is -1.
+  while (entry->from_char != -1) {
+    if (entry->from_char == codepoint) {
+      return entry->to_char;
+    }
+    ++entry;
+  }
+  return -1;
+}
+
+// Consumes a numeric character reference such as "&#123;" or "&#x7b;".
+//
+// The function first advances the iterator once (consume_char_ref calls it
+// with the iterator on the '#'), then accepts an optional 'x'/'X' followed by
+// one or more digits and an optional trailing ';'.
+//
+// *output receives:
+//   - kGumboNoChar when no digit follows (the iterator is reset),
+//   - the mapped value when the code point is listed in kCharReplacements,
+//   - 0xfffd for surrogates and for values above 0x10ffff,
+//   - the parsed code point otherwise.
+//
+// Returns true only when the reference was well formed and valid; every case
+// that records a parse error returns false.
+static bool consume_numeric_ref(
+    struct GumboInternalParser* parser, Utf8Iterator* input, int* output) {
+  utf8iterator_next(input);
+  bool is_hex = false;
+  int c = utf8iterator_current(input);
+  if (c == 'x' || c == 'X') {
+    is_hex = true;
+    utf8iterator_next(input);
+    c = utf8iterator_current(input);
+  }
+
+  int digit = parse_digit(c, is_hex);
+  if (digit == -1) {
+    // First digit was invalid; add a parse error and return.
+    add_no_digit_error(parser, input);
+    utf8iterator_reset(input);
+    *output = kGumboNoChar;
+    return false;
+  }
+
+  const int base = is_hex ? 16 : 10;
+  int codepoint = 0;
+  bool status = true;
+  do {
+    // Stop accumulating once the value is past the Unicode range.  All digits
+    // are still consumed, but an arbitrarily long digit run can no longer
+    // overflow the int (signed overflow is undefined behaviour).  The value
+    // stays above 0x10ffff, so it is rejected below exactly as before.
+    if (codepoint <= 0x10ffff) {
+      codepoint = (codepoint * base) + digit;
+    }
+    utf8iterator_next(input);
+    digit = parse_digit(utf8iterator_current(input), is_hex);
+  } while (digit != -1);
+
+  if (utf8iterator_current(input) != ';') {
+    add_codepoint_error(
+        parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, codepoint);
+    status = false;
+  } else {
+    utf8iterator_next(input);
+  }
+
+  int replacement = maybe_replace_codepoint(codepoint);
+  if (replacement != -1) {
+    add_codepoint_error(
+        parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint);
+    *output = replacement;
+    return false;
+  }
+
+  if ((codepoint >= 0xd800 && codepoint <= 0xdfff) || codepoint > 0x10ffff) {
+    add_codepoint_error(
+        parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint);
+    *output = 0xfffd;
+    return false;
+  }
+
+  if (utf8_is_invalid_code_point(codepoint) || codepoint == 0xb) {
+    add_codepoint_error(
+        parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint);
+    status = false;
+    // But return it anyway, per spec.
+  }
+  *output = codepoint;
+  return status;
+}
+
+// Scans kNamedEntities from the top and returns the first entry whose name
+// utf8iterator_maybe_consume_match() accepts at the current input position,
+// or NULL when no entry matches.  Because the first hit wins, table order
+// matters: a semicolon-less legacy name such as "not" is listed after the
+// longer names that start with it ("notin;", "notni;", ...).
+// NOTE(review): the final `true` argument presumably requests case-sensitive
+// matching - confirm against utf8iterator_maybe_consume_match().
+static const NamedCharRef* find_named_char_ref(Utf8Iterator* input) {
+  const NamedCharRef* candidate = kNamedEntities;
+  // The table ends with a sentinel whose first code point is -1.
+  for (; candidate->codepoints.first != -1; ++candidate) {
+    assert(strlen(candidate->name) == candidate->length);
+    if (!utf8iterator_maybe_consume_match(
+        input, candidate->name, candidate->length, true)) {
+      continue;
+    }
+    assert(candidate->name != NULL);
+    assert(candidate->length > 0);
+    assert(candidate->codepoints.first != kGumboNoChar);
+    return candidate;
+  }
+  return NULL;
+}
+
+// Returns true when the character at the iterator's current position is '='
+// or an ASCII letter or digit.  consume_named_ref() uses this to decide
+// whether a semicolon-less named reference inside an attribute value should
+// be left as literal text.
+//
+// The check is done with explicit ASCII ranges rather than isalnum().  The
+// iterator yields full code points (and -1 at end of input), and passing a
+// value that is neither EOF nor representable as unsigned char to isalnum()
+// is undefined behaviour; isalnum() is also locale-dependent.
+static bool is_legal_attribute_char_next(Utf8Iterator* input) {
+  int c = utf8iterator_current(input);
+  return c == '=' ||
+         (c >= '0' && c <= '9') ||
+         (c >= 'a' && c <= 'z') ||
+         (c >= 'A' && c <= 'Z');
+}
+
+// Called when no known entity name matched.  Skips over a run of ASCII
+// letters and digits; if that run is immediately followed by ';' the text
+// looks like a reference to an unknown entity, so a
+// GUMBO_ERR_NAMED_CHAR_REF_INVALID error is recorded and false is returned.
+// Otherwise returns true without recording anything.
+static bool maybe_add_invalid_named_reference(
+    struct GumboInternalParser* parser, Utf8Iterator* input) {
+  // The iterator will always be reset in this code path, so we don't need to
+  // worry about consuming characters.
+  const char* start = utf8iterator_get_char_pointer(input);
+  int c;
+  for (;;) {
+    c = utf8iterator_current(input);
+    const bool is_ascii_alnum = (c >= 'a' && c <= 'z') ||
+                                (c >= 'A' && c <= 'Z') ||
+                                (c >= '0' && c <= '9');
+    if (!is_ascii_alnum) {
+      break;
+    }
+    utf8iterator_next(input);
+  }
+  if (c != ';') {
+    return true;
+  }
+  // The reported text covers the name only; the ';' itself is not included.
+  GumboStringPiece bad_ref;
+  bad_ref.data = start;
+  bad_ref.length = utf8iterator_get_char_pointer(input) - start;
+  add_named_reference_error(
+      parser, input, GUMBO_ERR_NAMED_CHAR_REF_INVALID, bad_ref);
+  return false;
+}
+
+// Consumes a named character reference (e.g. "amp;") at the current input
+// position.  *output must arrive with first == kGumboNoChar.
+//
+// Outcomes:
+//   - known name ending in ';': *output gets its code points, returns true.
+//   - known name without ';', inside an attribute, followed by '=' or an
+//     alphanumeric character: the iterator is reset, *output is left
+//     untouched, returns true.
+//   - known name without ';' otherwise: a
+//     GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON error is recorded, *output
+//     gets the code points, returns false.
+//   - no known name: the iterator is reset and the result of
+//     maybe_add_invalid_named_reference() is returned.
+static bool consume_named_ref(
+    struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
+    OneOrTwoCodepoints* output) {
+  assert(output->first == kGumboNoChar);
+  const NamedCharRef* char_ref = find_named_char_ref(input);
+  if (!char_ref) {
+    bool status = maybe_add_invalid_named_reference(parser, input);
+    utf8iterator_reset(input);
+    return status;
+  }
+
+  assert(char_ref->length == strlen(char_ref->name));
+  const bool has_semicolon = char_ref->name[char_ref->length - 1] == ';';
+  if (has_semicolon) {
+    *output = char_ref->codepoints;
+    assert(output->first != kGumboNoChar);
+    return true;
+  }
+
+  if (is_in_attribute && is_legal_attribute_char_next(input)) {
+    utf8iterator_reset(input);
+    return true;
+  }
+
+  // Legacy semicolon-less name: still expanded, but flagged as a parse error.
+  GumboStringPiece bad_ref;
+  bad_ref.data = char_ref->name;
+  bad_ref.length = char_ref->length;
+  add_named_reference_error(
+      parser, input, GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, bad_ref);
+  *output = char_ref->codepoints;
+  assert(output->first != kGumboNoChar);
+  return false;
+}
+
+// Entry point for character reference handling.  Marks the current input
+// position, advances once (the caller leaves the iterator on the '&'), and
+// inspects the next character:
+//   - the additional allowed character, tab, LF, FF, space, '<', '&' or -1
+//     (end of input): not a reference; the iterator is reset to the mark,
+//     both output fields are kGumboNoChar, and true is returned.
+//   - '#': handled by consume_numeric_ref(), which fills output->first.
+//   - anything else: handled by consume_named_ref().
+bool consume_char_ref(
+    struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
+    int additional_allowed_char, bool is_in_attribute,
+    OneOrTwoCodepoints* output) {
+  utf8iterator_mark(input);
+  utf8iterator_next(input);
+  const int next_char = utf8iterator_current(input);
+  output->first = kGumboNoChar;
+  output->second = kGumboNoChar;
+
+  // The additional allowed character takes precedence over every other case,
+  // including '#'.
+  const bool not_a_reference =
+      next_char == additional_allowed_char ||
+      next_char == '\t' || next_char == '\n' || next_char == '\f' ||
+      next_char == ' ' || next_char == '<' || next_char == '&' ||
+      next_char == -1;
+  if (not_a_reference) {
+    utf8iterator_reset(input);
+    return true;
+  }
+  if (next_char == '#') {
+    return consume_numeric_ref(parser, input, &output->first);
+  }
+  return consume_named_ref(parser, input, is_in_attribute, output);
+}

+ 61 - 0
gumbo/char_ref.h

@@ -0,0 +1,61 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// Internal header for character reference handling; this should not be exposed
+// transitively by any public API header.  This is why the functions aren't
+// namespaced.
+
+#ifndef GUMBO_CHAR_REF_H_
+#define GUMBO_CHAR_REF_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParser;
+struct GumboInternalUtf8Iterator;
+
+// Value that indicates no character was produced.
+extern const int kGumboNoChar;
+
+// Certain named character references generate two codepoints, not one, and so
+// the consume_char_ref subroutine needs to return this instead of an int.  The
+// first field will be kGumboNoChar if no character reference was found; the
+// second field will be kGumboNoChar if that is the case or if the character
+// reference returns only a single codepoint.
+typedef struct {
+  // First (or only) code point, or kGumboNoChar.
+  int first;
+  // Second code point of a two-code-point reference, or kGumboNoChar.
+  int second;
+} OneOrTwoCodepoints;
+
+// Implements the "consume a character reference" section of the spec.
+// This reads in characters from the input as necessary, and fills in a
+// OneOrTwoCodepoints struct containing the characters read.  It may add parse
+// errors to the GumboParser's errors vector, if the spec calls for it.  Pass a
+// space for the "additional allowed char" when the spec says "with no
+// additional allowed char".  Returns false on parse error, true otherwise.
+//
+//   parser                   parser that receives any parse errors.
+//   input                    iterator to read the reference from.
+//   additional_allowed_char  if the next character equals this value, no
+//                            reference is produced and true is returned.
+//   is_in_attribute          when true, a named reference without a trailing
+//                            ';' that is followed by '=' or an alphanumeric
+//                            character is left unexpanded.
+//   output                   receives the resulting code point(s); both
+//                            fields are kGumboNoChar when nothing was
+//                            produced.
+bool consume_char_ref(
+    struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
+    int additional_allowed_char, bool is_in_attribute,
+    OneOrTwoCodepoints* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_CHAR_REF_H_

+ 258 - 0
gumbo/error.c

@@ -0,0 +1,258 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "error.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "gumbo.h"
+#include "parser.h"
+#include "string_buffer.h"
+#include "util.h"
+#include "vector.h"
+
+// NOTE(review): this constant is not referenced anywhere else in this file;
+// confirm whether it is still needed.
+static const size_t kMessageBufferSize = 256;
+
+// Appends a printf-style formatted message to a GumboStringBuffer, asking the
+// buffer to grow when the message does not fit in the remaining capacity.
+// Returns the number of bytes appended (not counting the NUL that vsnprintf
+// writes), or 0 if vsnprintf reports an error, in which case output->length is
+// left unchanged.
+static int print_message(GumboParser* parser, GumboStringBuffer* output,
+                         const char* format, ...) {
+  va_list args;
+  int remaining_capacity = output->capacity - output->length;
+  va_start(args, format);
+  int bytes_written = vsnprintf(output->data + output->length,
+                                remaining_capacity, format, args);
+  va_end(args);
+  if (bytes_written < 0) {
+    return 0;
+  }
+  // vsnprintf returns the length the complete message needs, excluding the
+  // terminating NUL, so a result equal to the capacity was truncated as well.
+  if (bytes_written >= remaining_capacity) {
+    gumbo_string_buffer_reserve(
+        parser, output->capacity + bytes_written, output);
+    remaining_capacity = output->capacity - output->length;
+    // A va_list is indeterminate once vsnprintf has consumed it, so argument
+    // traversal must be restarted before formatting a second time.
+    va_start(args, format);
+    bytes_written = vsnprintf(output->data + output->length,
+                              remaining_capacity, format, args);
+    va_end(args);
+    if (bytes_written < 0) {
+      return 0;
+    }
+  }
+  output->length += bytes_written;
+  return bytes_written;
+}
+
+// Appends "  Currently open tags: ", the comma-separated normalized names of
+// the tags recorded in error->tag_stack, and a closing '.' to output.
+static void print_tag_stack(
+    GumboParser* parser, const GumboParserError* error,
+    GumboStringBuffer* output) {
+  print_message(parser, output, "  Currently open tags: ");
+  // tag_stack.length is unsigned, so the index is unsigned too.
+  for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
+    if (i) {
+      print_message(parser, output, ", ");
+    }
+    // The vector holds GumboTag values directly in its void* slots.
+    GumboTag tag = (GumboTag) error->tag_stack.data[i];
+    // Pass the name as an argument, never as the format string, so that no
+    // character in a tag name can be interpreted as a conversion.
+    print_message(parser, output, "%s", gumbo_normalized_tagname(tag));
+  }
+  gumbo_string_buffer_append_codepoint(parser, '.', output);
+}
+
+static void handle_parser_error(GumboParser* parser,
+                                const GumboParserError* error,
+                                GumboStringBuffer* output) {
+  if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
+      error->input_type != GUMBO_TOKEN_DOCTYPE) {
+    print_message(parser, output,
+                  "The doctype must be the first token in the document");
+    return;
+  }
+
+  switch (error->input_type) {
+    case GUMBO_TOKEN_DOCTYPE:
+      print_message(parser, output, "This is not a legal doctype");
+      return;
+    case GUMBO_TOKEN_COMMENT:
+      // Should never happen; comments are always legal.
+      assert(0);
+      // But just in case...
+      print_message(parser, output, "Comments aren't legal here");
+      return;
+    case GUMBO_TOKEN_WHITESPACE:
+    case GUMBO_TOKEN_CHARACTER:
+      print_message(parser, output, "Character tokens aren't legal here");
+      return;
+    case GUMBO_TOKEN_NULL:
+      print_message(parser, output, "Null bytes are not allowed in HTML5");
+      return;
+    case GUMBO_TOKEN_EOF:
+      if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
+        print_message(parser, output, "You must provide a doctype");
+      } else {
+        print_message(parser, output, "Premature end of file");
+        print_tag_stack(parser, error, output);
+      }
+      return;
+    case GUMBO_TOKEN_START_TAG:
+    case GUMBO_TOKEN_END_TAG:
+      print_message(parser, output, "That tag isn't allowed here");
+      print_tag_stack(parser, error, output);
+      // TODO(jdtang): Give more specific messaging.
+      return;
+  }
+}
+
+// Finds the start of the line that contains error_location.  Returns a pointer
+// to the character right after the closest newline that lies strictly before
+// error_location, or original_text if there is no such newline.  A newline
+// located exactly at error_location is treated as the end of its own line, so
+// the result is never greater than error_location.
+static const char* find_last_newline(
+    const char* original_text, const char* error_location) {
+  assert(error_location >= original_text);
+  const char* c = error_location;
+  // Inspect the byte before the cursor: this keeps the cursor at or before
+  // error_location and stops right after the previous newline.
+  while (c != original_text && c[-1] != '\n') {
+    --c;
+    // Only the error location itself may be a nul byte (an error at EOF).
+    assert(*c);
+  }
+  return c;
+}
+
+// Scans forward from error_location and returns a pointer to the first newline
+// found, or to the terminating nul byte when the rest of the text contains no
+// newline.  original_text is accepted for symmetry with find_last_newline but
+// is not consulted.
+static const char* find_next_newline(
+    const char* original_text, const char* error_location) {
+  (void) original_text;
+  const char* cursor = error_location;
+  while (*cursor != '\0' && *cursor != '\n') {
+    ++cursor;
+  }
+  return cursor;
+}
+
+// Allocates a new GumboError, appends it to the parser's output errors vector
+// and returns it so the caller can fill in its fields.  Returns NULL, without
+// allocating, once the number of recorded errors has reached the max_errors
+// option.  A negative max_errors never triggers the limit.
+// NOTE(review): presumably a negative value is the documented way to disable
+// the limit; confirm against the GumboOptions definition.
+GumboError* gumbo_add_error(GumboParser* parser) {
+  int max_errors = parser->_options->max_errors;
+  // The limit only applies to non-negative values; the cast is safe because
+  // max_errors is known to be >= 0 at that point and avoids a signed/unsigned
+  // comparison against the vector length.
+  if (max_errors >= 0 &&
+      parser->_output->errors.length >= (unsigned int) max_errors) {
+    return NULL;
+  }
+  GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
+  gumbo_vector_add(parser, error, &parser->_output->errors);
+  return error;
+}
+
+void gumbo_error_to_string(
+    GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
+  print_message(parser, output, "@%d:%d: ",
+                error->position.line, error->position.column);
+  switch (error->type) {
+    case GUMBO_ERR_UTF8_INVALID:
+      print_message(parser, output, "Invalid UTF8 character 0x%x",
+               error->v.codepoint);
+      break;
+    case GUMBO_ERR_UTF8_TRUNCATED:
+      print_message(parser, output,
+               "Input stream ends with a truncated UTF8 character 0x%x",
+               error->v.codepoint);
+      break;
+    case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
+      print_message(parser, output,
+               "No digits after &# in numeric character reference");
+      break;
+    case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
+      print_message(parser, output,
+               "The numeric character reference &#%d should be followed "
+               "by a semicolon", error->v.codepoint);
+      break;
+    case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
+      print_message(parser, output,
+               "The numeric character reference &#%d; encodes an invalid "
+               "unicode codepoint", error->v.codepoint);
+      break;
+    case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
+      // The textual data came from one of the literal strings in the table, and
+      // so it'll be null-terminated.
+      print_message(parser, output,
+               "The named character reference &%.*s should be followed by a "
+               "semicolon", (int) error->v.text.length, error->v.text.data);
+      break;
+    case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
+      print_message(parser, output,
+               "The named character reference &%.*s; is not a valid entity name",
+               (int) error->v.text.length, error->v.text.data);
+      break;
+    case GUMBO_ERR_DUPLICATE_ATTR:
+      print_message(parser, output,
+               "Attribute %s occurs multiple times, at positions %d and %d",
+               error->v.duplicate_attr.name,
+               error->v.duplicate_attr.original_index,
+               error->v.duplicate_attr.new_index);
+      break;
+    case GUMBO_ERR_PARSER:
+    case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
+      handle_parser_error(parser, &error->v.parser, output);
+      break;
+    default:
+      print_message(parser, output,
+               "Tokenizer error with an unimplemented error message");
+      break;
+  }
+  gumbo_string_buffer_append_codepoint(parser, '.', output);
+}
+
+void gumbo_caret_diagnostic_to_string(
+    GumboParser* parser, const GumboError* error,
+    const char* source_text, GumboStringBuffer* output) {
+  gumbo_error_to_string(parser, error, output);
+
+  const char* line_start =
+      find_last_newline(source_text, error->original_text);
+  const char* line_end =
+      find_next_newline(source_text, error->original_text);
+  GumboStringPiece original_line;
+  original_line.data = line_start;
+  original_line.length = line_end - line_start;
+
+  gumbo_string_buffer_append_codepoint(parser, '\n', output);
+  gumbo_string_buffer_append_string(parser, &original_line, output);
+  gumbo_string_buffer_append_codepoint(parser, '\n', output);
+  gumbo_string_buffer_reserve(
+      parser, output->length + error->position.column, output);
+  int num_spaces = error->position.column - 1;
+  memset(output->data + output->length, ' ', num_spaces);
+  output->length += num_spaces;
+  gumbo_string_buffer_append_codepoint(parser, '^', output);
+  gumbo_string_buffer_append_codepoint(parser, '\n', output);
+}
+
+void gumbo_print_caret_diagnostic(
+    GumboParser* parser, const GumboError* error, const char* source_text) {
+  GumboStringBuffer text;
+  gumbo_string_buffer_init(parser, &text);
+  gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
+  printf("%.*s", (int) text.length, text.data);
+  gumbo_string_buffer_destroy(parser, &text);
+}
+
+void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
+  if (error->type == GUMBO_ERR_PARSER ||
+      error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
+    gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
+  } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
+    gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
+  }
+  gumbo_parser_deallocate(parser, error);
+}
+
+// Initializes the errors vector of the parser's output, passing 5 as the
+// size argument of gumbo_vector_init.
+void gumbo_init_errors(GumboParser* parser) {
+  GumboVector* errors = &parser->_output->errors;
+  gumbo_vector_init(parser, 5, errors);
+}
+
+void gumbo_destroy_errors(GumboParser* parser) {
+  for (int i = 0; i < parser->_output->errors.length; ++i) {
+    gumbo_error_destroy(parser, parser->_output->errors.data[i]);
+  }
+  gumbo_vector_destroy(parser, &parser->_output->errors);
+}

+ 225 - 0
gumbo/error.h

@@ -0,0 +1,225 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// Error types, enums, and handling functions.
+
+#ifndef GUMBO_ERROR_H_
+#define GUMBO_ERROR_H_
+
+#include <stdint.h>
+
+#include "gumbo.h"
+#include "insertion_mode.h"
+#include "string_buffer.h"
+#include "token_type.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParser;
+
+// The kind of error recorded in a GumboError.  The value is stored in
+// GumboError.type and determines which member of the GumboError.v union holds
+// the type-specific data.
+typedef enum {
+  GUMBO_ERR_UTF8_INVALID,
+  GUMBO_ERR_UTF8_TRUNCATED,
+  GUMBO_ERR_UTF8_NULL,
+  GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
+  GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
+  GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
+  GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
+  GUMBO_ERR_NAMED_CHAR_REF_INVALID,
+  GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
+  GUMBO_ERR_TAG_EOF,
+  GUMBO_ERR_TAG_INVALID,
+  GUMBO_ERR_CLOSE_TAG_EMPTY,
+  GUMBO_ERR_CLOSE_TAG_EOF,
+  GUMBO_ERR_CLOSE_TAG_INVALID,
+  GUMBO_ERR_SCRIPT_EOF,
+  GUMBO_ERR_ATTR_NAME_EOF,
+  GUMBO_ERR_ATTR_NAME_INVALID,
+  GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
+  GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
+  GUMBO_ERR_ATTR_UNQUOTED_EOF,
+  GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
+  GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
+  GUMBO_ERR_ATTR_AFTER_EOF,
+  GUMBO_ERR_ATTR_AFTER_INVALID,
+  GUMBO_ERR_DUPLICATE_ATTR,
+  GUMBO_ERR_SOLIDUS_EOF,
+  GUMBO_ERR_SOLIDUS_INVALID,
+  GUMBO_ERR_DASHES_OR_DOCTYPE,
+  GUMBO_ERR_COMMENT_EOF,
+  GUMBO_ERR_COMMENT_INVALID,
+  GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
+  GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
+  GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
+  GUMBO_ERR_COMMENT_END_BANG_EOF,
+  GUMBO_ERR_DOCTYPE_EOF,
+  GUMBO_ERR_DOCTYPE_INVALID,
+  GUMBO_ERR_DOCTYPE_SPACE,
+  GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
+  GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
+  GUMBO_ERR_DOCTYPE_END,
+  GUMBO_ERR_PARSER,
+  GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
+} GumboErrorType;
+
+// Additional data for duplicated attributes.
+typedef struct GumboInternalDuplicateAttrError {
+  // The name of the attribute.  Owned by this struct.
+  const char* name;
+
+  // The (0-based) index within the attributes vector of the original
+  // occurrence.
+  unsigned int original_index;
+
+  // The (0-based) index where the new occurrence would be.
+  unsigned int new_index;
+} GumboDuplicateAttrError;
+
+// A simplified representation of the tokenizer state, designed to be more
+// useful to clients of this library than the internal representation.  This
+// condenses the actual states used in the tokenizer state machine into a few
+// values that will be familiar to users of HTML.
+typedef enum {
+  GUMBO_ERR_TOKENIZER_DATA,
+  GUMBO_ERR_TOKENIZER_CHAR_REF,
+  GUMBO_ERR_TOKENIZER_RCDATA,
+  GUMBO_ERR_TOKENIZER_RAWTEXT,
+  GUMBO_ERR_TOKENIZER_PLAINTEXT,
+  GUMBO_ERR_TOKENIZER_SCRIPT,
+  GUMBO_ERR_TOKENIZER_TAG,
+  GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
+  GUMBO_ERR_TOKENIZER_ATTR_NAME,
+  GUMBO_ERR_TOKENIZER_ATTR_VALUE,
+  GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
+  GUMBO_ERR_TOKENIZER_COMMENT,
+  GUMBO_ERR_TOKENIZER_DOCTYPE,
+  GUMBO_ERR_TOKENIZER_CDATA,
+} GumboTokenizerErrorState;
+
+// Additional data for tokenizer errors.
+// This records the current state and codepoint encountered - this is usually
+// enough to reconstruct what went wrong and provide a friendly error message.
+typedef struct GumboInternalTokenizerError {
+  // The bad codepoint encountered.
+  int codepoint;
+
+  // The state that the tokenizer was in at the time.
+  GumboTokenizerErrorState state;
+} GumboTokenizerError;
+
+// Additional data for parse errors.
+typedef struct GumboInternalParserError {
+  // The type of input token that resulted in this error.
+  GumboTokenType input_type;
+
+  // The HTML tag of the input token.  TAG_UNKNOWN if this was not a tag token.
+  GumboTag input_tag;
+
+  // The insertion mode that the parser was in at the time.
+  GumboInsertionMode parser_state;
+
+  // The tag stack at the point of the error.  Note that this is a GumboVector
+  // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
+  // get at the tag.
+  GumboVector /* GumboTag */ tag_stack;
+} GumboParserError;
+
+// The overall error struct representing an error in decoding/tokenizing/parsing
+// the HTML.  This contains an enumerated type flag, a source position, and then
+// a union of fields containing data specific to the error.
+typedef struct GumboInternalError {
+  // The type of error.
+  GumboErrorType type;
+
+  // The position within the source file where the error occurred.
+  GumboSourcePosition position;
+
+  // A pointer to the byte within the original source file text where the error
+  // occurred (note that this is not the same as position.offset, as that gives
+  // character-based instead of byte-based offsets).
+  // NOTE(review): the GumboSourcePosition comment in gumbo.h describes offsets
+  // as byte-based, which disagrees with the parenthetical above - confirm
+  // which of the two is correct.
+  const char* original_text;
+
+  // Type-specific error information.
+  union {
+    // The code point we encountered, for:
+    // * GUMBO_ERR_UTF8_INVALID
+    // * GUMBO_ERR_UTF8_TRUNCATED
+    // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
+    // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
+    uint64_t codepoint;
+
+    // Tokenizer errors.
+    GumboTokenizerError tokenizer;
+
+    // Short textual data, for:
+    // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
+    // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
+    GumboStringPiece text;
+
+    // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
+    GumboDuplicateAttrError duplicate_attr;
+
+    // Parser state, for GUMBO_ERR_PARSER and
+    // GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
+    struct GumboInternalParserError parser;
+  } v;
+} GumboError;
+
+// Adds a new error to the parser's error list, and returns a pointer to it so
+// that clients can fill out the rest of its fields.  May return NULL if we're
+// already over the max_errors field specified in GumboOptions.
+GumboError* gumbo_add_error(struct GumboInternalParser* parser);
+
+// Initializes the errors vector in the parser's output.  Note that the
+// parameter, although named 'errors', is the parser itself.
+void gumbo_init_errors(struct GumboInternalParser* errors);
+
+// Frees all the errors in the errors vector of the parser's output, and then
+// the vector itself.  The parameter, although named 'errors', is the parser.
+void gumbo_destroy_errors(struct GumboInternalParser* errors);
+
+// Frees the memory used for a single GumboError.
+void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
+
+// Prints an error to a string.  This fills an empty GumboStringBuffer with a
+// freshly-allocated buffer containing the error message text.  The caller is
+// responsible for deleting the buffer.  (Note that the buffer is allocated with
+// the allocator specified in the GumboParser config and hence should be freed
+// by gumbo_parser_deallocate().)
+void gumbo_error_to_string(
+    struct GumboInternalParser* parser, const GumboError* error,
+    GumboStringBuffer* output);
+
+// Prints a caret diagnostic to a string.  This fills an empty GumboStringBuffer
+// with a freshly-allocated buffer containing the error message text.  The
+// caller is responsible for deleting the buffer.  (Note that the buffer is
+// allocated with the allocator specified in the GumboParser config and hence
+// should be freed by gumbo_parser_deallocate().)
+void gumbo_caret_diagnostic_to_string(
+    struct GumboInternalParser* parser, const GumboError* error,
+    const char* source_text, GumboStringBuffer* output);
+
+// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
+// of writing to a string.
+void gumbo_print_caret_diagnostic(
+    struct GumboInternalParser* parser, const GumboError* error,
+    const char* source_text);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_ERROR_H_

+ 97 - 0
gumbo/gumbo.cbp

@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+	<FileVersion major="1" minor="6" />
+	<Project>
+		<Option title="gumbo" />
+		<Option pch_mode="2" />
+		<Option compiler="gcc" />
+		<Build>
+			<Target title="Debug">
+				<Option output="libgumbo" prefix_auto="1" extension_auto="1" />
+				<Option working_dir="" />
+				<Option object_output="obj/Debug/" />
+				<Option type="2" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Compiler>
+					<Add option="-Wall" />
+					<Add option="-g" />
+				</Compiler>
+			</Target>
+			<Target title="Release">
+				<Option output="libgumbo" prefix_auto="1" extension_auto="1" />
+				<Option working_dir="" />
+				<Option object_output="obj/Release/" />
+				<Option type="2" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Compiler>
+					<Add option="-O2" />
+					<Add option="-Wall" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+				</Linker>
+			</Target>
+		</Build>
+		<Compiler>
+			<Add option="-std=gnu99" />
+		</Compiler>
+		<Unit filename="attribute.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="attribute.h" />
+		<Unit filename="char_ref.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="char_ref.h" />
+		<Unit filename="error.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="error.h" />
+		<Unit filename="gumbo.h" />
+		<Unit filename="insertion_mode.h" />
+		<Unit filename="main.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="parser.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="parser.h" />
+		<Unit filename="string_buffer.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="string_buffer.h" />
+		<Unit filename="string_piece.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="string_piece.h" />
+		<Unit filename="tag.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="token_type.h" />
+		<Unit filename="tokenizer.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="tokenizer.h" />
+		<Unit filename="tokenizer_states.h" />
+		<Unit filename="utf8.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="utf8.h" />
+		<Unit filename="util.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="util.h" />
+		<Unit filename="vector.c">
+			<Option compilerVar="CC" />
+		</Unit>
+		<Unit filename="vector.h" />
+		<Extensions>
+			<code_completion />
+			<envvars />
+			<lib_finder disable_auto="1" />
+			<debugger />
+		</Extensions>
+	</Project>
+</CodeBlocks_project_file>

+ 174 - 0
gumbo/gumbo.depend

@@ -0,0 +1,174 @@
+# depslib dependency file v1.0
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/attribute.c
+	"attribute.h"
+	<assert.h>
+	<stdlib.h>
+	<string.h>
+	<strings.h>
+	"util.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/attribute.h
+	"gumbo.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/gumbo.h
+	<stdbool.h>
+	<stddef.h>
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/util.h
+	<stdbool.h>
+	<stddef.h>
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/char_ref.c
+	"char_ref.h"
+	<assert.h>
+	<ctype.h>
+	<stddef.h>
+	<string.h>
+	"error.h"
+	"string_piece.h"
+	"utf8.h"
+	"util.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/char_ref.h
+	<stdbool.h>
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/error.h
+	<stdint.h>
+	"gumbo.h"
+	"insertion_mode.h"
+	"string_buffer.h"
+	"token_type.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/insertion_mode.h
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/string_buffer.h
+	<stdbool.h>
+	<stddef.h>
+	"gumbo.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/token_type.h
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/string_piece.h
+	"gumbo.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/utf8.h
+	<stdbool.h>
+	<stddef.h>
+	"gumbo.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/error.c
+	"error.h"
+	<assert.h>
+	<stdarg.h>
+	<stdio.h>
+	<string.h>
+	"gumbo.h"
+	"parser.h"
+	"string_buffer.h"
+	"util.h"
+	"vector.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/parser.h
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/vector.h
+	"gumbo.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/parser.c
+	<assert.h>
+	<ctype.h>
+	<stdarg.h>
+	<stdlib.h>
+	<string.h>
+	<strings.h>
+	"attribute.h"
+	"error.h"
+	"gumbo.h"
+	"insertion_mode.h"
+	"parser.h"
+	"tokenizer.h"
+	"tokenizer_states.h"
+	"utf8.h"
+	"util.h"
+	"vector.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/tokenizer.h
+	<stdbool.h>
+	<stddef.h>
+	"gumbo.h"
+	"token_type.h"
+	"tokenizer_states.h"
+
+1380152090 /home/mingo/dev/SquiLu/gumbo/tokenizer_states.h
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/string_buffer.c
+	"string_buffer.h"
+	<assert.h>
+	<stdlib.h>
+	<string.h>
+	<strings.h>
+	"string_piece.h"
+	"util.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/string_piece.c
+	"string_piece.h"
+	<assert.h>
+	<stdlib.h>
+	<string.h>
+	<strings.h>
+	"util.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/tag.c
+	"gumbo.h"
+	<assert.h>
+	<ctype.h>
+	<strings.h>
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/tokenizer.c
+	"tokenizer.h"
+	<assert.h>
+	<stdbool.h>
+	<string.h>
+	"attribute.h"
+	"char_ref.h"
+	"error.h"
+	"gumbo.h"
+	"parser.h"
+	"string_buffer.h"
+	"string_piece.h"
+	"token_type.h"
+	"tokenizer_states.h"
+	"utf8.h"
+	"util.h"
+	"vector.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/utf8.c
+	"utf8.h"
+	<assert.h>
+	<stdint.h>
+	<string.h>
+	<strings.h>
+	"error.h"
+	"gumbo.h"
+	"parser.h"
+	"util.h"
+	"vector.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/util.c
+	"util.h"
+	<assert.h>
+	<stdlib.h>
+	<string.h>
+	<strings.h>
+	<stdarg.h>
+	<stdio.h>
+	"gumbo.h"
+	"parser.h"
+
+1380152090 source:/home/mingo/dev/SquiLu/gumbo/vector.c
+	"vector.h"
+	<assert.h>
+	<stdlib.h>
+	<string.h>
+	<strings.h>
+	"util.h"
+

+ 802 - 0
gumbo/gumbo.h

@@ -0,0 +1,802 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
+// GUMBO_ as a prefix for enum constants (static constants get the Google-style
+// kGumbo prefix).
+
+/**
+ * @file
+ * @mainpage Gumbo HTML Parser
+ *
+ * This provides a conformant, no-dependencies implementation of the HTML5
+ * parsing algorithm.  It supports only UTF8; if you need to parse a different
+ * encoding, run a preprocessing step to convert to UTF8.  It returns a parse
+ * tree made of the structs in this file.
+ *
+ * Example:
+ * @code
+ *    GumboOutput* output = gumbo_parse(input);
+ *    do_something_with_doctype(output->document);
+ *    do_something_with_html_tree(output->root);
+ *    gumbo_destroy_output(&kGumboDefaultOptions, output);
+ * @endcode
+ * HTML5 Spec:
+ *
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
+ */
+
+#ifndef GUMBO_GUMBO_H_
+#define GUMBO_GUMBO_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * A struct representing a character position within the original text buffer.
+ * Line and column numbers are 1-based and offsets are 0-based, which matches
+ * how most editors and command-line tools work.  Also, columns measure
+ * positions in terms of characters while offsets measure by bytes; this is
+ * because the offset field is often used to pull out a particular region of
+ * text (which in most languages that bind to C implies pointer arithmetic on a
+ * buffer of bytes), while the column field is often used to reference a
+ * particular column on a printable display, which nowadays is usually UTF-8.
+ */
+typedef struct {
+  /** Line number, 1-based. */
+  unsigned int line;
+  /** Column number, 1-based, measured in characters. */
+  unsigned int column;
+  /** Offset, 0-based, measured in bytes. */
+  unsigned int offset;
+} GumboSourcePosition;
+
+/**
+ * A SourcePosition used for elements that have no source position, i.e.
+ * parser-inserted elements.
+ */
+extern const GumboSourcePosition kGumboEmptySourcePosition;
+
+
+/**
+ * A struct representing a string or part of a string.  Strings within the
+ * parser are represented by a char* and a length; the char* points into
+ * an existing data buffer owned by some other code (often the original input).
+ * GumboStringPieces are assumed (by convention) to be immutable, because they
+ * may share data.  Use GumboStringBuffer if you need to construct a string.
+ * Clients should assume that it is not NUL-terminated, and should always use
+ * explicit lengths when manipulating them.
+ */
+typedef struct {
+  /** A pointer to the beginning of the string.  NULL iff length == 0. */
+  const char* data;
+
+  /** The length of the string fragment, in bytes.  May be zero. */
+  size_t length;
+} GumboStringPiece;
+
+/** A constant to represent a 0-length null string. */
+extern const GumboStringPiece kGumboEmptyString;
+
+/**
+ * Compares two GumboStringPieces, and returns true if they're equal or false
+ * otherwise.
+ */
+bool gumbo_string_equals(
+    const GumboStringPiece* str1, const GumboStringPiece* str2);
+
+/**
+ * Compares two GumboStringPieces ignoring case, and returns true if they're
+ * equal or false otherwise.
+ */
+bool gumbo_string_equals_ignore_case(
+    const GumboStringPiece* str1, const GumboStringPiece* str2);
+
+
+/**
+ * A simple vector implementation.  This stores a pointer to a data array and a
+ * length.  All elements are stored as void*; client code must cast to the
+ * appropriate type.  Overflows upon addition result in reallocation of the data
+ * array, with the size doubling to maintain O(1) amortized cost.  There is no
+ * removal function, as this isn't needed for any of the operations within this
+ * library.  Iteration can be done through inspecting the structure directly in
+ * a for-loop.
+ */
+typedef struct {
+  /** Data elements.  This points to a dynamically-allocated array of capacity
+   * elements, each a void* to the element itself.
+   */
+  void** data;
+
+  /** Number of elements currently in the vector. */
+  unsigned int length;
+
+  /** Current array capacity. */
+  unsigned int capacity;
+} GumboVector;
+
+/** An empty (0-length, 0-capacity) GumboVector. */
+extern const GumboVector kGumboEmptyVector;
+
+/**
+ * Returns the first index at which an element appears in this vector (testing
+ * by pointer equality), or -1 if it never does.
+ */
+int gumbo_vector_index_of(GumboVector* vector, void* element);
+
+
+/**
+ * An enum for all the tags defined in the HTML5 standard.  These correspond to
+ * the tag names themselves.  Enum constants exist only for tags which appear in
+ * the spec itself (or for tags with special handling in the SVG and MathML
+ * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
+ * name can be obtained through original_tag.
+ *
+ * This is mostly for API convenience, so that clients of this library don't
+ * need to perform a strcasecmp to find the normalized tag name.  It also has
+ * efficiency benefits, by letting the parser work with enums instead of
+ * strings.
+ */
+typedef enum {
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
+  GUMBO_TAG_HTML,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
+  GUMBO_TAG_HEAD,
+  GUMBO_TAG_TITLE,
+  GUMBO_TAG_BASE,
+  GUMBO_TAG_LINK,
+  GUMBO_TAG_META,
+  GUMBO_TAG_STYLE,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
+  GUMBO_TAG_SCRIPT,
+  GUMBO_TAG_NOSCRIPT,
+  GUMBO_TAG_TEMPLATE,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
+  GUMBO_TAG_BODY,
+  GUMBO_TAG_ARTICLE,
+  GUMBO_TAG_SECTION,
+  GUMBO_TAG_NAV,
+  GUMBO_TAG_ASIDE,
+  GUMBO_TAG_H1,
+  GUMBO_TAG_H2,
+  GUMBO_TAG_H3,
+  GUMBO_TAG_H4,
+  GUMBO_TAG_H5,
+  GUMBO_TAG_H6,
+  GUMBO_TAG_HGROUP,
+  GUMBO_TAG_HEADER,
+  GUMBO_TAG_FOOTER,
+  GUMBO_TAG_ADDRESS,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
+  GUMBO_TAG_P,
+  GUMBO_TAG_HR,
+  GUMBO_TAG_PRE,
+  GUMBO_TAG_BLOCKQUOTE,
+  GUMBO_TAG_OL,
+  GUMBO_TAG_UL,
+  GUMBO_TAG_LI,
+  GUMBO_TAG_DL,
+  GUMBO_TAG_DT,
+  GUMBO_TAG_DD,
+  GUMBO_TAG_FIGURE,
+  GUMBO_TAG_FIGCAPTION,
+  GUMBO_TAG_MAIN,
+  GUMBO_TAG_DIV,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
+  GUMBO_TAG_A,
+  GUMBO_TAG_EM,
+  GUMBO_TAG_STRONG,
+  GUMBO_TAG_SMALL,
+  GUMBO_TAG_S,
+  GUMBO_TAG_CITE,
+  GUMBO_TAG_Q,
+  GUMBO_TAG_DFN,
+  GUMBO_TAG_ABBR,
+  GUMBO_TAG_DATA,
+  GUMBO_TAG_TIME,
+  GUMBO_TAG_CODE,
+  GUMBO_TAG_VAR,
+  GUMBO_TAG_SAMP,
+  GUMBO_TAG_KBD,
+  GUMBO_TAG_SUB,
+  GUMBO_TAG_SUP,
+  GUMBO_TAG_I,
+  GUMBO_TAG_B,
+  GUMBO_TAG_U,
+  GUMBO_TAG_MARK,
+  GUMBO_TAG_RUBY,
+  GUMBO_TAG_RT,
+  GUMBO_TAG_RP,
+  GUMBO_TAG_BDI,
+  GUMBO_TAG_BDO,
+  GUMBO_TAG_SPAN,
+  GUMBO_TAG_BR,
+  GUMBO_TAG_WBR,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
+  GUMBO_TAG_INS,
+  GUMBO_TAG_DEL,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
+  GUMBO_TAG_IMAGE,
+  GUMBO_TAG_IMG,
+  GUMBO_TAG_IFRAME,
+  GUMBO_TAG_EMBED,
+  GUMBO_TAG_OBJECT,
+  GUMBO_TAG_PARAM,
+  GUMBO_TAG_VIDEO,
+  GUMBO_TAG_AUDIO,
+  GUMBO_TAG_SOURCE,
+  GUMBO_TAG_TRACK,
+  GUMBO_TAG_CANVAS,
+  GUMBO_TAG_MAP,
+  GUMBO_TAG_AREA,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
+  GUMBO_TAG_MATH,
+  GUMBO_TAG_MI,
+  GUMBO_TAG_MO,
+  GUMBO_TAG_MN,
+  GUMBO_TAG_MS,
+  GUMBO_TAG_MTEXT,
+  GUMBO_TAG_MGLYPH,
+  GUMBO_TAG_MALIGNMARK,
+  GUMBO_TAG_ANNOTATION_XML,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
+  GUMBO_TAG_SVG,
+  GUMBO_TAG_FOREIGNOBJECT,
+  GUMBO_TAG_DESC,
+  // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
+  GUMBO_TAG_TABLE,
+  GUMBO_TAG_CAPTION,
+  GUMBO_TAG_COLGROUP,
+  GUMBO_TAG_COL,
+  GUMBO_TAG_TBODY,
+  GUMBO_TAG_THEAD,
+  GUMBO_TAG_TFOOT,
+  GUMBO_TAG_TR,
+  GUMBO_TAG_TD,
+  GUMBO_TAG_TH,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
+  GUMBO_TAG_FORM,
+  GUMBO_TAG_FIELDSET,
+  GUMBO_TAG_LEGEND,
+  GUMBO_TAG_LABEL,
+  GUMBO_TAG_INPUT,
+  GUMBO_TAG_BUTTON,
+  GUMBO_TAG_SELECT,
+  GUMBO_TAG_DATALIST,
+  GUMBO_TAG_OPTGROUP,
+  GUMBO_TAG_OPTION,
+  GUMBO_TAG_TEXTAREA,
+  GUMBO_TAG_KEYGEN,
+  GUMBO_TAG_OUTPUT,
+  GUMBO_TAG_PROGRESS,
+  GUMBO_TAG_METER,
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
+  GUMBO_TAG_DETAILS,
+  GUMBO_TAG_SUMMARY,
+  GUMBO_TAG_MENU,
+  GUMBO_TAG_MENUITEM,
+  // Non-conforming elements that nonetheless appear in the HTML5 spec.
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
+  GUMBO_TAG_APPLET,
+  GUMBO_TAG_ACRONYM,
+  GUMBO_TAG_BGSOUND,
+  GUMBO_TAG_DIR,
+  GUMBO_TAG_FRAME,
+  GUMBO_TAG_FRAMESET,
+  GUMBO_TAG_NOFRAMES,
+  GUMBO_TAG_ISINDEX,
+  GUMBO_TAG_LISTING,
+  GUMBO_TAG_XMP,
+  GUMBO_TAG_NEXTID,
+  GUMBO_TAG_NOEMBED,
+  GUMBO_TAG_PLAINTEXT,
+  GUMBO_TAG_RB,
+  GUMBO_TAG_STRIKE,
+  GUMBO_TAG_BASEFONT,
+  GUMBO_TAG_BIG,
+  GUMBO_TAG_BLINK,
+  GUMBO_TAG_CENTER,
+  GUMBO_TAG_FONT,
+  GUMBO_TAG_MARQUEE,
+  GUMBO_TAG_MULTICOL,
+  GUMBO_TAG_NOBR,
+  GUMBO_TAG_SPACER,
+  GUMBO_TAG_TT,
+  // Used for all tags that don't have special handling in HTML.
+  GUMBO_TAG_UNKNOWN,
+  // A marker value to indicate the end of the enum, for iterating over it.
+  // Also used as the terminator for varargs functions that take tags.
+  GUMBO_TAG_LAST,
+} GumboTag;
+
+/**
+ * Returns the normalized (usually all-lowercased, except for foreign content)
+ * tag name for a GumboTag enum.  Return value is static data owned by the
+ * library.
+ */
+const char* gumbo_normalized_tagname(GumboTag tag);
+
+/**
+ * Extracts the tag name from the original_text field of an element or token by
+ * stripping off </> characters and attributes and adjusting the passed-in
+ * GumboStringPiece appropriately.  The tag name is in the original case and
+ * shares a buffer with the original text, to simplify memory management.
+ * Behavior is undefined if a string-piece that doesn't represent an HTML tag
+ * (<tagname> or </tagname>) is passed in.  If the string piece is completely
+ * empty (NULL data pointer), then this function will exit successfully as a
+ * no-op.
+ */
+void gumbo_tag_from_original_text(GumboStringPiece* text);
+
+/**
+ * Fixes the case of SVG elements that are not all lowercase.
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
+ * This is not done at parse time because there's no place to store a mutated
+ * tag name.  tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
+ * without special handling), while original_tag_name is a pointer into the
+ * original buffer.  Instead, we provide this helper function that clients can
+ * use to rename SVG tags as appropriate.
+ * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
+ * no normalization is called for.  The return value is static data and owned by
+ * the library.
+ */
+const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
+
+/**
+ * Converts a tag name string (which may be in upper or mixed case) to a tag
+ * enum.
+ */
+GumboTag gumbo_tag_enum(const char* tagname);
+
+/**
+ * Attribute namespaces.
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces on
+ * attributes.  Everything else goes in the generic "NONE" namespace.
+ */
+typedef enum {
+  GUMBO_ATTR_NAMESPACE_NONE,
+  GUMBO_ATTR_NAMESPACE_XLINK,
+  GUMBO_ATTR_NAMESPACE_XML,
+  GUMBO_ATTR_NAMESPACE_XMLNS,
+} GumboAttributeNamespaceEnum;
+
+/**
+ * A struct representing a single attribute on an HTML tag.  This is a
+ * name-value pair, but also includes information about source locations and
+ * original source text.
+ */
+typedef struct {
+  /**
+   * The namespace for the attribute.  This will usually be
+   * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
+   * values, per:
+   * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
+   */
+  GumboAttributeNamespaceEnum attr_namespace;
+
+  /**
+   * The name of the attribute.  This is in a freshly-allocated buffer to deal
+   * with case-normalization, and is null-terminated.
+   */
+  const char* name;
+
+  /**
+   * The original text of the attribute name, as a pointer into the original
+   * source buffer.
+   */
+  GumboStringPiece original_name;
+
+  /**
+   * The value of the attribute.  This is in a freshly-allocated buffer to deal
+   * with unescaping, and is null-terminated.  It does not include any quotes
+   * that surround the attribute.  If the attribute has no value (for example,
+   * 'selected' on a checkbox), this will be an empty string.
+   */
+  const char* value;
+
+  /**
+   * The original text of the value of the attribute.  This points into the
+   * original source buffer.  It includes any quotes that surround the
+   * attribute, and you can look at original_value.data[0] and
+   * original_value.data[original_value.length - 1] to determine what the quote
+   * characters were.  If the attribute has no value, this will be a 0-length
+   * string.
+   */
+  GumboStringPiece original_value;
+
+  /** The starting position of the attribute name. */
+  GumboSourcePosition name_start;
+
+  /**
+   * The ending position of the attribute name.  This is not always derivable
+   * from the starting position of the value because of the possibility of
+   * whitespace around the = sign.
+   */
+  GumboSourcePosition name_end;
+
+  /** The starting position of the attribute value. */
+  GumboSourcePosition value_start;
+
+  /** The ending position of the attribute value. */
+  GumboSourcePosition value_end;
+} GumboAttribute;
+
+/**
+ * Given a vector of GumboAttributes, look up the one with the specified name
+ * and return it, or NULL if no such attribute exists.  This uses a
+ * case-insensitive match, as HTML is case-insensitive.
+ */
+GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
+
+/**
+ * Enum denoting the type of node.  This determines the type of the node.v
+ * union.
+ */
+typedef enum {
+  /** Document node.  v will be a GumboDocument. */
+  GUMBO_NODE_DOCUMENT,
+  /** Element node.  v will be a GumboElement. */
+  GUMBO_NODE_ELEMENT,
+  /** Text node.  v will be a GumboText. */
+  GUMBO_NODE_TEXT,
+  /** CDATA node. v will be a GumboText. */
+  GUMBO_NODE_CDATA,
+  /** Comment node.  v will be a GumboText, excluding comment delimiters. */
+  GUMBO_NODE_COMMENT,
+  /** Text node, where all contents is whitespace.  v will be a GumboText. */
+  GUMBO_NODE_WHITESPACE
+} GumboNodeType;
+
+/**
+ * Forward declaration of GumboNode so it can be used recursively in
+ * GumboNode.parent.
+ */
+typedef struct GumboInternalNode GumboNode;
+
+/** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
+typedef enum {
+  GUMBO_DOCTYPE_NO_QUIRKS,
+  GUMBO_DOCTYPE_QUIRKS,
+  GUMBO_DOCTYPE_LIMITED_QUIRKS
+} GumboQuirksModeEnum;
+
+/**
+ * Namespaces.
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.  Rather,
+ * anything inside an <svg> tag is in the SVG namespace, anything inside the
+ * <math> tag is in the MathML namespace, and anything else is inside the HTML
+ * namespace.  No other namespaces are supported, so this can be an enum only.
+ */
+typedef enum {
+  GUMBO_NAMESPACE_HTML,
+  GUMBO_NAMESPACE_SVG,
+  GUMBO_NAMESPACE_MATHML
+} GumboNamespaceEnum;
+
+/**
+ * Parse flags.
+ * We track the reasons for parser insertion of nodes and store them in a
+ * bitvector in the node itself.  This lets client code optimize out nodes that
+ * are implied by the HTML structure of the document, or flag constructs that
+ * may not be allowed by a style guide, or track the prevalence of incorrect or
+ * tricky HTML code.
+ */
+typedef enum {
+  /**
+   * A normal node - both start and end tags appear in the source, nothing has
+   * been reparented.
+   */
+  GUMBO_INSERTION_NORMAL = 0,
+
+  /**
+   * A node inserted by the parser to fulfill some implicit insertion rule.
+   * This is usually set in addition to some other flag giving a more specific
+   * insertion reason; it's a generic catch-all term meaning "The start tag for
+   * this node did not appear in the document source".
+   */
+  GUMBO_INSERTION_BY_PARSER = 1 << 0,
+
+  /**
+   * A flag indicating that the end tag for this node did not appear in the
+   * document source.  Note that in some cases, you can still have
+   * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
+   * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
+   * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
+   * exists.  This flag will be set only if the end tag is completely missing;
+   * in some cases, the end tag may be misplaced (eg. a </body> tag with text
+   * afterwards), which will leave this flag unset and require clients to
+   * inspect the parse errors for that case.
+   */
+  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
+
+  // Value 1 << 2 was for a flag that has since been removed.
+
+  /**
+   * A flag for nodes that are inserted because their presence is implied by
+   * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
+   */
+  GUMBO_INSERTION_IMPLIED = 1 << 3,
+
+  /**
+   * A flag for nodes that are converted from their end tag equivalents.  For
+   * example, </p> when no paragraph is open implies that the parser should
+   * create a <p> tag and immediately close it, while </br> means the same thing
+   * as <br>.
+   */
+  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
+
+  /** A flag for nodes that are converted from the parse of an <isindex> tag. */
+  GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
+
+  /** A flag for <image> tags that are rewritten as <img>. */
+  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
+
+  /**
+   * A flag for nodes that are cloned as a result of the reconstruction of
+   * active formatting elements.  This is set only on the clone; the initial
+   * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
+   */
+  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
+
+  /** A flag for nodes that are cloned by the adoption agency algorithm. */
+  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
+
+  /** A flag for nodes that are moved by the adoption agency algorithm. */
+  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
+
+  /**
+   * A flag for nodes that have been foster-parented out of a table (or
+   * should've been foster-parented, if verbatim mode is set).
+   */
+  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
+} GumboParseFlags;
+
+
+/**
+ * Information specific to document nodes.
+ */
+typedef struct {
+  /**
+   * An array of GumboNodes, containing the children of this document.  This will
+   * normally consist of the <html> element and any comment nodes found.
+   * Pointers are owned.
+   */
+  GumboVector /* GumboNode* */ children;
+
+  // True if there was an explicit doctype token as opposed to it being omitted.
+  bool has_doctype;
+
+  // Fields from the doctype token, copied verbatim.
+  const char* name;
+  const char* public_identifier;
+  const char* system_identifier;
+
+  /**
+   * Whether or not the document is in QuirksMode, as determined by the values
+   * in the GumboTokenDocType template.
+   */
+  GumboQuirksModeEnum doc_type_quirks_mode;
+} GumboDocument;
+
+/**
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
+ * This contains just a block of text and its position.
+ */
+typedef struct {
+  /**
+   * The text of this node, after entities have been parsed and decoded.  For
+   * comment/cdata nodes, this does not include the comment delimiters.
+   */
+  const char* text;
+
+  /**
+   * The original text of this node, as a pointer into the original buffer.  For
+   * comment/cdata nodes, this includes the comment delimiters.
+   */
+  GumboStringPiece original_text;
+
+  /**
+   * The starting position of this node.  This corresponds to the position of
+   * original_text, before entities are decoded.
+   */
+  GumboSourcePosition start_pos;
+} GumboText;
+
+/**
+ * The struct used to represent all HTML elements.  This contains information
+ * about the tag, attributes, and child nodes.
+ */
+typedef struct {
+  /**
+   * An array of GumboNodes, containing the children of this element.  Pointers
+   * are owned.
+   */
+  GumboVector /* GumboNode* */ children;
+
+  /** The GumboTag enum for this element. */
+  GumboTag tag;
+
+  /** The GumboNamespaceEnum for this element. */
+  GumboNamespaceEnum tag_namespace;
+
+  /**
+   * A GumboStringPiece pointing to the original tag text for this element,
+   * pointing directly into the source buffer.  If the tag was inserted
+   * algorithmically (for example, <head> or <tbody> insertion), this will be a
+   * zero-length string.
+   */
+  GumboStringPiece original_tag;
+
+  /**
+   * A GumboStringPiece pointing to the original end tag text for this element.
+   * If the end tag was inserted algorithmically, (for example, closing a
+   * self-closing tag), this will be a zero-length string.
+   */
+  GumboStringPiece original_end_tag;
+
+  /** The source position for the start of the start tag. */
+  GumboSourcePosition start_pos;
+
+  /** The source position for the start of the end tag. */
+  GumboSourcePosition end_pos;
+
+  /**
+   * An array of GumboAttributes, containing the attributes for this tag in the
+   * order that they were parsed.  Pointers are owned.
+   */
+  GumboVector /* GumboAttribute* */ attributes;
+} GumboElement;
+
+/**
+ * A supertype for GumboDocument, GumboElement and GumboText, so that we can
+ * include one generic type in lists of children and cast as necessary to
+ * subtypes.
+ */
+struct GumboInternalNode {
+  /** The type of node that this is. */
+  GumboNodeType type;
+
+  /** Pointer back to parent node.  Not owned. */
+  GumboNode* parent;
+
+  /** The index within the parent's children vector of this node. */
+  size_t index_within_parent;
+
+  /**
+   * A bitvector of flags containing information about why this element was
+   * inserted into the parse tree, including a variety of special parse
+   * situations.
+   */
+  GumboParseFlags parse_flags;
+
+  /** The actual node data. */
+  union {
+    GumboDocument document;      // For GUMBO_NODE_DOCUMENT.
+    GumboElement element;        // For GUMBO_NODE_ELEMENT.
+    GumboText text;              // For everything else.
+  } v;
+};
+
+/**
+ * The type for an allocator function.  Takes the 'userdata' member of the
+ * GumboParser struct as its first argument.  Semantics should be the same as
+ * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
+ * Allocating a block of 0 bytes behaves as per malloc.
+ */
+// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
+typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
+
+/**
+ * The type for a deallocator function.  Takes the 'userdata' member of the
+ * GumboParser struct as its first argument.
+ */
+typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
+
+/**
+ * Input struct containing configuration options for the parser.
+ * These let you specify alternate memory managers, provide different error
+ * handling, etc.
+ * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
+ */
+typedef struct GumboInternalOptions {
+  /** A memory allocator function.  Default: malloc. */
+  GumboAllocatorFunction allocator;
+
+  /** A memory deallocator function. Default: free. */
+  GumboDeallocatorFunction deallocator;
+
+  /**
+   * An opaque object that's passed in as the first argument to all callbacks
+   * used by this library.  Default: NULL.
+   */
+  void* userdata;
+
+  /**
+   * The tab-stop size, for computing positions in source code that uses tabs.
+   * Default: 8.
+   */
+  int tab_stop;
+
+  /**
+   * Whether or not to stop parsing when the first error is encountered.
+   * Default: false.
+   */
+  bool stop_on_first_error;
+
+  /**
+   * The maximum number of errors before the parser stops recording them.  This
+   * is provided so that if the page is totally borked, we don't completely fill
+   * up the errors vector and exhaust memory with useless redundant errors.  Set
+   * to -1 to disable the limit.
+   * Default: -1
+   */
+  int max_errors;
+} GumboOptions;
+
+/** Default options struct; use this with gumbo_parse_with_options. */
+extern const GumboOptions kGumboDefaultOptions;
+
+/** The output struct containing the results of the parse. */
+typedef struct GumboInternalOutput {
+  /**
+   * Pointer to the document node.  This is a GumboNode of type NODE_DOCUMENT
+   * that contains the entire document as its child.
+   */
+  GumboNode* document;
+
+  /**
+   * Pointer to the root node.  This is the <html> tag that forms the root of
+   * the document.
+   */
+  GumboNode* root;
+
+  /**
+   * A list of errors that occurred during the parse.
+   * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
+   * fleshed out and may change in the future.  For this reason, the GumboError
+   * header isn't part of the public API.  Contact us if you need errors
+   * reported so we can work out something appropriate for your use-case.
+   */
+  GumboVector /* GumboError */ errors;
+} GumboOutput;
+
+/**
+ * Parses a buffer of UTF8 text into a GumboNode parse tree.  The buffer must
+ * live at least as long as the parse tree, as some fields (eg. original_text)
+ * point directly into the original buffer.
+ *
+ * This doesn't support buffers longer than 4 gigabytes.
+ */
+GumboOutput* gumbo_parse(const char* buffer);
+
+/**
+ * Extended version of gumbo_parse that takes an explicit options structure,
+ * buffer, and length.
+ */
+GumboOutput* gumbo_parse_with_options(
+    const GumboOptions* options, const char* buffer, size_t buffer_length);
+
+/** Release the memory used for the parse tree & parse errors. */
+void gumbo_destroy_output(
+    const GumboOptions* options, GumboOutput* output);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_GUMBO_H_

+ 57 - 0
gumbo/insertion_mode.h

@@ -0,0 +1,57 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#ifndef GUMBO_INSERTION_MODE_H_
+#define GUMBO_INSERTION_MODE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
+// If new enum values are added, be sure to update the kTokenHandlers dispatch
+// table in parser.c.
+typedef enum {
+  GUMBO_INSERTION_MODE_INITIAL,
+  GUMBO_INSERTION_MODE_BEFORE_HTML,
+  GUMBO_INSERTION_MODE_BEFORE_HEAD,
+  GUMBO_INSERTION_MODE_IN_HEAD,
+  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
+  GUMBO_INSERTION_MODE_AFTER_HEAD,
+  GUMBO_INSERTION_MODE_IN_BODY,
+  GUMBO_INSERTION_MODE_TEXT,
+  GUMBO_INSERTION_MODE_IN_TABLE,
+  GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
+  GUMBO_INSERTION_MODE_IN_CAPTION,
+  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
+  GUMBO_INSERTION_MODE_IN_TABLE_BODY,
+  GUMBO_INSERTION_MODE_IN_ROW,
+  GUMBO_INSERTION_MODE_IN_CELL,
+  GUMBO_INSERTION_MODE_IN_SELECT,
+  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
+  GUMBO_INSERTION_MODE_IN_TEMPLATE,
+  GUMBO_INSERTION_MODE_AFTER_BODY,
+  GUMBO_INSERTION_MODE_IN_FRAMESET,
+  GUMBO_INSERTION_MODE_AFTER_FRAMESET,
+  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
+  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
+} GumboInsertionMode;
+
+#ifdef __cplusplus
+}  // extern C
+#endif
+
+#endif  // GUMBO_INSERTION_MODE_H_

+ 3925 - 0
gumbo/parser.c

@@ -0,0 +1,3925 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "attribute.h"
+#include "error.h"
+#include "gumbo.h"
+#include "insertion_mode.h"
+#include "parser.h"
+#include "tokenizer.h"
+#include "tokenizer_states.h"
+#include "utf8.h"
+#include "util.h"
+#include "vector.h"
+
+
+#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
+
+#define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 }
+#define TERMINATOR { "", 0 }
+
+// Allocator installed in kGumboDefaultOptions.  The first argument is the
+// userdata slot of the allocator callback signature; it is ignored here and
+// the request is handed straight to malloc().
+static void* malloc_wrapper(void* unused, size_t size) {
+  void* block = malloc(size);
+  AVOID_UNUSED_VARIABLE_WARNING(unused);
+  return block;
+}
+
+// Deallocator installed in kGumboDefaultOptions.  The first argument is the
+// userdata slot of the deallocator callback signature; it is ignored here and
+// the pointer is handed straight to free().
+static void free_wrapper(void* unused, void* ptr) {
+  AVOID_UNUSED_VARIABLE_WARNING(unused);
+  // free() yields no value, so call it as a plain statement: ISO C does not
+  // allow a return statement with an expression in a function returning void.
+  free(ptr);
+}
+
+const GumboOptions kGumboDefaultOptions = {
+  &malloc_wrapper,  // allocator
+  &free_wrapper,    // deallocator
+  NULL,             // userdata
+  8,                // tab_stop
+  false,            // stop_on_first_error
+  -1,               // max_errors (-1 disables the limit)
+};
+
+static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
+static const GumboStringPiece kPublicIdHtml4_0 = GUMBO_STRING(
+    "-//W3C//DTD HTML 4.0//EN");
+static const GumboStringPiece kPublicIdHtml4_01 = GUMBO_STRING(
+    "-//W3C//DTD HTML 4.01//EN");
+static const GumboStringPiece kPublicIdXhtml1_0 = GUMBO_STRING(
+    "-//W3C//DTD XHTML 1.0 Strict//EN");
+static const GumboStringPiece kPublicIdXhtml1_1 = GUMBO_STRING(
+    "-//W3C//DTD XHTML 1.1//EN");
+static const GumboStringPiece kSystemIdRecHtml4_0 = GUMBO_STRING(
+    "http://www.w3.org/TR/REC-html40/strict.dtd");
+static const GumboStringPiece kSystemIdHtml4 = GUMBO_STRING(
+    "http://www.w3.org/TR/html4/strict.dtd");
+static const GumboStringPiece kSystemIdXhtmlStrict1_1 = GUMBO_STRING(
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
+static const GumboStringPiece kSystemIdXhtml1_1 = GUMBO_STRING(
+    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
+static const GumboStringPiece kSystemIdLegacyCompat = GUMBO_STRING(
+    "about:legacy-compat");
+
+// The doctype arrays have an explicit terminator because we want to pass them
+// to a helper function, and passing them as a pointer discards sizeof
+// information.  The SVG arrays are used only by one-off functions, and so loops
+// over them use sizeof directly instead of a terminator.
+
+// Doctype public-identifier prefixes taken from the HTML spec's quirks-mode
+// list.  The array ends with TERMINATOR so that it can be walked through a
+// plain pointer without a separate length.
+// Note: the HoTMetaL PRO 6.0 entry must read "...19990601::extensions to HTML
+// 4.0//" with no ')' after the "::", otherwise that doctype never matches.
+static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
+  GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
+  GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
+  GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
+  GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
+  GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
+  GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
+  GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
+  GUMBO_STRING("-//IETF//DTD HTML 3//"),
+  GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
+  GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
+  GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
+  GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
+  GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
+  GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
+  GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
+  GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
+  GUMBO_STRING("-//IETF//DTD HTML Strict//"),
+  GUMBO_STRING("-//IETF//DTD HTML//"),
+  GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
+  GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
+  GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
+  GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
+  GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
+  GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
+  GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
+  GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
+  GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
+  GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
+  GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
+  GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
+  GUMBO_STRING("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::"
+      "extensions to HTML 4.0//"),
+  GUMBO_STRING("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
+      "extensions to HTML 4.0//"),
+  GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
+  GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
+  GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
+  GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
+  GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
+  GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
+  GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
+  GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
+  GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
+  GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
+  GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
+  GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
+  GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
+  GUMBO_STRING("-//W3C//DTD W3 HTML//"),
+  GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
+  GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
+  GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"),
+  TERMINATOR
+};
+
+static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
+  GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
+  GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"),
+  GUMBO_STRING("HTML"),
+  TERMINATOR
+};
+
+static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
+  GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
+  TERMINATOR
+};
+
+static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
+  GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
+  GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"),
+  TERMINATOR
+};
+
+static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = {
+  GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
+  GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"),
+  TERMINATOR
+};
+
+// Indexed by GumboNamespaceEnum; keep in sync with that.
+static const char* kLegalXmlns[] = {
+  "http://www.w3.org/1999/xhtml",
+  "http://www.w3.org/2000/svg",
+  "http://www.w3.org/1998/Math/MathML"
+};
+
+typedef struct _ReplacementEntry {
+  const GumboStringPiece from;
+  const GumboStringPiece to;
+} ReplacementEntry;
+
+#define REPLACEMENT_ENTRY(from, to) \
+    { GUMBO_STRING(from), GUMBO_STRING(to) }
+
+// Static data for SVG attribute replacements.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes
+static const ReplacementEntry kSvgAttributeReplacements[] = {
+  REPLACEMENT_ENTRY("attributename", "attributeName"),
+  REPLACEMENT_ENTRY("attributetype", "attributeType"),
+  REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
+  REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
+  REPLACEMENT_ENTRY("calcmode", "calcMode"),
+  REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
+  REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
+  REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
+  REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
+  REPLACEMENT_ENTRY("edgemode", "edgeMode"),
+  REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
+  REPLACEMENT_ENTRY("filterres", "filterRes"),
+  REPLACEMENT_ENTRY("filterunits", "filterUnits"),
+  REPLACEMENT_ENTRY("glyphref", "glyphRef"),
+  REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
+  REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
+  REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
+  REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
+  REPLACEMENT_ENTRY("keypoints", "keyPoints"),
+  REPLACEMENT_ENTRY("keysplines", "keySplines"),
+  REPLACEMENT_ENTRY("keytimes", "keyTimes"),
+  REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
+  REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
+  REPLACEMENT_ENTRY("markerheight", "markerHeight"),
+  REPLACEMENT_ENTRY("markerunits", "markerUnits"),
+  REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
+  REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
+  REPLACEMENT_ENTRY("maskunits", "maskUnits"),
+  REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
+  REPLACEMENT_ENTRY("pathlength", "pathLength"),
+  REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
+  REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
+  REPLACEMENT_ENTRY("patternunits", "patternUnits"),
+  REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
+  REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
+  REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
+  REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
+  REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
+  REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
+  REPLACEMENT_ENTRY("refx", "refX"),
+  REPLACEMENT_ENTRY("refy", "refY"),
+  REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
+  REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
+  REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
+  REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
+  REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
+  REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
+  REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
+  REPLACEMENT_ENTRY("startoffset", "startOffset"),
+  REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
+  REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
+  REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
+  REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
+  REPLACEMENT_ENTRY("tablevalues", "tableValues"),
+  REPLACEMENT_ENTRY("targetx", "targetX"),
+  REPLACEMENT_ENTRY("targety", "targetY"),
+  REPLACEMENT_ENTRY("textlength", "textLength"),
+  REPLACEMENT_ENTRY("viewbox", "viewBox"),
+  REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
+  REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
+  REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
+  REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
+};
+
+static const ReplacementEntry kSvgTagReplacements[] = {  // Each entry pairs an all-lowercase SVG tag name with its mixed-case spelling.
+  REPLACEMENT_ENTRY("altglyph", "altGlyph"),
+  REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
+  REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
+  REPLACEMENT_ENTRY("animatecolor", "animateColor"),
+  REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
+  REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
+  REPLACEMENT_ENTRY("clippath", "clipPath"),
+  REPLACEMENT_ENTRY("feblend", "feBlend"),
+  REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
+  REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
+  REPLACEMENT_ENTRY("fecomposite", "feComposite"),
+  REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
+  REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
+  REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
+  REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
+  REPLACEMENT_ENTRY("feflood", "feFlood"),
+  REPLACEMENT_ENTRY("fefunca", "feFuncA"),
+  REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
+  REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
+  REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
+  REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
+  REPLACEMENT_ENTRY("feimage", "feImage"),
+  REPLACEMENT_ENTRY("femerge", "feMerge"),
+  REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
+  REPLACEMENT_ENTRY("femorphology", "feMorphology"),
+  REPLACEMENT_ENTRY("feoffset", "feOffset"),
+  REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
+  REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
+  REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
+  REPLACEMENT_ENTRY("fetile", "feTile"),
+  REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
+  REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
+  REPLACEMENT_ENTRY("glyphref", "glyphRef"),
+  REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
+  REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
+  REPLACEMENT_ENTRY("textpath", "textPath"),
+};
+
+typedef struct _NamespacedAttributeReplacement {  // Row type of kForeignAttributeReplacements.
+  const char* from;  // Attribute name as it appears in the table's first column, e.g. "xlink:href".
+  const char* local_name;  // Local name paired with `from`, e.g. "href".
+  const GumboAttributeNamespaceEnum attr_namespace;  // Namespace paired with `from`, e.g. GUMBO_ATTR_NAMESPACE_XLINK.
+} NamespacedAttributeReplacement;
+
+// Namespaced attributes recognised in foreign content.  Each row gives the
+// qualified name as written in the source, the local name to use instead, and
+// the namespace the attribute belongs to.  The xlink rows follow the HTML
+// spec's "adjust foreign attributes" table: actuate, arcrole, href, role,
+// show, title and type.
+static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
+  { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xlink:arcrole", "arcrole", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK },
+  { "xml:base", "base", GUMBO_ATTR_NAMESPACE_XML },
+  { "xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML },
+  { "xml:space", "space", GUMBO_ATTR_NAMESPACE_XML },
+  { "xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS },
+  { "xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS },
+};
+
+// The "scope marker" for the list of active formatting elements.  A pointer
+// to this object is used as the generic marker entry, since the particular
+// element scope it stands for doesn't matter.
+static const GumboNode kActiveFormattingScopeMarker;
+
+// The tag_is and tag_in functions use true & false to denote start & end
+// tags, but for readability, we define constants for them here.
+static const bool kStartTag = true;
+static const bool kEndTag = false;
+
+// Because GumboStringPieces are immutable, we can't insert a character directly
+// into a text node.  Instead, we accumulate all pending characters here and
+// flush them out to a text node whenever a new element is inserted.
+//
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
+typedef struct _TextNodeBufferState {
+  // The accumulated text to be inserted into the current text node.
+  GumboStringBuffer _buffer;  // Destroyed and re-initialized after every flush.
+
+  // A pointer to the original text represented by this text node.  Note that
+  // because of foster parenting and other strange DOM manipulations, this may
+  // include other non-text HTML tags in it; it is defined as the span of
+  // original text from the first character in this text node to the last
+  // character in this text node.
+  const char* _start_original_text;
+
+  // The source position of the start of this text node.
+  GumboSourcePosition _start_position;
+
+  // The type of node that will be inserted (TEXT or WHITESPACE).
+  GumboNodeType _type;  // Starts as, and is reset after each flush to, GUMBO_NODE_WHITESPACE.
+} TextNodeBufferState;
+
+typedef struct GumboInternalParserState {  // Tree-construction state, reachable as parser->_parser_state.
+  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
+  GumboInsertionMode _insertion_mode;  // Starts as GUMBO_INSERTION_MODE_INITIAL.
+
+  // Used for run_generic_parsing_algorithm, which needs to switch back to the
+  // original insertion mode at its conclusion.
+  GumboInsertionMode _original_insertion_mode;
+
+  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
+  GumboVector /*GumboNode*/ _open_elements;  // The current node is the last entry.
+
+  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
+  GumboVector /*GumboNode*/ _active_formatting_elements;
+
+  // The stack of template insertion modes.
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
+  GumboVector /*InsertionMode*/ _template_insertion_modes;  // Modes are stored cast to void*.
+
+  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
+  GumboNode* _head_element;  // NULL after initialization.
+  GumboNode* _form_element;  // NULL after initialization.
+
+  // The flag for when the spec says "Reprocess the current token in..."
+  bool _reprocess_current_token;
+
+  // The flag for "acknowledge the token's self-closing flag".
+  bool _self_closing_flag_acknowledged;
+
+  // The "frameset-ok" flag from the spec.
+  bool _frameset_ok;  // Starts true; cleared by set_frameset_not_ok.
+
+  // The flag for "If the next token is a LINE FEED, ignore that token...".
+  bool _ignore_next_linefeed;
+
+  // The flag for "whenever a node would be inserted into the current node, it
+  // must instead be foster parented".  This is used for misnested table
+  // content, which needs to be handled according to "in body" rules yet foster
+  // parented outside of the table.
+  // It would perhaps be more explicit to have this as a parameter to
+  // handle_in_body and insert_element, but given how special-purpose this is
+  // and the number of call-sites that would need to take the extra parameter,
+  // it's easier just to have a state flag.
+  bool _foster_parent_insertions;
+
+  // The accumulated text node buffer state.
+  TextNodeBufferState _text_node;
+
+  // The current token.
+  GumboToken* _current_token;  // NULL after initialization.
+
+  // The way that the spec is written, the </body> and </html> tags are *always*
+  // implicit, because encountering one of those tokens merely switches the
+  // insertion mode out of "in body".  So we have individual state flags for
+  // those end tags that are then inspected by pop_current_node when the <body>
+  // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
+  // flag appropriately.
+  bool _closed_body_tag;
+  bool _closed_html_tag;
+} GumboParserState;
+
+// Reports whether a start-tag token carries an attribute called `name`.
+// The token is asserted to be a start tag.
+static bool token_has_attribute(const GumboToken* token, const char* name) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  const GumboVector* attributes = &token->v.start_tag.attributes;
+  const GumboAttribute* found = gumbo_get_attribute(attributes, name);
+  return found != NULL;
+}
+
+// Returns true when `attributes` holds an attribute called `name` whose value
+// equals `value`, ignoring case.  A missing attribute never matches.
+static bool attribute_matches(
+    const GumboVector* attributes, const char* name, const char* value) {
+  const GumboAttribute* found = gumbo_get_attribute(attributes, name);
+  if (!found) {
+    return false;
+  }
+  return strcasecmp(value, found->value) == 0;
+}
+
+// Returns true when `attributes` holds an attribute called `name` whose value
+// is byte-for-byte equal to `value`.  A missing attribute never matches.
+static bool attribute_matches_case_sensitive(
+    const GumboVector* attributes, const char* name, const char* value) {
+  const GumboAttribute* found = gumbo_get_attribute(attributes, name);
+  if (!found) {
+    return false;
+  }
+  return strcmp(value, found->value) == 0;
+}
+
+// Checks if the specified attribute vectors are identical: every attribute in
+// attr1 must have a same-named attribute in attr2 with exactly the same value,
+// and the number of such matches must equal the length of attr2.
+static bool all_attributes_match(
+    const GumboVector* attr1, const GumboVector* attr2) {
+  int remaining_in_attr2 = attr2->length;
+  for (int i = 0; i < attr1->length; ++i) {
+    const GumboAttribute* candidate = attr1->data[i];
+    if (!attribute_matches_case_sensitive(
+            attr2, candidate->name, candidate->value)) {
+      // One mismatch is enough to reject the pair.
+      return false;
+    }
+    --remaining_in_attr2;
+  }
+  return remaining_in_attr2 == 0;
+}
+
+// Clears the parser's "frameset-ok" flag.
+static void set_frameset_not_ok(GumboParser* parser) {
+  GumboParserState* state = parser->_parser_state;
+  gumbo_debug("Setting frameset_ok to false.\n");
+  state->_frameset_ok = false;
+}
+
+// Allocates a GumboNode of the given type that is not attached to any parent.
+// The type-specific payload in node->v is left for the caller to fill in.
+static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
+  GumboNode* fresh = gumbo_parser_allocate(parser, sizeof(GumboNode));
+  fresh->type = type;
+  fresh->parse_flags = GUMBO_INSERTION_NORMAL;
+  // Detached: no parent, and -1 as the "no index" value.
+  fresh->parent = NULL;
+  fresh->index_within_parent = -1;
+  return fresh;
+}
+
+// Builds the document node: flagged as parser-inserted, with an initialized
+// child vector and all doctype fields cleared.
+static GumboNode* new_document_node(GumboParser* parser) {
+  GumboNode* node = create_node(parser, GUMBO_NODE_DOCUMENT);
+  GumboDocument* doc = &node->v.document;
+  node->parse_flags = GUMBO_INSERTION_BY_PARSER;
+  gumbo_vector_init(parser, 1, &doc->children);
+
+  // The doctype fields must be initialized explicitly, as there's no
+  // guarantee that a doc type token will ever be seen.
+  doc->has_doctype = false;
+  doc->name = NULL;
+  doc->public_identifier = NULL;
+  doc->system_identifier = NULL;
+  return node;
+}
+
+// Allocates the parser's GumboOutput with no root element and a new document
+// node, stores it in parser->_output, and then calls gumbo_init_errors.
+static void output_init(GumboParser* parser) {
+  GumboOutput* result = gumbo_parser_allocate(parser, sizeof(GumboOutput));
+  result->root = NULL;
+  result->document = new_document_node(parser);
+  parser->_output = result;
+  gumbo_init_errors(parser);
+}
+
+// Allocates the parser's tree-construction state, gives every field a defined
+// starting value, and stores it in parser->_parser_state.
+static void parser_state_init(GumboParser* parser) {
+  GumboParserState* parser_state =
+      gumbo_parser_allocate(parser, sizeof(GumboParserState));
+  parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
+  parser_state->_reprocess_current_token = false;
+  parser_state->_frameset_ok = true;
+  parser_state->_ignore_next_linefeed = false;
+  parser_state->_foster_parent_insertions = false;
+  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
+  gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
+  gumbo_vector_init(parser, 10, &parser_state->_open_elements);
+  gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
+  gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
+  parser_state->_head_element = NULL;
+  parser_state->_form_element = NULL;
+  parser_state->_current_token = NULL;
+  parser_state->_closed_body_tag = false;
+  parser_state->_closed_html_tag = false;
+
+  // The remaining fields are given defined values too, so that none of them
+  // is indeterminate should the allocator hand back unzeroed memory.
+  parser_state->_original_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
+  parser_state->_self_closing_flag_acknowledged = false;
+  parser_state->_text_node._start_original_text = NULL;
+  parser_state->_text_node._start_position = kGumboEmptySourcePosition;
+
+  parser->_parser_state = parser_state;
+}
+
+// Releases the vectors and text buffer held by the parser state, then the
+// state object itself.
+static void parser_state_destroy(GumboParser* parser) {
+  GumboParserState* doomed = parser->_parser_state;
+  TextNodeBufferState* pending_text = &doomed->_text_node;
+  gumbo_vector_destroy(parser, &doomed->_active_formatting_elements);
+  gumbo_vector_destroy(parser, &doomed->_open_elements);
+  gumbo_vector_destroy(parser, &doomed->_template_insertion_modes);
+  gumbo_string_buffer_destroy(parser, &pending_text->_buffer);
+  gumbo_parser_deallocate(parser, doomed);
+}
+
+// Returns the document node stored in the parser's output.
+static GumboNode* get_document_node(GumboParser* parser) {
+  GumboOutput* output = parser->_output;
+  return output->document;
+}
+
+// Returns the current node, i.e. the last entry in the stack of open
+// elements, or NULL if no elements have been added yet.
+static GumboNode* get_current_node(GumboParser* parser) {
+  GumboVector* stack = &parser->_parser_state->_open_elements;
+  if (stack->length > 0) {
+    assert(stack->data != NULL);
+    return stack->data[stack->length - 1];
+  }
+  // An empty stack is only expected before a root element exists.
+  assert(!parser->_output->root);
+  return NULL;
+}
+
+// Returns true if the given needle matches an entry in the given array of
+// literal GumboStringPieces.  The array must be terminated by an entry whose
+// length is 0.  If exact_match is true, the needle must equal an entry;
+// otherwise it is enough for the needle to start with an entry (a prefix
+// match).  Both modes compare case-insensitively.
+static bool is_in_static_list(
+    const char* needle, const GumboStringPiece* haystack, bool exact_match) {
+  for (int i = 0; haystack[i].length > 0; ++i) {
+    const GumboStringPiece* entry = &haystack[i];
+    // Bounding the comparison by the entry's length is what makes the
+    // non-exact mode a prefix test rather than a whole-string test.
+    const bool matched = exact_match ?
+        strcasecmp(needle, entry->data) == 0 :
+        strncasecmp(needle, entry->data, entry->length) == 0;
+    if (matched) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Pushes `mode` onto the stack of template insertion modes.  The mode is
+// stored in the vector cast to void*.
+static void push_template_insertion_mode(
+    GumboParser* parser, GumboInsertionMode mode) {
+  GumboVector* modes = &parser->_parser_state->_template_insertion_modes;
+  gumbo_vector_add(parser, (void*) mode, modes);
+}
+
+// Pops the most recent entry off the stack of template insertion modes and
+// discards it.
+static void pop_template_insertion_mode(GumboParser* parser) {
+  GumboVector* modes = &parser->_parser_state->_template_insertion_modes;
+  gumbo_vector_pop(parser, modes);
+}
+
+// Returns the most recently pushed template insertion mode.  The stack is
+// asserted to be non-empty.
+static GumboInsertionMode get_current_template_insertion_mode(
+    GumboParser* parser) {
+  GumboVector* modes = &parser->_parser_state->_template_insertion_modes;
+  assert(modes->length > 0);
+  void* top = modes->data[modes->length - 1];
+  return (GumboInsertionMode) top;
+}
+
+// Switches the parser to the given insertion mode.
+static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
+  GumboParserState* state = parser->_parser_state;
+  state->_insertion_mode = mode;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
+// Helper that computes (rather than sets) the insertion mode implied by one
+// element of the stack of open elements.  GUMBO_INSERTION_MODE_INITIAL is
+// returned as a sentinel meaning "this element decides nothing; keep
+// looking".  When is_last is true the sentinel is never returned.
+static GumboInsertionMode get_appropriate_insertion_mode(
+    const GumboNode* node, bool is_last) {
+  assert(node->type == GUMBO_NODE_ELEMENT);
+  const GumboInsertionMode fallback =
+      is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
+  switch (node->v.element.tag) {
+    case GUMBO_TAG_SELECT:
+      return GUMBO_INSERTION_MODE_IN_SELECT;
+    case GUMBO_TAG_TD:
+    case GUMBO_TAG_TH:
+      if (is_last) {
+        return GUMBO_INSERTION_MODE_IN_BODY;
+      }
+      return GUMBO_INSERTION_MODE_IN_CELL;
+    case GUMBO_TAG_TR:
+      return GUMBO_INSERTION_MODE_IN_ROW;
+    case GUMBO_TAG_TBODY:
+    case GUMBO_TAG_THEAD:
+    case GUMBO_TAG_TFOOT:
+      return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
+    case GUMBO_TAG_CAPTION:
+      return GUMBO_INSERTION_MODE_IN_CAPTION;
+    case GUMBO_TAG_COLGROUP:
+      return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
+    case GUMBO_TAG_TABLE:
+      return GUMBO_INSERTION_MODE_IN_TABLE;
+    case GUMBO_TAG_HEAD:
+    case GUMBO_TAG_BODY:
+      return GUMBO_INSERTION_MODE_IN_BODY;
+    case GUMBO_TAG_FRAMESET:
+      return GUMBO_INSERTION_MODE_IN_FRAMESET;
+    case GUMBO_TAG_HTML:
+      return GUMBO_INSERTION_MODE_BEFORE_HEAD;
+    default:
+      return fallback;
+  }
+}
+
+// Runs the "reset the insertion mode" loop: walks the stack of open elements
+// from the current node down to the first entry and adopts the first
+// non-sentinel mode that get_appropriate_insertion_mode reports.
+static void reset_insertion_mode_appropriately(GumboParser* parser) {
+  const GumboVector* stack = &parser->_parser_state->_open_elements;
+  int i = stack->length;
+  while (--i >= 0) {
+    const bool is_last = (i == 0);
+    const GumboInsertionMode candidate =
+        get_appropriate_insertion_mode(stack->data[i], is_last);
+    if (candidate == GUMBO_INSERTION_MODE_INITIAL) {
+      continue;
+    }
+    set_insertion_mode(parser, candidate);
+    return;
+  }
+  // Should never get here, because is_last will be set on the last iteration
+  // and will force GUMBO_INSERTION_MODE_IN_BODY.
+  assert(0);
+}
+
+static GumboError* add_parse_error(GumboParser* parser, const GumboToken* token) {
+  gumbo_debug("Adding parse error.\n");
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return NULL;
+  }
+  error->type = GUMBO_ERR_PARSER;
+  error->position = token->position;
+  error->original_text = token->original_text.data;
+  GumboParserError* extra_data = &error->v.parser;
+  extra_data->input_type = token->type;
+  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
+  if (token->type == GUMBO_TOKEN_START_TAG) {
+    extra_data->input_tag = token->v.start_tag.tag;
+  } else if (token->type == GUMBO_TOKEN_END_TAG) {
+    extra_data->input_tag = token->v.end_tag;
+  }
+  GumboParserState* state = parser->_parser_state;
+  extra_data->parser_state = state->_insertion_mode;
+  gumbo_vector_init(parser, state->_open_elements.length,
+                   &extra_data->tag_stack);
+  for (int i = 0; i < state->_open_elements.length; ++i) {
+    const GumboNode* node = state->_open_elements.data[i];
+    assert(node->type == GUMBO_NODE_ELEMENT);
+    gumbo_vector_add(parser, (void*) node->v.element.tag,
+                    &extra_data->tag_stack);
+  }
+  return error;
+}
+
+// Returns true if the specified token is a start tag (is_start true) or an
+// end tag (is_start false) whose tag is one of those in the varargs list.
+// The list must be terminated with GUMBO_TAG_LAST, which works as a sentinel
+// since no portion of the spec references tags that are not in the spec.
+// TODO(jdtang): Many of the tag lists passed to this function are repeated
+// throughout the code.  That mirrors the spec and keeps the code easy to
+// verify against it, but a "tag set" abstraction might be worth introducing;
+// it would probably help performance too, though that should be profiled
+// before optimizing.
+static bool tag_in(const GumboToken* token, bool is_start, ...) {
+  GumboTag token_tag;
+  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
+    token_tag = token->v.start_tag.tag;
+  } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
+    token_tag = token->v.end_tag;
+  } else {
+    // Wrong kind of token; the tag list is not consulted at all.
+    return false;
+  }
+
+  bool found = false;
+  va_list tags;
+  va_start(tags, is_start);
+  for (;;) {
+    const GumboTag candidate = va_arg(tags, GumboTag);
+    if (candidate == GUMBO_TAG_LAST) {
+      break;
+    }
+    if (candidate == token_tag) {
+      found = true;
+      break;
+    }
+  }
+  va_end(tags);
+  return found;
+}
+
+// Single-tag variant of tag_in: true only when the token is of the requested
+// kind (start or end tag) and its tag equals `tag`.
+static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
+  if (is_start) {
+    return token->type == GUMBO_TOKEN_START_TAG &&
+        token->v.start_tag.tag == tag;
+  }
+  return token->type == GUMBO_TOKEN_END_TAG && token->v.end_tag == tag;
+}
+
+// Like tag_in, but checks the tag of a node rather than a token.  The varargs
+// list must be terminated with GUMBO_TAG_LAST.  Non-element nodes never
+// match; `node` is asserted to be non-NULL.
+static bool node_tag_in(const GumboNode* node, ...) {
+  assert(node != NULL);
+  if (node->type != GUMBO_NODE_ELEMENT) {
+    return false;
+  }
+  const GumboTag wanted = node->v.element.tag;
+
+  bool found = false;
+  va_list tags;
+  va_start(tags, node);
+  for (;;) {
+    const GumboTag candidate = va_arg(tags, GumboTag);
+    if (candidate == GUMBO_TAG_LAST) {
+      break;
+    }
+    assert(candidate <= GUMBO_TAG_LAST);
+    if (candidate == wanted) {
+      found = true;
+      break;
+    }
+  }
+  va_end(tags);
+  return found;
+}
+
+// Single-tag variant of node_tag_in: true only for element nodes whose tag
+// equals `tag`.
+static bool node_tag_is(const GumboNode* node, GumboTag tag) {
+  if (node->type != GUMBO_NODE_ELEMENT) {
+    return false;
+  }
+  return node->v.element.tag == tag;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
+// True for mi, mo, mn, ms and mtext elements in the MathML namespace.
+static bool is_mathml_integration_point(const GumboNode* node) {
+  // The tag test also rejects non-element nodes, so the element payload is
+  // only read afterwards.
+  if (!node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
+                   GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST)) {
+    return false;
+  }
+  return node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
+// True for foreignObject, desc and title elements in the SVG namespace, and
+// for annotation-xml elements whose "encoding" attribute is "text/html" or
+// "application/xhtml+xml" (compared case-insensitively).
+static bool is_html_integration_point(const GumboNode* node) {
+  if (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC,
+                  GUMBO_TAG_TITLE, GUMBO_TAG_LAST) &&
+      node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) {
+    return true;
+  }
+  if (!node_tag_is(node, GUMBO_TAG_ANNOTATION_XML)) {
+    return false;
+  }
+  const GumboVector* attributes = &node->v.element.attributes;
+  return attribute_matches(attributes, "encoding", "text/html") ||
+      attribute_matches(attributes, "encoding", "application/xhtml+xml");
+}
+
+// Appends a node to the end of its parent, setting the "parent" and
+// "index_within_parent" fields appropriately.  The node must be detached
+// (no parent, index -1) and the parent must be an element or the document.
+static void append_node(
+    GumboParser* parser, GumboNode* parent, GumboNode* node) {
+  assert(node->parent == NULL);
+  // This must be a comparison: with '=' the assert could never fail and would
+  // write to the node only in builds where asserts are compiled in.
+  assert(node->index_within_parent == -1);
+  GumboVector* children;
+  if (parent->type == GUMBO_NODE_ELEMENT) {
+    children = &parent->v.element.children;
+  } else {
+    assert(parent->type == GUMBO_NODE_DOCUMENT);
+    children = &parent->v.document.children;
+  }
+  node->parent = parent;
+  // The index is taken before the add, so it is the slot the node will occupy.
+  node->index_within_parent = children->length;
+  gumbo_vector_add(parser, (void*) node, children);
+  assert(node->index_within_parent < children->length);
+}
+
+// Inserts a node at the specified index within its parent, updating the
+// "parent" and "index_within_parent" fields of it and all its siblings.
+// The node must be detached (no parent, index -1), the parent must be an
+// element, and index must refer to an existing child.
+static void insert_node(
+    GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
+  assert(node->parent == NULL);
+  // This must be a comparison: with '=' the assert could never fail and would
+  // write to the node only in builds where asserts are compiled in.
+  assert(node->index_within_parent == -1);
+  assert(parent->type == GUMBO_NODE_ELEMENT);
+  GumboVector* children = &parent->v.element.children;
+  assert(index >= 0);
+  assert(index < children->length);
+  node->parent = parent;
+  node->index_within_parent = index;
+  gumbo_vector_insert_at(parser, (void*) node, index, children);
+  assert(node->index_within_parent < children->length);
+  // Every child after the insertion point has moved up by one slot.
+  for (int i = index + 1; i < children->length; ++i) {
+    GumboNode* sibling = children->data[i];
+    sibling->index_within_parent = i;
+    assert(sibling->index_within_parent < children->length);
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting
+static void foster_parent_element(GumboParser* parser, GumboNode* node) {  // Places node just before the nearest open <table>, inside that table's parent.
+  GumboVector* open_elements = &parser->_parser_state->_open_elements;
+  assert(open_elements->length > 2);
+
+  node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED;
+  GumboNode* foster_parent_element = open_elements->data[0];  // Default target: the first open element, asserted below to be <html>.
+  assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
+  assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML));
+  for (int i = open_elements->length; --i > 1; ) {  // Scans from the current node downwards; entries 0 and 1 are not examined.
+    GumboNode* table_element = open_elements->data[i];
+    if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
+      foster_parent_element = table_element->parent;
+      if (!foster_parent_element ||
+          foster_parent_element->type != GUMBO_NODE_ELEMENT) {
+        // Table has no element parent; spec says it's possible if a script manipulated
+        // the DOM, although I don't think we have to worry about this case.
+        gumbo_debug("Table has no parent.\n");
+        foster_parent_element = open_elements->data[i - 1];  // Fall back to the entry just below the table on the stack.
+        break;
+      }
+      assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
+      gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
+                 table_element, i, gumbo_normalized_tagname(
+                     foster_parent_element->v.element.tag),
+                 table_element->index_within_parent);  // NOTE(review): %x receives a pointer; if gumbo_debug is printf-style, %p would be the matching conversion.
+      assert(foster_parent_element->v.element.children.data[
+             table_element->index_within_parent] == table_element);
+      insert_node(parser, foster_parent_element,
+                  table_element->index_within_parent, node);  // Takes the table's slot, so node ends up immediately before the table.
+      return;
+    }
+  }
+  if (node->type == GUMBO_NODE_ELEMENT) {  // NOTE(review): insert_element also pushes node after calling this function; confirm this path cannot push it twice.
+    gumbo_vector_add(parser, (void*) node, open_elements);
+  }
+  append_node(parser, foster_parent_element, node);
+}
+
+static void maybe_flush_text_node_buffer(GumboParser* parser) {  // Turns buffered text, if any, into a text/whitespace node, attaches it, and resets the buffer.
+  GumboParserState* state = parser->_parser_state;
+  TextNodeBufferState* buffer_state = &state->_text_node;
+  if (buffer_state->_buffer.length == 0) {
+    return;  // Nothing pending.
+  }
+
+  assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
+         buffer_state->_type == GUMBO_NODE_TEXT);
+  GumboNode* text_node = create_node(parser, buffer_state->_type);
+  GumboText* text_node_data = &text_node->v.text;
+  text_node_data->text = gumbo_string_buffer_to_string(
+      parser, &buffer_state->_buffer);
+  text_node_data->original_text.data = buffer_state->_start_original_text;
+  text_node_data->original_text.length =
+      state->_current_token->original_text.data -
+      buffer_state->_start_original_text;  // Distance from the buffered text's start to the start of the current token's text.
+  text_node_data->start_pos = buffer_state->_start_position;
+  if (state->_foster_parent_insertions && node_tag_in(
+      get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
+      GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
+    foster_parent_element(parser, text_node);
+  } else {
+    append_node(
+        parser, parser->_output->root ?
+        get_current_node(parser) : parser->_output->document, text_node);  // With no root yet, the text attaches to the document node.
+  }
+  gumbo_debug("Flushing text node buffer of %.*s.\n",
+             (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
+
+  gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
+  gumbo_string_buffer_init(parser, &buffer_state->_buffer);
+  buffer_state->_type = GUMBO_NODE_WHITESPACE;  // Back to the default type used by parser_state_init.
+  assert(buffer_state->_buffer.length == 0);
+}
+
+// Stamps an element with where it ended: the current token's position, plus
+// the token's original text when that token is an end tag (the empty string
+// otherwise).
+static void record_end_of_element(
+    GumboToken* current_token, GumboElement* element) {
+  element->end_pos = current_token->position;
+  if (current_token->type == GUMBO_TOKEN_END_TAG) {
+    element->original_end_tag = current_token->original_text;
+  } else {
+    element->original_end_tag = kGumboEmptyString;
+  }
+}
+
+// Flushes any pending text, then pops and returns the current node from the
+// stack of open elements (NULL if the stack was empty).  Unless the node is a
+// <body>/<html> whose end tag was already seen, it is flagged as implicitly
+// closed when the current token is not its matching end tag, and its end
+// position is recorded from the current token.
+static GumboNode* pop_current_node(GumboParser* parser) {
+  GumboParserState* state = parser->_parser_state;
+  GumboVector* stack = &state->_open_elements;
+  maybe_flush_text_node_buffer(parser);
+  if (stack->length > 0) {
+    assert(node_tag_is(stack->data[0], GUMBO_TAG_HTML));
+    gumbo_debug(
+        "Popping %s node.\n",
+        gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
+  }
+  GumboNode* popped = gumbo_vector_pop(parser, stack);
+  if (popped == NULL) {
+    assert(stack->length == 0);
+    return NULL;
+  }
+  assert(popped->type == GUMBO_NODE_ELEMENT);
+
+  const bool already_closed =
+      (node_tag_is(popped, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
+      (node_tag_is(popped, GUMBO_TAG_HTML) && state->_closed_html_tag);
+  GumboToken* token = state->_current_token;
+  const bool closed_by_own_end_tag =
+      token->type == GUMBO_TOKEN_END_TAG &&
+      node_tag_is(popped, token->v.end_tag);
+  if (!already_closed) {
+    if (!closed_by_own_end_tag) {
+      popped->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
+    }
+    record_end_of_element(token, &popped->v.element);
+  }
+  return popped;
+}
+
+// Flushes pending text, then appends a comment node built from `token` (its
+// text, original text and position) as the last child of `node`.
+static void append_comment_node(
+    GumboParser* parser, GumboNode* node, const GumboToken* token) {
+  maybe_flush_text_node_buffer(parser);
+  // create_node already sets the node type and GUMBO_INSERTION_NORMAL flags.
+  GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
+  GumboText* payload = &comment->v.text;
+  payload->text = token->v.text;
+  payload->original_text = token->original_text;
+  payload->start_pos = token->position;
+  append_node(parser, node, comment);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
+// Pops open elements until the current node is a <tr> or <html>.
+static void clear_stack_to_table_row_context(GumboParser* parser) {
+  for (;;) {
+    if (node_tag_in(get_current_node(parser),
+                    GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
+      return;
+    }
+    pop_current_node(parser);
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
+// Pops open elements until the current node is a <table> or <html>.
+static void clear_stack_to_table_context(GumboParser* parser) {
+  for (;;) {
+    if (node_tag_in(get_current_node(parser),
+                    GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) {
+      return;
+    }
+    pop_current_node(parser);
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
+// Pops open elements until the current node is a <tbody>, <tfoot>, <thead>
+// or <html>.
+void clear_stack_to_table_body_context(GumboParser* parser) {
+  for (;;) {
+    if (node_tag_in(get_current_node(parser), GUMBO_TAG_HTML,
+                    GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
+                    GUMBO_TAG_LAST)) {
+      return;
+    }
+    pop_current_node(parser);
+  }
+}
+
+// Creates a detached element of the given tag in the HTML namespace and
+// returns it.  The element has no source text of its own: both original tag
+// strings are empty, it starts at the current token's position, and it has no
+// end position yet.
+static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
+  GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
+  GumboElement* el = &node->v.element;
+
+  el->tag = tag;
+  el->tag_namespace = GUMBO_NAMESPACE_HTML;
+  gumbo_vector_init(parser, 1, &el->children);
+  gumbo_vector_init(parser, 0, &el->attributes);
+
+  el->original_tag = kGumboEmptyString;
+  el->original_end_tag = kGumboEmptyString;
+  el->start_pos = parser->_parser_state->_current_token->position;
+  el->end_pos = kGumboEmptySourcePosition;
+  return node;
+}
+
+// Constructs a detached element from the given start tag token, in the given
+// namespace.  The token's attribute vector is moved into the element, leaving
+// the token with an empty vector.
+static GumboNode* create_element_from_token(
+    GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  GumboTokenStartTag* start_tag = &token->v.start_tag;
+
+  GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
+  GumboElement* el = &node->v.element;
+  gumbo_vector_init(parser, 1, &el->children);
+  el->attributes = start_tag->attributes;
+  el->tag = start_tag->tag;
+  el->tag_namespace = tag_namespace;
+
+  // The original text of a start tag is expected to look like "<...>".
+  const GumboStringPiece* source = &token->original_text;
+  assert(source->length >= 2);
+  assert(source->data[0] == '<');
+  assert(source->data[source->length - 1] == '>');
+  el->original_tag = *source;
+  el->start_pos = token->position;
+  el->original_end_tag = kGumboEmptyString;
+  el->end_pos = kGumboEmptySourcePosition;
+
+  // The element takes ownership of the attributes from the token, so any
+  // allocated-memory fields should be nulled out.
+  start_tag->attributes = kGumboEmptyVector;
+  return node;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
+// Attaches `node` to the tree (foster parenting it when required) and pushes
+// it onto the stack of open elements.
+static void insert_element(GumboParser* parser, GumboNode* node,
+                           bool is_reconstructing_formatting_elements) {
+  GumboParserState* state = parser->_parser_state;
+  // NOTE(jdtang): The text node buffer must always be flushed before inserting
+  // a node, otherwise we're handling nodes in a different order than the spec
+  // mandated.  However, one clause of the spec (character tokens in the body)
+  // requires that we reconstruct the active formatting elements *before* adding
+  // the character, and reconstructing the active formatting elements may itself
+  // result in the insertion of new elements (which should be pushed onto the
+  // stack of open elements before the buffer is flushed).  We solve this (for
+  // the time being, the spec has been rewritten for <template> and the new
+  // version may be simpler here) with a boolean flag to this method.
+  if (!is_reconstructing_formatting_elements) {
+    maybe_flush_text_node_buffer(parser);
+  }
+
+  const bool needs_foster_parent =
+      state->_foster_parent_insertions && node_tag_in(
+          get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
+          GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST);
+  if (needs_foster_parent) {
+    foster_parent_element(parser, node);
+  } else {
+    // This is also called to insert the root HTML element, at which point the
+    // stack of open elements is still empty, so the document node is used as
+    // the parent until a root exists.
+    GumboNode* parent = parser->_output->root ?
+        get_current_node(parser) : parser->_output->document;
+    append_node(parser, parent, node);
+  }
+  gumbo_vector_add(parser, (void*) node, &state->_open_elements);
+}
+
+// Convenience wrapper combining create_element_from_token and insert_element:
+// builds an HTML-namespace element from `token`, inserts it (flushing the
+// pending text node buffer first) and returns the inserted node.
+static GumboNode* insert_element_from_token(
+    GumboParser* parser, GumboToken* token) {
+  GumboNode* element =
+      create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
+  insert_element(parser, element, false);
+  // A pointer has to be formatted with %p and passed as void*; %x expects an
+  // unsigned int, which is a mismatched conversion for a pointer argument.
+  gumbo_debug("Inserting <%s> element (@%p) from token.\n",
+             gumbo_normalized_tagname(element->v.element.tag),
+             (void*) element);
+  return element;
+}
+
+// Convenience wrapper combining create_element and insert_element for
+// elements the parser generates itself.  The new node is flagged with
+// GUMBO_INSERTION_BY_PARSER plus the caller-supplied `reason`, inserted
+// (flushing the pending text node buffer first) and returned.
+static GumboNode* insert_element_of_tag_type(
+    GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
+  GumboNode* element = create_element(parser, tag);
+  element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
+  insert_element(parser, element, false);
+  // A pointer has to be formatted with %p and passed as void*; %x expects an
+  // unsigned int, which is a mismatched conversion for a pointer argument.
+  gumbo_debug("Inserting %s element (@%p) from tag type.\n",
+             gumbo_normalized_tagname(tag), (void*) element);
+  return element;
+}
+
+// Creates an element for the start-tag `token` in namespace `tag_namespace`,
+// inserts it at the current insertion point and returns it.  A parse error is
+// recorded when the element carries an "xmlns" attribute whose value differs
+// from kLegalXmlns[tag_namespace], or an "xmlns:xlink" attribute whose value
+// is not the XLink namespace URI.
+static GumboNode* insert_foreign_element(
+    GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  GumboNode* element = create_element_from_token(parser, token, tag_namespace);
+  insert_element(parser, element, false);
+
+  // create_element_from_token hands the attribute vector over to the element
+  // and leaves the token with an empty vector, so the namespace checks must
+  // inspect the element; run against the token they could never fire.
+  GumboVector* attributes = &element->v.element.attributes;
+  if (gumbo_get_attribute(attributes, "xmlns") &&
+      !attribute_matches_case_sensitive(
+          attributes, "xmlns", kLegalXmlns[tag_namespace])) {
+    // TODO(jdtang): Since there're multiple possible error codes here, we
+    // eventually need reason codes to differentiate them.
+    add_parse_error(parser, token);
+  }
+  // NOTE(review): if adjust_foreign_attributes has already renamed this
+  // attribute to its local name, the lookup key below needs to follow the
+  // kForeignAttributeReplacements table -- confirm against that table.
+  if (gumbo_get_attribute(attributes, "xmlns:xlink") &&
+      !attribute_matches_case_sensitive(
+          attributes, "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
+    add_parse_error(parser, token);
+  }
+  return element;
+}
+
+// Appends the code point carried by a whitespace or character token to the
+// parser's pending text node buffer.  When the buffer is empty, the token's
+// original-text pointer and source position are recorded as the start of the
+// run.  A GUMBO_TOKEN_CHARACTER token sets the buffer's node type to
+// GUMBO_NODE_TEXT; a whitespace token leaves the type untouched.
+static void insert_text_token(GumboParser* parser, GumboToken* token) {
+  assert(token->type == GUMBO_TOKEN_WHITESPACE ||
+         token->type == GUMBO_TOKEN_CHARACTER);
+  TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
+
+  const bool starts_new_run = text_state->_buffer.length == 0;
+  if (starts_new_run) {
+    text_state->_start_original_text = token->original_text.data;
+    text_state->_start_position = token->position;
+  }
+
+  gumbo_string_buffer_append_codepoint(
+      parser, token->v.character, &text_state->_buffer);
+
+  if (token->type == GUMBO_TOKEN_CHARACTER) {
+    text_state->_type = GUMBO_NODE_TEXT;
+  }
+  gumbo_debug("Inserting text token '%c'.\n", token->v.character);
+}
+
+// Generic raw text / RCDATA element parsing algorithm:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
+// Inserts an element for `token`, switches the tokenizer to `lexer_state`,
+// saves the current insertion mode as the original insertion mode and then
+// switches the parser to GUMBO_INSERTION_MODE_TEXT.
+static void run_generic_parsing_algorithm(
+    GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
+  insert_element_from_token(parser, token);
+  gumbo_tokenizer_set_state(parser, lexer_state);
+
+  GumboParserState* state = parser->_parser_state;
+  state->_original_insertion_mode = state->_insertion_mode;
+  state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
+}
+
+// Marks the self-closing flag as acknowledged in the parser state.
+static void acknowledge_self_closing_tag(GumboParser* parser) {
+  GumboParserState* state = parser->_parser_state;
+  state->_self_closing_flag_acknowledged = true;
+}
+
+// Searches the list of active formatting elements, newest entry first, for an
+// <a> element.  The search stops at the first scope marker.  On success the
+// entry's index is stored in *anchor_index and true is returned; otherwise
+// *anchor_index is left untouched and false is returned.
+static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
+  GumboVector* formatting = &parser->_parser_state->_active_formatting_elements;
+  for (int pos = (int) formatting->length - 1; pos >= 0; --pos) {
+    GumboNode* candidate = formatting->data[pos];
+    if (candidate == &kActiveFormattingScopeMarker) {
+      break;
+    }
+    if (node_tag_is(candidate, GUMBO_TAG_A)) {
+      *anchor_index = pos;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Counts the entries of the active formatting list, after the last scope
+// marker, that match `desired_node` in tag, tag namespace and attributes.
+// Entries are visited newest first, so whenever the result is > 0,
+// *earliest_matching_index ends up holding the index of the oldest match;
+// with no match it is left unchanged.
+static int count_formatting_elements_of_tag(
+    GumboParser* parser, const GumboNode* desired_node,
+    int* earliest_matching_index) {
+  const GumboElement* wanted = &desired_node->v.element;
+  GumboVector* formatting = &parser->_parser_state->_active_formatting_elements;
+  int match_count = 0;
+  for (int pos = (int) formatting->length - 1; pos >= 0; --pos) {
+    GumboNode* candidate = formatting->data[pos];
+    if (candidate == &kActiveFormattingScopeMarker) {
+      break;
+    }
+    assert(candidate->type == GUMBO_NODE_ELEMENT);
+    GumboElement* candidate_element = &candidate->v.element;
+    if (!node_tag_is(candidate, wanted->tag)) {
+      continue;
+    }
+    if (candidate_element->tag_namespace != wanted->tag_namespace) {
+      continue;
+    }
+    if (!all_attributes_match(&candidate_element->attributes,
+                              &wanted->attributes)) {
+      continue;
+    }
+    ++match_count;
+    *earliest_matching_index = pos;
+  }
+  return match_count;
+}
+
+// Pushes `node` (an element or the scope marker) onto the list of active
+// formatting elements.  Before pushing, the "Noah's Ark" clause is applied:
+// if the list already holds at least three entries identical to `node` after
+// the last scope marker, the earliest of them is removed.
+static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
+  assert(node == &kActiveFormattingScopeMarker ||
+         node->type == GUMBO_NODE_ELEMENT);
+  GumboVector* formatting = &parser->_parser_state->_active_formatting_elements;
+  if (node != &kActiveFormattingScopeMarker) {
+    gumbo_debug("Adding a formatting element.\n");
+  } else {
+    gumbo_debug("Adding a scope marker.\n");
+  }
+
+  // Look for entries identical to the one being added.
+  int earliest_match = formatting->length;
+  const int match_count =
+      count_formatting_elements_of_tag(parser, node, &earliest_match);
+
+  if (match_count >= 3) {
+    gumbo_debug("Noah's ark clause: removing element at %d.\n",
+                earliest_match);
+    gumbo_vector_remove_at(parser, earliest_match, formatting);
+  }
+
+  gumbo_vector_add(parser, (void*) node, formatting);
+}
+
+// Returns true when `node` (compared by pointer identity) is currently on the
+// stack of open elements.
+static bool is_open_element(GumboParser* parser, const GumboNode* node) {
+  const GumboVector* stack = &parser->_parser_state->_open_elements;
+  bool found = false;
+  for (unsigned int i = 0; i < stack->length && !found; ++i) {
+    found = (stack->data[i] == node);
+  }
+  return found;
+}
+
+// Produces a detached copy of an element node.  The node struct is copied by
+// value, then the copy gets no parent, index -1, a fresh empty child vector
+// and a fresh attribute vector in which every attribute is newly allocated
+// with its own copies of the name and value strings.  The content (children)
+// of the original is not copied.  GUMBO_INSERTION_IMPLICIT_END_TAG is cleared
+// on the copy, since the clone may have a separate end tag, and `reason` plus
+// GUMBO_INSERTION_BY_PARSER are set.
+GumboNode* clone_node(
+    GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
+  assert(node->type == GUMBO_NODE_ELEMENT);
+  GumboNode* copy = gumbo_parser_allocate(parser, sizeof(GumboNode));
+  *copy = *node;
+  copy->parent = NULL;
+  copy->index_within_parent = -1;
+  copy->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
+  copy->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
+
+  GumboElement* copy_element = &copy->v.element;
+  gumbo_vector_init(parser, 1, &copy_element->children);
+
+  const GumboVector* source_attrs = &node->v.element.attributes;
+  GumboVector* copy_attrs = &copy_element->attributes;
+  gumbo_vector_init(parser, source_attrs->length, copy_attrs);
+  for (unsigned int i = 0; i < source_attrs->length; ++i) {
+    const GumboAttribute* source_attr = source_attrs->data[i];
+    GumboAttribute* attr_copy =
+        gumbo_parser_allocate(parser, sizeof(GumboAttribute));
+    *attr_copy = *source_attr;
+    attr_copy->name = gumbo_copy_stringz(parser, source_attr->name);
+    attr_copy->value = gumbo_copy_stringz(parser, source_attr->value);
+    gumbo_vector_add(parser, attr_copy, copy_attrs);
+  }
+  return copy;
+}
+
+// Implements the spec's "reconstruct the active formatting elements" algorithm; the step numbers
+// in the comments below refer to that algorithm.  The control flow follows html5lib's translation
+// of the spec's GOTO-based wording into structured code:
+// http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
+static void reconstruct_active_formatting_elements(GumboParser* parser) {
+  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
+  // Step 1: an empty list means there is nothing to reconstruct.
+  if (elements->length == 0) {
+    return;
+  }
+
+  // Steps 2 & 3: also nothing to do if the last entry is a scope marker or is still open.
+  int i = elements->length - 1;
+  const GumboNode* element = elements->data[i];
+  if (element == &kActiveFormattingScopeMarker ||
+      is_open_element(parser, element)) {
+    return;
+  }
+
+  // Step 6: rewind until the entry at i is a scope marker or an element that is still open.
+  do {
+    if (i == 0) {
+      // Step 4: reached the start of the list, so every entry has to be reconstructed.
+      i = -1;   // Incremented to 0 below.
+      break;
+    }
+    // Step 5: step back to the previous entry.
+    element = elements->data[--i];
+  } while (element != &kActiveFormattingScopeMarker &&
+           !is_open_element(parser, element));
+
+  ++i;  // Index of the first entry that has to be reconstructed.
+  gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
+              gumbo_normalized_tagname(
+                  get_current_node(parser)->v.element.tag));
+  for(; i < elements->length; ++i) {
+    // Steps 7 & 8: take the next entry and create a fresh clone of it.
+    assert(elements->length > 0);
+    assert(i < elements->length);
+    element = elements->data[i];
+    assert(element != &kActiveFormattingScopeMarker);
+    GumboNode* clone = clone_node(
+        parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
+    // Step 9: insert the clone; passing true keeps the pending text node buffer unflushed.
+    insert_element(parser, clone, true);
+    // Step 10: the clone replaces the original entry in the list.
+    elements->data[i] = clone;
+    gumbo_debug("Reconstructed %s element at %d.\n",
+               gumbo_normalized_tagname(clone->v.element.tag), i);
+  }
+}
+
+// Pops entries off the list of active formatting elements until a scope
+// marker has been popped or gumbo_vector_pop returns NULL.  The count that is
+// logged includes that final pop.
+static void clear_active_formatting_elements(GumboParser* parser) {
+  GumboVector* formatting = &parser->_parser_state->_active_formatting_elements;
+  int popped_count = 0;
+  for (;;) {
+    const GumboNode* popped = gumbo_vector_pop(parser, formatting);
+    ++popped_count;
+    if (!popped || popped == &kActiveFormattingScopeMarker) {
+      break;
+    }
+  }
+  gumbo_debug("Cleared %d elements from active formatting list.\n",
+              popped_count);
+}
+
+// Derives the document's quirks mode from its DOCTYPE token, following
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
+// The checks run in order and the first one that applies decides:
+//   - full quirks: force_quirks set, name other than kDoctypeHtml, or a public
+//     / system identifier found in one of the quirks-mode lists;
+//   - limited quirks: public identifier found in one of the limited-quirks
+//     lists;
+//   - otherwise no quirks.
+// Public identifiers in kLimitedQuirksRequiresSystemIdPublicIdPrefixes select
+// full quirks without a system identifier and limited quirks with one.
+static GumboQuirksModeEnum compute_quirks_mode(
+    const GumboTokenDocType* doctype) {
+  const char* public_id = doctype->public_identifier;
+  const char* system_id = doctype->system_identifier;
+
+  if (doctype->force_quirks) {
+    return GUMBO_DOCTYPE_QUIRKS;
+  }
+  if (strcmp(doctype->name, kDoctypeHtml.data)) {
+    return GUMBO_DOCTYPE_QUIRKS;
+  }
+  if (is_in_static_list(public_id, kQuirksModePublicIdPrefixes, false)) {
+    return GUMBO_DOCTYPE_QUIRKS;
+  }
+  if (is_in_static_list(public_id, kQuirksModePublicIdExactMatches, true)) {
+    return GUMBO_DOCTYPE_QUIRKS;
+  }
+  if (is_in_static_list(system_id, kQuirksModeSystemIdExactMatches, true)) {
+    return GUMBO_DOCTYPE_QUIRKS;
+  }
+  if (is_in_static_list(public_id,
+                        kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
+      && !doctype->has_system_identifier) {
+    return GUMBO_DOCTYPE_QUIRKS;
+  }
+
+  if (is_in_static_list(public_id, kLimitedQuirksPublicIdPrefixes, false)) {
+    return GUMBO_DOCTYPE_LIMITED_QUIRKS;
+  }
+  if (is_in_static_list(public_id,
+                        kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
+      && doctype->has_system_identifier) {
+    return GUMBO_DOCTYPE_LIMITED_QUIRKS;
+  }
+  return GUMBO_DOCTYPE_NO_QUIRKS;
+}
+
+// Shared worker for the "has an element in __ scope" checks of the HTML5 spec (e.g. list scope
+// looks for a tag within the nearest <ol>/<ul> or one of the generic "firewall" elements):
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
+// Walks the stack of open elements from the most recently opened node downward.  Returns true as
+// soon as a node's tag equals one of the tags in `expected`; returns false as soon as a node
+// ends the scope, or when the whole stack has been visited without a match.
+// `expected`: vector whose data slots hold GumboTag values cast to void* (not pointers).
+// `negate`: false = the varargs tags end the scope; true = every tag NOT among them ends it.
+// `...`: the GumboTag values of the scope set; the list must be terminated by GUMBO_TAG_LAST.
+static bool has_an_element_in_specific_scope(
+    GumboParser* parser, GumboVector* /* GumboTag */ expected, bool negate, ...) {
+  GumboVector* open_elements = &parser->_parser_state->_open_elements;
+  va_list args;
+  va_start(args, negate);  // NOTE(review): ISO C leaves va_start on a promoted type (bool) undefined - confirm.
+  // va_arg can only run through the list once, so the tags are copied into a GumboVector
+  // here.  I wonder if it'd make more sense to make tags the GumboVector*
+  // parameter and 'expected' a vararg list, but that'd require changing a lot
+  // of code for unknown benefit.  We may want to change the representation of
+  // these tag sets anyway, to something more efficient.
+  GumboVector tags;
+  gumbo_vector_init(parser, 10, &tags);
+  for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
+       tag = va_arg(args, GumboTag)) {
+    // The tag values themselves are stored in the void* slots; nothing is pointed to.
+    gumbo_vector_add(parser, (void*) tag, &tags);
+  }
+  va_end(args);
+
+  bool result = false;
+  for (int i = open_elements->length; --i >= 0; ) {  // Top of the stack first.
+    const GumboNode* node = open_elements->data[i];
+    if (node->type != GUMBO_NODE_ELEMENT) {
+      continue;
+    }
+    GumboTag node_tag = node->v.element.tag;
+    for (int j = 0; j < expected->length; ++j) {
+      GumboTag expected_tag = (GumboTag) expected->data[j];
+      if (node_tag == expected_tag) {  // Target found before any scope boundary.
+        result = true;
+        goto cleanup;
+      }
+    }
+
+    bool found_tag = false;  // Whether node_tag is one of the varargs tags.
+    for (int j = 0; j < tags.length; ++j) {
+      GumboTag tag = (GumboTag) tags.data[j];
+      if (tag == node_tag) {
+        found_tag = true;
+        break;
+      }
+    }
+    if (negate != found_tag) {  // This node ends the scope: the target is not in scope.
+      result = false;
+      goto cleanup;
+    }
+  }
+cleanup:
+  gumbo_vector_destroy(parser, &tags);  // Every exit path releases the temporary tag vector.
+  return result;
+}
+
+// Declares, on the stack, a GumboVector named `varname` holding exactly one element: the value of
+// `from_var` cast to void*.  Nearly all of the helper functions that follow use it to wrap a
+// single GumboTag for has_an_element_in_specific_scope.  The backing array is declared as
+// `varname`_tmp_array, so both names must be unused in the enclosing scope.  void* slots and
+// casts (instead of GumboTag) keep the alignment requirements the same as a normal GumboVector,
+// so the data can be accessed as if it were one.  The caller supplies the final ';'.
+#define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
+    void* varname ## _tmp_array[1] = { (void*) from_var }; \
+    GumboVector varname = { varname ## _tmp_array, 1, 1 }
+
+// "Has an element in scope" check for a single tag:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
+// Returns true when an open element with tag `tag` is found before any of the
+// scope-terminating elements listed below.
+static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
+  DECLARE_ONE_ELEMENT_GUMBO_VECTOR(target, tag);
+  const bool in_scope = has_an_element_in_specific_scope(
+      parser, &target, false,
+      GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML, GUMBO_TAG_TABLE,
+      GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE, GUMBO_TAG_OBJECT,
+      GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
+      GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
+      GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
+  return in_scope;
+}
+
+// Variant of "has an element in scope" that looks for one specific node
+// (pointer identity) rather than any node with a given tag.  The walk over
+// the stack of open elements is duplicated here instead of being shared with
+// has_an_element_in_specific_scope because the predicate differs, and
+// duplicating this one case is easier and faster than parameterizing it.
+// Returns true if `node` is reached before a scope-terminating element,
+// false otherwise.
+static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
+  GumboVector* stack = &parser->_parser_state->_open_elements;
+  for (int depth = (int) stack->length - 1; depth >= 0; --depth) {
+    const GumboNode* candidate = stack->data[depth];
+    if (candidate == node) {
+      return true;
+    }
+    if (candidate->type == GUMBO_NODE_ELEMENT &&
+        node_tag_in(
+            candidate, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
+            GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
+            GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
+            GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML,
+            GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
+            GUMBO_TAG_LAST)) {
+      return false;
+    }
+  }
+  // The walk is expected to end at the target or at a scope boundary.
+  assert(false);
+  return false;
+}
+
+// Like has_an_element_in_scope, but accepts several candidate tags instead of
+// a single one.  The variadic arguments are GumboTag values and must be
+// terminated by GUMBO_TAG_LAST.  Returns true when an element with any of the
+// given tags is in scope.
+static bool has_an_element_in_scope_with_tagname(GumboParser* parser, ...) {
+  GumboVector wanted;
+  // Initial capacity of 6: the main use is the heading tags, and there are
+  // six of those.
+  gumbo_vector_init(parser, 6, &wanted);
+
+  va_list args;
+  va_start(args, parser);
+  GumboTag tag;
+  while ((tag = va_arg(args, GumboTag)) != GUMBO_TAG_LAST) {
+    gumbo_vector_add(parser, (void*) tag, &wanted);
+  }
+  va_end(args);
+
+  const bool found = has_an_element_in_specific_scope(
+      parser, &wanted, false,
+      GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML, GUMBO_TAG_TABLE,
+      GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE, GUMBO_TAG_OBJECT,
+      GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
+      GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
+      GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
+  gumbo_vector_destroy(parser, &wanted);
+  return found;
+}
+
+// "Has an element in list item scope" check:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
+// Uses the same scope-terminating elements as has_an_element_in_scope, plus
+// <ol> and <ul>.
+static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
+  DECLARE_ONE_ELEMENT_GUMBO_VECTOR(target, tag);
+  const bool in_scope = has_an_element_in_specific_scope(
+      parser, &target, false,
+      GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML, GUMBO_TAG_TABLE,
+      GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE, GUMBO_TAG_OBJECT,
+      GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
+      GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
+      GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_OL, GUMBO_TAG_UL,
+      GUMBO_TAG_LAST);
+  return in_scope;
+}
+
+// "Has an element in button scope" check:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
+// Uses the same scope-terminating elements as has_an_element_in_scope, plus
+// <button>.
+static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
+  DECLARE_ONE_ELEMENT_GUMBO_VECTOR(target, tag);
+  const bool in_scope = has_an_element_in_specific_scope(
+      parser, &target, false,
+      GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML, GUMBO_TAG_TABLE,
+      GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE, GUMBO_TAG_OBJECT,
+      GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
+      GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
+      GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_BUTTON, GUMBO_TAG_LAST);
+  return in_scope;
+}
+
+// "Has an element in table scope" check:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
+// Only <html> and <table> terminate this scope.
+static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
+  DECLARE_ONE_ELEMENT_GUMBO_VECTOR(target, tag);
+  const bool in_scope = has_an_element_in_specific_scope(
+      parser, &target, false, GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST);
+  return in_scope;
+}
+
+// "Has an element in select scope" check:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
+// The tag set is negated here: every element other than <optgroup> and
+// <option> terminates the scope.
+static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
+  DECLARE_ONE_ELEMENT_GUMBO_VECTOR(target, tag);
+  const bool in_scope = has_an_element_in_specific_scope(
+      parser, &target, true, GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION,
+      GUMBO_TAG_LAST);
+  return in_scope;
+}
+
+
+// "Generate implied end tags":
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
+// Pops the current node for as long as it is one of dd, dt, li, option,
+// optgroup, p, rp or rt and does not have the tag `exception` (the spec's
+// "element to exclude from the process").  Pass GUMBO_TAG_LAST to exclude
+// nothing.
+static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
+  while (node_tag_in(get_current_node(parser), GUMBO_TAG_DD, GUMBO_TAG_DT,
+                     GUMBO_TAG_LI, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
+                     GUMBO_TAG_P, GUMBO_TAG_RP, GUMBO_TAG_RT, GUMBO_TAG_LAST) &&
+         !node_tag_is(get_current_node(parser), exception)) {
+    pop_current_node(parser);
+  }
+}
+
+// Factors out the "act as if an end tag token with tag name 'table' had been
+// seen" clauses.  If no <table> is in table scope, nothing is changed and
+// false is returned so the caller can ignore the token.  Otherwise nodes are
+// popped up to and including the <table>, the insertion mode is reset
+// appropriately and true is returned.  No parse error is added here; callers
+// handle that.
+static bool close_table(GumboParser* parser) {
+  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
+    return false;
+  }
+
+  GumboNode* popped;
+  do {
+    popped = pop_current_node(parser);
+  } while (!node_tag_is(popped, GUMBO_TAG_TABLE));
+
+  reset_insertion_mode_appropriately(parser);
+  return true;
+}
+
+// Factors out the "act as if an end tag of tag name `cell_tag` had been seen"
+// clauses.  Generates implied end tags, pops nodes up to and including the
+// cell, clears the active formatting elements back to the last marker and
+// switches to the "in row" insertion mode.  Returns false (after recording a
+// parse error) when the current node was not the cell once the implied end
+// tags had been generated, true otherwise.
+static bool close_table_cell(GumboParser* parser, const GumboToken* token,
+                             GumboTag cell_tag) {
+  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
+
+  const bool cell_is_current =
+      node_tag_is(get_current_node(parser), cell_tag);
+  if (!cell_is_current) {
+    add_parse_error(parser, token);
+  }
+
+  const GumboNode* popped = pop_current_node(parser);
+  while (!node_tag_is(popped, cell_tag)) {
+    popped = pop_current_node(parser);
+  }
+
+  clear_active_formatting_elements(parser);
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
+  return cell_is_current;
+}
+
+// "Close the cell":
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
+// Decides whether the open cell is a <td> or a <th> and closes it via
+// close_table_cell, returning that function's result.
+static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
+  const bool td_in_scope = has_an_element_in_table_scope(parser, GUMBO_TAG_TD);
+  // Exactly one of <td> / <th> is expected to be in table scope here.
+  assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH) != td_in_scope);
+  return close_table_cell(
+      parser, token, td_in_scope ? GUMBO_TAG_TD : GUMBO_TAG_TH);
+}
+
+// Factors out the "act as if an end tag of tag name 'select' had been seen"
+// clause, which the spec references in several places: pops nodes off the
+// stack up to and including the current <select>, then resets the insertion
+// mode appropriately.
+static void close_current_select(GumboParser* parser) {
+  GumboNode* popped;
+  do {
+    popped = pop_current_node(parser);
+  } while (!node_tag_is(popped, GUMBO_TAG_SELECT));
+  reset_insertion_mode_appropriately(parser);
+}
+
+// Tells whether an element node belongs to the spec's "special" category:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
+// The set of special tags depends on the element's namespace (HTML, MathML
+// or SVG).  Any other namespace value aborts the process.
+static bool is_special_node(const GumboNode* node) {
+  assert(node->type == GUMBO_NODE_ELEMENT);
+  const GumboNamespaceEnum tag_namespace = node->v.element.tag_namespace;
+
+  if (tag_namespace == GUMBO_NAMESPACE_HTML) {
+    return node_tag_in(node,
+         GUMBO_TAG_ADDRESS, GUMBO_TAG_APPLET, GUMBO_TAG_AREA,
+         GUMBO_TAG_ARTICLE, GUMBO_TAG_ASIDE, GUMBO_TAG_BASE,
+         GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
+         GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
+         GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
+         GUMBO_TAG_MENUITEM, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
+         GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
+         GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
+         GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
+         GUMBO_TAG_FRAMESET, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
+         GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD,
+         GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_HR, GUMBO_TAG_HTML,
+         GUMBO_TAG_IFRAME, GUMBO_TAG_IMG, GUMBO_TAG_INPUT, GUMBO_TAG_ISINDEX,
+         GUMBO_TAG_LI, GUMBO_TAG_LINK, GUMBO_TAG_LISTING, GUMBO_TAG_MARQUEE,
+         GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NAV, GUMBO_TAG_NOEMBED,
+         GUMBO_TAG_NOFRAMES, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_OBJECT,
+         GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PLAINTEXT,
+         GUMBO_TAG_PRE, GUMBO_TAG_SCRIPT, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT,
+         GUMBO_TAG_STYLE, GUMBO_TAG_SUMMARY, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
+         GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
+         GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
+         GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
+  }
+  if (tag_namespace == GUMBO_NAMESPACE_MATHML) {
+    return node_tag_in(node,
+        GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
+        GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
+  }
+  if (tag_namespace == GUMBO_NAMESPACE_SVG) {
+    return node_tag_in(node,
+        GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
+  }
+  abort();
+  return false;  // Pacify compiler.
+}
+
+// Closes open elements until an element with tag `target` has been popped.
+// Implied end tags are generated first (excluding `target` itself).  If that
+// leaves `target` as the current node, this is normal operation and true is
+// returned.  Otherwise a parse error is recorded, the intervening nodes are
+// popped as well, and false is returned.
+static bool implicitly_close_tags(
+    GumboParser* parser, GumboToken* token, GumboTag target) {
+  generate_implied_end_tags(parser, target);
+
+  const bool closed_cleanly = node_tag_is(get_current_node(parser), target);
+  if (!closed_cleanly) {
+    add_parse_error(parser, token);
+    while (!node_tag_is(get_current_node(parser), target)) {
+      pop_current_node(parser);
+    }
+  }
+
+  assert(node_tag_is(get_current_node(parser), target));
+  pop_current_node(parser);
+  return closed_cleanly;
+}
+
+// Acts as if a </p> end tag had been seen, but only when a <p> element is in
+// button scope; otherwise does nothing.  Returns false if closing the tags
+// recorded a parse error, true in every other case.  Exists because this
+// clause appears several times in the spec.
+static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
+  if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
+    return true;
+  }
+  return implicitly_close_tags(parser, token, GUMBO_TAG_P);
+}
+
+// Encapsulates the logic for implicitly closing an open <li> (is_li == true)
+// or an open <dd>/<dt> (is_li == false).  Always clears the frameset-ok flag.
+// The stack of open elements is walked from the top: the first matching list
+// element is closed via implicitly_close_tags; the walk gives up without
+// closing anything at the first "special" element other than <address>,
+// <div> or <p>.
+static void maybe_implicitly_close_list_tag(
+    GumboParser* parser, GumboToken* token, bool is_li) {
+  GumboParserState* state = parser->_parser_state;
+  state->_frameset_ok = false;
+
+  const GumboVector* stack = &state->_open_elements;
+  for (int depth = (int) stack->length - 1; depth >= 0; --depth) {
+    const GumboNode* candidate = stack->data[depth];
+
+    bool closes_here;
+    if (is_li) {
+      closes_here = node_tag_is(candidate, GUMBO_TAG_LI);
+    } else {
+      closes_here =
+          node_tag_in(candidate, GUMBO_TAG_DD, GUMBO_TAG_DT, GUMBO_TAG_LAST);
+    }
+    if (closes_here) {
+      implicitly_close_tags(parser, token, candidate->v.element.tag);
+      return;
+    }
+
+    const bool stops_search =
+        is_special_node(candidate) &&
+        !node_tag_in(candidate, GUMBO_TAG_ADDRESS, GUMBO_TAG_DIV, GUMBO_TAG_P,
+                     GUMBO_TAG_LAST);
+    if (stops_search) {
+      return;
+    }
+  }
+}
+
+// Merges the attributes of a start-tag token into an existing element node:
+// every token attribute whose name the node does not already have is moved to
+// the node.  The token is destroyed afterwards, because merging means the
+// token has been ignored and folded into another element.
+static void merge_attributes(
+    GumboParser* parser, GumboToken* token, GumboNode* node) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  assert(node->type == GUMBO_NODE_ELEMENT);
+  const GumboVector* incoming = &token->v.start_tag.attributes;
+  GumboVector* existing = &node->v.element.attributes;
+
+  for (unsigned int i = 0; i < incoming->length; ++i) {
+    GumboAttribute* candidate = incoming->data[i];
+    if (gumbo_get_attribute(existing, candidate->name)) {
+      continue;
+    }
+    // The node now owns this attribute; its slot in the token is nulled out
+    // so that destroying the token below does not delete it a second time.
+    gumbo_vector_add(parser, candidate, existing);
+    incoming->data[i] = NULL;
+  }
+  gumbo_token_destroy(parser, token);
+
+#ifndef NDEBUG
+  // Mark this sentinel so the assertion in the main loop knows it's been
+  // destroyed.
+  token->v.start_tag.attributes = kGumboEmptyVector;
+#endif
+}
+
+// Looks `tag` up in kSvgTagReplacements, comparing against each entry's
+// `from` string with gumbo_string_equals_ignore_case.  Returns the `to`
+// string of the first matching entry, or NULL when no entry matches.
+const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
+  const size_t entry_count =
+      sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry);
+  for (size_t i = 0; i < entry_count; ++i) {
+    const ReplacementEntry* candidate = &kSvgTagReplacements[i];
+    if (gumbo_string_equals_ignore_case(tag, &candidate->from)) {
+      return candidate->to.data;
+    }
+  }
+  return NULL;
+}
+
+// "Adjust foreign attributes":
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
+// For every entry of kForeignAttributeReplacements whose `from` name is
+// present on the token, the attribute is modified in place: its namespace is
+// set to the entry's namespace and its name is replaced by a fresh copy of
+// the entry's local name (the old name string is deallocated).
+static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  const GumboVector* attributes = &token->v.start_tag.attributes;
+  const size_t replacement_count =
+      sizeof(kForeignAttributeReplacements) /
+      sizeof(NamespacedAttributeReplacement);
+  for (size_t i = 0; i < replacement_count; ++i) {
+    const NamespacedAttributeReplacement* replacement =
+        &kForeignAttributeReplacements[i];
+    GumboAttribute* attr = gumbo_get_attribute(attributes, replacement->from);
+    if (attr) {
+      gumbo_parser_deallocate(parser, (void*) attr->name);
+      attr->attr_namespace = replacement->attr_namespace;
+      attr->name = gumbo_copy_stringz(parser, replacement->local_name);
+    }
+  }
+}
+
+// "Adjust SVG attributes":
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
+// For every entry of kSvgAttributeReplacements whose `from` name is present
+// on the token, the attribute's name is replaced in place by a fresh copy of
+// the entry's `to` string (the old name string is deallocated).
+static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  const GumboVector* attributes = &token->v.start_tag.attributes;
+  const size_t replacement_count =
+      sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry);
+  for (size_t i = 0; i < replacement_count; ++i) {
+    const ReplacementEntry* replacement = &kSvgAttributeReplacements[i];
+    GumboAttribute* attr =
+        gumbo_get_attribute(attributes, replacement->from.data);
+    if (attr) {
+      gumbo_parser_deallocate(parser, (void*) attr->name);
+      attr->name = gumbo_copy_stringz(parser, replacement->to.data);
+    }
+  }
+}
+
+// "Adjust MathML attributes":
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
+// If the token has a "definitionurl" attribute, its name is replaced in place
+// by a fresh copy of "definitionURL" (the old name string is deallocated).
+// Tokens without that attribute are left untouched.
+static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
+  assert(token->type == GUMBO_TOKEN_START_TAG);
+  GumboAttribute* attr = gumbo_get_attribute(
+      &token->v.start_tag.attributes, "definitionurl");
+  if (attr) {
+    gumbo_parser_deallocate(parser, (void*) attr->name);
+    attr->name = gumbo_copy_stringz(parser, "definitionURL");
+  }
+}
+
+// Returns true when `doctype` carries the given legacy identifiers: its
+// public identifier must equal public_id->data (case-sensitive), and its
+// system identifier must equal system_id->data (case-sensitive).  When
+// allow_missing_system_id is true, a DOCTYPE without any system identifier
+// is accepted as well.
+static bool doctype_matches(
+    const GumboTokenDocType* doctype,
+    const GumboStringPiece* public_id,
+    const GumboStringPiece* system_id,
+    bool allow_missing_system_id) {
+  if (strcmp(doctype->public_identifier, public_id->data) != 0) {
+    return false;
+  }
+  if (!doctype->has_system_identifier) {
+    // Previously a missing system identifier was still compared against the
+    // expected string, so allow_missing_system_id could never take effect.
+    return allow_missing_system_id;
+  }
+  return strcmp(doctype->system_identifier, system_id->data) == 0;
+}
+
+// Decides whether a DOCTYPE token seen in the "initial" insertion mode is a
+// parse error, records the error if so, and returns true when no error was
+// added.
+//
+// Following the HTML tree-construction rules, the token is an error only when
+// BOTH of the following hold:
+//   1. it is "suspicious": the name is not "html", or a public identifier is
+//      present, or a system identifier is present and differs from
+//      kSystemIdLegacyCompat; and
+//   2. it is not one of the explicitly permitted legacy HTML 4 / XHTML 1
+//      doctypes.
+// The earlier code OR-ed these two parts together and tested the system
+// identifier for equality with the legacy-compat string instead of
+// inequality, which made even a plain `<!DOCTYPE html>` count as an error.
+static bool maybe_add_doctype_error(
+    GumboParser* parser, const GumboToken* token) {
+  const GumboTokenDocType* doctype = &token->v.doc_type;
+  const bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
+
+  const bool suspicious =
+      !html_doctype ||
+      doctype->has_public_identifier ||
+      (doctype->has_system_identifier && strcmp(
+          doctype->system_identifier, kSystemIdLegacyCompat.data) != 0);
+  if (!suspicious) {
+    return true;
+  }
+
+  const bool permitted_legacy = html_doctype && (
+      doctype_matches(doctype, &kPublicIdHtml4_0,
+                      &kSystemIdRecHtml4_0, true) ||
+      doctype_matches(doctype, &kPublicIdHtml4_01, &kSystemIdHtml4, true) ||
+      doctype_matches(doctype, &kPublicIdXhtml1_0,
+                      &kSystemIdXhtmlStrict1_1, false) ||
+      doctype_matches(doctype, &kPublicIdXhtml1_1,
+                      &kSystemIdXhtml1_1, false));
+  if (permitted_legacy) {
+    return true;
+  }
+
+  add_parse_error(parser, token);
+  return false;
+}
+
+// Detaches `node` from its parent element's child list, clears the node's
+// parent link and index, and renumbers the siblings that followed it.
+// A node without a parent is left alone.
+static void remove_from_parent(GumboParser* parser, GumboNode* node) {
+  GumboNode* old_parent = node->parent;
+  if (old_parent == NULL) {
+    // Parentless nodes do occur: e.g. a freshly cloned active formatting
+    // element stays orphaned until the adoption agency algorithm appends or
+    // foster-parents it under the common ancestor.
+    return;
+  }
+  assert(old_parent->type == GUMBO_NODE_ELEMENT);
+  GumboVector* siblings = &old_parent->v.element.children;
+  int removed_at = gumbo_vector_index_of(siblings, node);
+  assert(removed_at != -1);
+
+  gumbo_vector_remove_at(parser, removed_at, siblings);
+  node->parent = NULL;
+  node->index_within_parent = -1;
+
+  // Everything after the removed slot has shifted down by one.
+  int pos = removed_at;
+  while (pos < siblings->length) {
+    GumboNode* sibling = siblings->data[pos];
+    sibling->index_within_parent = pos;
+    ++pos;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+// Also described in the "in body" handling for end formatting tags.
+//
+// Runs the adoption agency algorithm for an end tag of type `closing_tag`.
+// `token` is only used for reporting parse errors.
+//
+// Returns false on every early exit: a scope marker is reached before a
+// matching active formatting element, no matching formatting element exists,
+// the formatting element is not on the stack of open elements, it is not in
+// scope, or there is no furthest block (in which case the stack is popped
+// through the formatting element first).  Returns true only when all
+// iterations of the outer loop have run to completion.
+static bool adoption_agency_algorithm(
+    GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
+  GumboParserState* state = parser->_parser_state;
+  gumbo_debug("Entering adoption agency algorithm.\n");
+  // Steps 1-3 & 16:
+  // The outer loop is capped at 8 iterations.
+  for (int i = 0; i < 8; ++i) {
+    // Step 4.
+    // Search the active formatting elements from the most recent entry
+    // backwards for an element with the closing tag.
+    GumboNode* formatting_node = NULL;
+    int formatting_node_in_open_elements = -1;
+    for (int j = state->_active_formatting_elements.length; --j >= 0; ) {
+      GumboNode* current_node = state->_active_formatting_elements.data[j];
+      if (current_node == &kActiveFormattingScopeMarker) {
+        gumbo_debug("Broke on scope marker; aborting.\n");
+        // Last scope marker; abort the algorithm.
+        return false;
+      }
+      if (node_tag_is(current_node, closing_tag)) {
+        // Found it.
+        formatting_node = current_node;
+        formatting_node_in_open_elements = gumbo_vector_index_of(
+            &state->_open_elements, formatting_node);
+        gumbo_debug("Formatting element of tag %s at %d.\n",
+                    gumbo_normalized_tagname(closing_tag),
+                    formatting_node_in_open_elements);
+        break;
+      }
+    }
+    if (!formatting_node) {
+      // No matching tag; not a parse error outright, but fall through to the
+      // "any other end tag" clause (which may potentially add a parse error,
+      // but not always).
+      gumbo_debug("No active formatting elements; aborting.\n");
+      return false;
+    }
+
+    if (formatting_node_in_open_elements == -1) {
+      gumbo_debug("Formatting node not on stack of open elements.\n");
+      gumbo_vector_remove(parser, formatting_node,
+                          &state->_active_formatting_elements);
+      return false;
+    }
+
+    if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
+      add_parse_error(parser, token);
+      gumbo_debug("Element not in scope.\n");
+      return false;
+    }
+    if (formatting_node != get_current_node(parser)) {
+      add_parse_error(parser, token);  // But continue onwards.
+    }
+    assert(formatting_node);
+    assert(!node_tag_is(formatting_node, GUMBO_TAG_HTML));
+    assert(!node_tag_is(formatting_node, GUMBO_TAG_BODY));
+
+    // Step 5 & 6.
+    // The furthest block is the first "special" node found when scanning the
+    // open elements upwards from the formatting node's position.
+    GumboNode* furthest_block = NULL;
+    for (int j = formatting_node_in_open_elements;
+         j < state->_open_elements.length; ++j) {
+      assert(j > 0);
+      GumboNode* current = state->_open_elements.data[j];
+      if (is_special_node(current)) {
+        // Step 5.
+        furthest_block = current;
+        break;
+      }
+    }
+    if (!furthest_block) {
+      // Step 6.
+      while (get_current_node(parser) != formatting_node) {
+        pop_current_node(parser);
+      }
+      // And the formatting element itself.
+      pop_current_node(parser);
+      gumbo_vector_remove(parser, formatting_node,
+                          &state->_active_formatting_elements);
+      return false;
+    }
+    assert(!node_tag_is(furthest_block, GUMBO_TAG_HTML));
+    assert(furthest_block);
+
+    // Step 7.
+    // Elements may be moved and reparented by this algorithm, so
+    // common_ancestor is not necessarily the same as formatting_node->parent.
+    GumboNode* common_ancestor =
+        state->_open_elements.data[gumbo_vector_index_of(
+            &state->_open_elements, formatting_node) - 1];
+    gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
+                gumbo_normalized_tagname(common_ancestor->v.element.tag),
+                gumbo_normalized_tagname(furthest_block->v.element.tag));
+
+    // Step 8.
+    int bookmark = gumbo_vector_index_of(
+        &state->_active_formatting_elements, formatting_node);;
+    // Step 9.
+    GumboNode* node = furthest_block;
+    GumboNode* last_node = furthest_block;
+    // Must be stored explicitly, in case node is removed from the stack of open
+    // elements, to handle step 9.4.
+    int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
+    assert(saved_node_index > 0);
+    // Step 9.1-9.3 & 9.11.
+    // The inner loop is capped at 3 iterations.
+    for (int j = 0; j < 3; ++j) {
+      // Step 9.4.
+      int node_index = gumbo_vector_index_of(&state->_open_elements, node);
+      gumbo_debug(
+          "Current index: %d, last index: %d.\n", node_index, saved_node_index);
+      if (node_index == -1) {
+        // node was removed from the open elements in step 9.5 of the previous
+        // iteration; continue from the slot it used to occupy.
+        node_index = saved_node_index;
+      }
+      saved_node_index = --node_index;
+      assert(node_index > 0);
+      assert(node_index < state->_open_elements.capacity);
+      node = state->_open_elements.data[node_index];
+      assert(node->parent);
+      // Step 9.5.
+      if (gumbo_vector_index_of(
+          &state->_active_formatting_elements, node) == -1) {
+        gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
+        continue;
+      } else if (node == formatting_node) {
+        // Step 9.6.
+        break;
+      }
+      // Step 9.7.
+      // Replace node with a clone in both the active formatting list and the
+      // stack of open elements.
+      int formatting_index = gumbo_vector_index_of(
+          &state->_active_formatting_elements, node);
+      node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
+      state->_active_formatting_elements.data[formatting_index] = node;
+      state->_open_elements.data[node_index] = node;
+      // Step 9.8.
+      if (last_node == furthest_block) {
+        bookmark = formatting_index + 1;
+        assert(bookmark <= state->_active_formatting_elements.length);
+      }
+      // Step 9.9.
+      last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
+      remove_from_parent(parser, last_node);
+      append_node(parser, node, last_node);
+      // Step 9.10.
+      last_node = node;
+    }
+
+    // Step 10.
+    gumbo_debug("Removing %s node from parent ",
+                gumbo_normalized_tagname(last_node->v.element.tag));
+    remove_from_parent(parser, last_node);
+    last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
+    if (node_tag_in(common_ancestor, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
+                    GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
+                    GUMBO_TAG_LAST)) {
+      gumbo_debug("and foster-parenting it.\n");
+      foster_parent_element(parser, last_node);
+    } else {
+      gumbo_debug("and inserting it into %s.\n",
+                  gumbo_normalized_tagname(common_ancestor->v.element.tag));
+      append_node(parser, common_ancestor, last_node);
+    }
+
+    // Step 11.
+    GumboNode* new_formatting_node = clone_node(
+        parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
+    formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
+
+    // Step 12.  Instead of appending nodes one-by-one, we swap the children
+    // vector of furthest_block with the empty children of new_formatting_node,
+    // reducing memory traffic and allocations.  We still have to reset their
+    // parent pointers, though.
+    GumboVector temp = new_formatting_node->v.element.children;
+    new_formatting_node->v.element.children =
+        furthest_block->v.element.children;
+    furthest_block->v.element.children = temp;
+
+    temp = new_formatting_node->v.element.children;
+    // Note: this `i` shadows the outer loop counter for the duration of the
+    // loop only.
+    for (int i = 0; i < temp.length; ++i) {
+      GumboNode* child = temp.data[i];
+      child->parent = new_formatting_node;
+    }
+
+    // Step 13.
+    append_node(parser, furthest_block, new_formatting_node);
+
+    // Step 14.
+    // If the formatting node was before the bookmark, it may shift over all
+    // indices after it, so we need to explicitly find the index and possibly
+    // adjust the bookmark.
+    int formatting_node_index = gumbo_vector_index_of(
+        &state->_active_formatting_elements, formatting_node);
+    assert(formatting_node_index != -1);
+    if (formatting_node_index < bookmark) {
+      --bookmark;
+    }
+    gumbo_vector_remove_at(
+        parser, formatting_node_index, &state->_active_formatting_elements);
+    assert(bookmark >= 0);
+    assert(bookmark <= state->_active_formatting_elements.length);
+    gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
+                           &state->_active_formatting_elements);
+
+    // Step 15.
+    // Swap the old formatting node on the stack of open elements for the new
+    // one, placed directly after the furthest block.
+    gumbo_vector_remove(
+        parser, formatting_node, &state->_open_elements);
+    int insert_at = gumbo_vector_index_of(
+        &state->_open_elements, furthest_block) + 1;
+    assert(insert_at >= 0);
+    assert(insert_at <= state->_open_elements.length);
+    gumbo_vector_insert_at(
+        parser, new_formatting_node, insert_at, &state->_open_elements);
+  }
+  // All outer iterations completed without an early exit.
+  return true;
+}
+
+// Implements the spec's "Ignore the current token" with the memory clean-up
+// that entails: the parser state's current token is destroyed here.
+static void ignore_token(GumboParser* parser) {
+  GumboToken* ignored = parser->_parser_state->_current_token;
+  // A token's internal buffers usually end up owned by the element built from
+  // it.  When no element is emitted (which is what happens in
+  // non-verbatim-mode when a token is ignored) nobody else would free them,
+  // so do it now to avoid a leak.
+  gumbo_token_destroy(parser, ignored);
+#ifndef NDEBUG
+  if (ignored->type != GUMBO_TOKEN_START_TAG) {
+    return;
+  }
+  // Sentinel value: lets the assertion in the main loop see that this start
+  // tag has already been destroyed.
+  ignored->v.start_tag.attributes = kGumboEmptyVector;
+#endif
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
+// Flushes any buffered text, then pops every open element.  Each popped node
+// is flagged as having an implicit end tag, except <body> and <html> when the
+// parser state records that their end tag was actually seen.
+static void finish_parsing(GumboParser* parser) {
+  maybe_flush_text_node_buffer(parser);
+  GumboParserState* state = parser->_parser_state;
+  GumboNode* popped = pop_current_node(parser);
+  while (popped) {
+    const bool explicitly_closed =
+        (node_tag_is(popped, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
+        (node_tag_is(popped, GUMBO_TAG_HTML) && state->_closed_html_tag);
+    if (!explicitly_closed) {
+      popped->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
+    }
+    popped = pop_current_node(parser);
+  }
+  // Drain whatever might still be on the stack.
+  while (pop_current_node(parser)) {
+  }
+}
+
+// Token handler for the "initial" insertion mode.
+//   - whitespace: ignored;
+//   - comment: appended to the document node;
+//   - DOCTYPE: its fields are copied onto the document, the quirks mode is
+//     computed, and the mode switches to "before html"; the return value is
+//     that of maybe_add_doctype_error;
+//   - anything else: parse error, quirks mode is forced, the mode switches to
+//     "before html" and the token is reprocessed.
+static bool handle_initial(GumboParser* parser, GumboToken* token) {
+  GumboDocument* document = &get_document_node(parser)->v.document;
+  switch (token->type) {
+    case GUMBO_TOKEN_WHITESPACE:
+      ignore_token(parser);
+      return true;
+
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_document_node(parser), token);
+      return true;
+
+    case GUMBO_TOKEN_DOCTYPE:
+      document->has_doctype = true;
+      document->name = token->v.doc_type.name;
+      document->public_identifier = token->v.doc_type.public_identifier;
+      document->system_identifier = token->v.doc_type.system_identifier;
+      document->doc_type_quirks_mode =
+          compute_quirks_mode(&token->v.doc_type);
+      set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
+      return maybe_add_doctype_error(parser, token);
+
+    default:
+      break;
+  }
+
+  // No DOCTYPE before the first significant token.
+  add_parse_error(parser, token);
+  document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
+  parser->_parser_state->_reprocess_current_token = true;
+  return true;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
+// Token handler for the "before html" insertion mode.  Returns false when a
+// parse error was recorded for the token, true otherwise.
+static bool handle_before_html(GumboParser* parser, GumboToken* token) {
+  switch (token->type) {
+    case GUMBO_TOKEN_DOCTYPE:
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_document_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_WHITESPACE:
+      ignore_token(parser);
+      return true;
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    // An explicit <html> becomes the root of the output tree.
+    GumboNode* root = insert_element_from_token(parser, token);
+    parser->_output->root = root;
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
+    return true;
+  }
+
+  if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
+      token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
+      GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
+    // End tags other than </head>, </body>, </html> and </br> are dropped.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  // Anything else: synthesize the <html> root and look at the token again in
+  // the next mode.
+  GumboNode* implied_root = insert_element_of_tag_type(
+      parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
+  assert(implied_root);
+  parser->_output->root = implied_root;
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
+  parser->_parser_state->_reprocess_current_token = true;
+  return true;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
+// Token handler for the "before head" insertion mode.  Returns false when a
+// parse error was recorded for the token, true otherwise.
+static bool handle_before_head(GumboParser* parser, GumboToken* token) {
+  switch (token->type) {
+    case GUMBO_TOKEN_DOCTYPE:
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_current_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_WHITESPACE:
+      ignore_token(parser);
+      return true;
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
+    // Explicit <head>: insert it and remember it as the head element.
+    GumboNode* head = insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
+    parser->_parser_state->_head_element = head;
+    return true;
+  }
+
+  if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
+      token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
+      GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
+    // End tags other than </head>, </body>, </html> and </br> are dropped.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  // Anything else: synthesize <head> and reprocess the token "in head".
+  GumboNode* implied_head = insert_element_of_tag_type(
+      parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
+  parser->_parser_state->_head_element = implied_head;
+  parser->_parser_state->_reprocess_current_token = true;
+  return true;
+}
+
+// Forward declarations because of mutual dependencies: the insertion-mode
+// handlers below call handle_in_body before its definition, and
+// handle_in_body in turn calls handle_in_head.
+static bool handle_token(GumboParser* parser, GumboToken* token);
+static bool handle_in_body(GumboParser* parser, GumboToken* token);
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
+// Token handler for the "in head" insertion mode.  Returns false when a parse
+// error was recorded for the token, true otherwise (a <html> start tag is
+// delegated to handle_in_body and returns its result).
+//
+// Changes from the previous revision:
+//   - a stray <head> start tag and "any other end tag" now share one branch
+//     (the second test for a <head> start tag could never be reached);
+//   - that branch now calls ignore_token() for ignored end tags as well,
+//     matching the equivalent branch of handle_after_head;
+//   - the unreachable trailing `return true;` is gone.
+static bool handle_in_head(GumboParser* parser, GumboToken* token) {
+  if (token->type == GUMBO_TOKEN_WHITESPACE) {
+    insert_text_token(parser, token);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (token->type == GUMBO_TOKEN_COMMENT) {
+    append_comment_node(parser, get_current_node(parser), token);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
+                    GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
+                    GUMBO_TAG_LAST)) {
+    // Void elements: insert and immediately pop.
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
+    // spec doesn't apply.  If clients want to handle meta-tag re-encoding, they
+    // should specifically look for that string in the document and re-encode it
+    // before passing to Gumbo.
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
+    return true;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_NOFRAMES, GUMBO_TAG_STYLE,
+                    GUMBO_TAG_LAST)) {
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
+    return true;
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
+    GumboNode* head = pop_current_node(parser);
+    AVOID_UNUSED_VARIABLE_WARNING(head);
+    assert(node_tag_is(head, GUMBO_TAG_HEAD));
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
+             (token->type == GUMBO_TOKEN_END_TAG &&
+              !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
+                      GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
+    // A nested <head>, or an end tag other than </body>, </html>, </br>:
+    // parse error, and the token is dropped.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else {
+    // Anything else implicitly closes <head>; reprocess in "after head".
+    const GumboNode* node = pop_current_node(parser);
+    assert(node_tag_is(node, GUMBO_TAG_HEAD));
+    AVOID_UNUSED_VARIABLE_WARNING(node);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
+    parser->_parser_state->_reprocess_current_token = true;
+    return true;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
+// Token handler for the "in head noscript" insertion mode.  Returns false
+// when a parse error was recorded for the token; the branches that delegate
+// to handle_in_body / handle_in_head return the delegate's result.
+//
+// Change from the previous revision: a DOCTYPE token is now passed to
+// ignore_token() after the parse error is recorded, as every other handler in
+// this file does when it drops a DOCTYPE.  Without it the token was never
+// destroyed on this path.
+static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
+  if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
+    const GumboNode* node = pop_current_node(parser);
+    assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
+    AVOID_UNUSED_VARIABLE_WARNING(node);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
+             token->type == GUMBO_TOKEN_COMMENT ||
+             tag_in(token, kStartTag, GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND,
+                    GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_NOFRAMES,
+                    GUMBO_TAG_STYLE, GUMBO_TAG_LAST)) {
+    // These are processed exactly as they would be "in head".
+    return handle_in_head(parser, token);
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
+                    GUMBO_TAG_LAST) ||
+            (token->type == GUMBO_TOKEN_END_TAG &&
+             !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
+    // Nested <head>/<noscript>, or any end tag except </br>: drop the token.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else {
+    // Anything else: parse error, close <noscript>, and reprocess the token
+    // "in head".
+    add_parse_error(parser, token);
+    const GumboNode* node = pop_current_node(parser);
+    assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
+    AVOID_UNUSED_VARIABLE_WARNING(node);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
+    parser->_parser_state->_reprocess_current_token = true;
+    return false;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
+// Token handler for the "after head" insertion mode.  Returns false when a
+// parse error was recorded for the token; branches that delegate to
+// handle_in_body / handle_in_head return the delegate's result.
+static bool handle_after_head(GumboParser* parser, GumboToken* token) {
+  GumboParserState* state = parser->_parser_state;
+
+  switch (token->type) {
+    case GUMBO_TOKEN_WHITESPACE:
+      insert_text_token(parser, token);
+      return true;
+    case GUMBO_TOKEN_DOCTYPE:
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_current_node(parser), token);
+      return true;
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
+    insert_element_from_token(parser, token);
+    state->_frameset_ok = false;
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
+    return true;
+  }
+
+  if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
+             GUMBO_TAG_BGSOUND, GUMBO_TAG_LINK, GUMBO_TAG_META,
+             GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT, GUMBO_TAG_STYLE,
+             GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
+    // Head-only content after </head>: temporarily put the head element back
+    // on the stack of open elements and let the "in head" rules handle it.
+    add_parse_error(parser, token);
+    assert(state->_head_element != NULL);
+    // Flush first: pending character tokens may belong to the root, so they
+    // must be attached before the head element is pushed.
+    maybe_flush_text_node_buffer(parser);
+    gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
+    const bool in_head_result = handle_in_head(parser, token);
+    gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
+    return in_head_result;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
+      (token->type == GUMBO_TOKEN_END_TAG &&
+       !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
+               GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
+    // A second <head>, or an end tag other than </body>, </html>, </br>.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  // Anything else: synthesize <body> and reprocess the token "in body".
+  insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
+  state->_reprocess_current_token = true;
+  return true;
+}
+
+// Recursively frees `node` and everything reachable from it through the
+// parser's deallocator:
+//   - document: all children, the children array, and the name / public /
+//     system identifier strings;
+//   - element: all attributes and children plus both backing arrays;
+//   - text, CDATA, comment, whitespace: the text buffer.
+// The node itself is released last.
+static void destroy_node(GumboParser* parser, GumboNode* node) {
+  if (node->type == GUMBO_NODE_DOCUMENT) {
+    GumboDocument* doc = &node->v.document;
+    for (int i = 0; i < doc->children.length; ++i) {
+      destroy_node(parser, doc->children.data[i]);
+    }
+    gumbo_parser_deallocate(parser, (void*) doc->children.data);
+    gumbo_parser_deallocate(parser, (void*) doc->name);
+    gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
+    gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
+  } else if (node->type == GUMBO_NODE_ELEMENT) {
+    // Attributes first, then the subtree.
+    for (int a = 0; a < node->v.element.attributes.length; ++a) {
+      gumbo_destroy_attribute(parser, node->v.element.attributes.data[a]);
+    }
+    gumbo_parser_deallocate(parser, node->v.element.attributes.data);
+    for (int c = 0; c < node->v.element.children.length; ++c) {
+      destroy_node(parser, node->v.element.children.data[c]);
+    }
+    gumbo_parser_deallocate(parser, node->v.element.children.data);
+  } else if (node->type == GUMBO_NODE_TEXT ||
+             node->type == GUMBO_NODE_CDATA ||
+             node->type == GUMBO_NODE_COMMENT ||
+             node->type == GUMBO_NODE_WHITESPACE) {
+    gumbo_parser_deallocate(parser, (void*) node->v.text.text);
+  }
+  gumbo_parser_deallocate(parser, node);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
+static bool handle_in_body(GumboParser* parser, GumboToken* token) {
+  GumboParserState* state = parser->_parser_state;
+  assert(state->_open_elements.length > 0);
+  if (token->type == GUMBO_TOKEN_NULL) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
+    reconstruct_active_formatting_elements(parser);
+    insert_text_token(parser, token);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_CHARACTER) {
+    reconstruct_active_formatting_elements(parser);
+    insert_text_token(parser, token);
+    set_frameset_not_ok(parser);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_COMMENT) {
+    append_comment_node(parser, get_current_node(parser), token);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    assert(parser->_output->root != NULL);
+    assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
+    add_parse_error(parser, token);
+    merge_attributes(parser, token, parser->_output->root);
+    return false;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
+                    GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
+                    GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
+                    GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
+    return handle_in_head(parser, token);
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
+    add_parse_error(parser, token);
+    if (state->_open_elements.length < 2 ||
+        !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) {
+      ignore_token(parser);
+      return false;
+    }
+    state->_frameset_ok = false;
+    merge_attributes(parser, token, state->_open_elements.data[1]);
+    return false;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
+    add_parse_error(parser, token);
+    if (state->_open_elements.length < 2 ||
+        !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
+        !state->_frameset_ok) {
+      ignore_token(parser);
+      return false;
+    }
+    // Save the body node for later removal.
+    GumboNode* body_node = state->_open_elements.data[1];
+
+    // Pop all nodes except root HTML element.
+    GumboNode* node;
+    do {
+      node = pop_current_node(parser);
+    } while (node != state->_open_elements.data[1]);
+
+    // Remove the body node.  We may want to factor this out into a generic
+    // helper, but right now this is the only code that needs to do this.
+    GumboVector* children = &parser->_output->root->v.element.children;
+    for (int i = 0; i < children->length; ++i) {
+      if (children->data[i] == body_node) {
+        gumbo_vector_remove_at(parser, i, children);
+        break;
+      }
+    }
+    destroy_node(parser, body_node);
+
+    // Insert the <frameset>, and switch the insertion mode.
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_EOF) {
+    for (int i = 0; i < state->_open_elements.length; ++i) {
+      if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
+                       GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_P, GUMBO_TAG_TBODY,
+                       GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
+                       GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
+                       GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
+        add_parse_error(parser, token);
+        return false;
+      }
+    }
+    return true;
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
+                    GUMBO_TAG_LAST)) {
+    if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    bool success = true;
+    for (int i = 0; i < state->_open_elements.length; ++i) {
+      if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
+                       GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_OPTGROUP,
+                       GUMBO_TAG_OPTION, GUMBO_TAG_P, GUMBO_TAG_RP,
+                       GUMBO_TAG_RT, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
+                       GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
+                       GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
+                       GUMBO_TAG_LAST)) {
+        add_parse_error(parser, token);
+        success = false;
+        break;
+      }
+    }
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
+    if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
+      parser->_parser_state->_reprocess_current_token = true;
+    } else {
+      GumboNode* body = state->_open_elements.data[1];
+      assert(node_tag_is(body, GUMBO_TAG_BODY));
+      record_end_of_element(state->_current_token, &body->v.element);
+    }
+    return success;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
+                    GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_CENTER,
+                    GUMBO_TAG_DETAILS, GUMBO_TAG_DIR, GUMBO_TAG_DIV,
+                    GUMBO_TAG_DL, GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION,
+                    GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER, GUMBO_TAG_HEADER,
+                    GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
+                    GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
+                    GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    insert_element_from_token(parser, token);
+    return result;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
+                    GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    if (node_tag_in(get_current_node(parser), GUMBO_TAG_H1, GUMBO_TAG_H2,
+                    GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6,
+                    GUMBO_TAG_LAST)) {
+      add_parse_error(parser, token);
+      pop_current_node(parser);
+      result = false;
+    }
+    insert_element_from_token(parser, token);
+    return result;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_PRE, GUMBO_TAG_LISTING,
+                    GUMBO_TAG_LAST)) {
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    insert_element_from_token(parser, token);
+    state->_ignore_next_linefeed = true;
+    state->_frameset_ok = false;
+    return result;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
+    if (state->_form_element != NULL) {
+      gumbo_debug("Ignoring nested form.\n");
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    state->_form_element =
+        insert_element_from_token(parser, token);
+    return result;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
+    maybe_implicitly_close_list_tag(parser, token, true);
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    insert_element_from_token(parser, token);
+    return result;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
+                    GUMBO_TAG_LAST)) {
+    maybe_implicitly_close_list_tag(parser, token, false);
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    insert_element_from_token(parser, token);
+    return result;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    insert_element_from_token(parser, token);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
+    return result;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
+    if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
+      add_parse_error(parser, token);
+      implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
+      state->_reprocess_current_token = true;
+      return false;
+    }
+    reconstruct_active_formatting_elements(parser);
+    insert_element_from_token(parser, token);
+    state->_frameset_ok = false;
+    return true;
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
+                    GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BUTTON,
+                    GUMBO_TAG_CENTER, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
+                    GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_FIELDSET,
+                    GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER,
+                    GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_LISTING,
+                    GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
+                    GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
+                    GUMBO_TAG_LAST)) {
+    GumboTag tag = token->v.end_tag;
+    if (!has_an_element_in_scope(parser, tag)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    implicitly_close_tags(parser, token, token->v.end_tag);
+    return true;
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
+    bool result = true;
+    const GumboNode* node = state->_form_element;
+    assert(!node || node->type == GUMBO_NODE_ELEMENT);
+    state->_form_element = NULL;
+    if (!node || !has_node_in_scope(parser, node)) {
+      gumbo_debug("Closing an unopened form.\n");
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    // This differs from implicitly_close_tags because we remove *only* the
+    // <form> element; other nodes are left in scope.
+    generate_implied_end_tags(parser, GUMBO_TAG_LAST);
+    if (get_current_node(parser) != node) {
+      add_parse_error(parser, token);
+      result = false;
+    }
+
+    GumboVector* open_elements = &state->_open_elements;
+    int index = open_elements->length - 1;
+    for (; index >= 0 && open_elements->data[index] != node; --index);
+    assert(index >= 0);
+    gumbo_vector_remove_at(parser, index, open_elements);
+    return result;
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
+    if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
+      add_parse_error(parser, token);
+      reconstruct_active_formatting_elements(parser);
+      insert_element_of_tag_type(
+          parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
+      state->_reprocess_current_token = true;
+      return false;
+    }
+    return implicitly_close_tags(parser, token, GUMBO_TAG_P);
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
+    if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
+                    GUMBO_TAG_LAST)) {
+    assert(token->type == GUMBO_TOKEN_END_TAG);
+    GumboTag token_tag = token->v.end_tag;
+    if (!has_an_element_in_scope(parser, token_tag)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    return implicitly_close_tags(parser, token, token_tag);
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
+                    GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
+    if (!has_an_element_in_scope_with_tagname(
+            parser, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
+            GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
+      // No heading open; ignore the token entirely.
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    } else {
+      generate_implied_end_tags(parser, GUMBO_TAG_LAST);
+      const GumboNode* current_node = get_current_node(parser);
+      bool success = node_tag_is(current_node, token->v.end_tag);
+      if (!success) {
+        // There're children of the heading currently open; close them below and
+        // record a parse error.
+        // TODO(jdtang): Add a way to distinguish this error case from the one
+        // above.
+        add_parse_error(parser, token);
+      }
+      do {
+        current_node = pop_current_node(parser);
+      } while (!node_tag_in(current_node, GUMBO_TAG_H1, GUMBO_TAG_H2,
+                            GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5,
+                            GUMBO_TAG_H6, GUMBO_TAG_LAST));
+      return success;
+    }
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
+    bool success = true;
+    int last_a;
+    int has_matching_a = find_last_anchor_index(parser, &last_a);
+    if (has_matching_a) {
+      assert(has_matching_a == 1);
+      add_parse_error(parser, token);
+      adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
+      // The adoption agency algorithm usually removes all instances of <a>
+      // from the list of active formatting elements, but in case it doesn't,
+      // we're supposed to do this.  (The conditions where it might not are
+      // listed in the spec.)
+      if (find_last_anchor_index(parser, &last_a)) {
+        void* last_element = gumbo_vector_remove_at(
+            parser, last_a, &state->_active_formatting_elements);
+        gumbo_vector_remove(
+            parser, last_element, &state->_open_elements);
+      }
+      success = false;
+    }
+    reconstruct_active_formatting_elements(parser);
+    add_formatting_element(parser, insert_element_from_token(parser, token));
+    return success;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
+                    GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
+                    GUMBO_TAG_S, GUMBO_TAG_SMALL, GUMBO_TAG_STRIKE,
+                    GUMBO_TAG_STRONG, GUMBO_TAG_TT, GUMBO_TAG_U,
+                    GUMBO_TAG_LAST)) {
+    reconstruct_active_formatting_elements(parser);
+    add_formatting_element(parser, insert_element_from_token(parser, token));
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
+    bool result = true;
+    reconstruct_active_formatting_elements(parser);
+    if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
+      result = false;
+      add_parse_error(parser, token);
+      adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
+      reconstruct_active_formatting_elements(parser);
+    }
+    insert_element_from_token(parser, token);
+    add_formatting_element(parser, get_current_node(parser));
+    return result;
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_A, GUMBO_TAG_B, GUMBO_TAG_BIG,
+                    GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
+                    GUMBO_TAG_NOBR, GUMBO_TAG_S, GUMBO_TAG_SMALL,
+                    GUMBO_TAG_STRIKE, GUMBO_TAG_STRONG, GUMBO_TAG_TT,
+                    GUMBO_TAG_U, GUMBO_TAG_LAST)) {
+    return adoption_agency_algorithm(parser, token, token->v.end_tag);
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
+                    GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
+    reconstruct_active_formatting_elements(parser);
+    insert_element_from_token(parser, token);
+    add_formatting_element(parser, &kActiveFormattingScopeMarker);
+    set_frameset_not_ok(parser);
+    return true;
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
+                    GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
+    GumboTag token_tag = token->v.end_tag;
+    if (!has_an_element_in_table_scope(parser, token_tag)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    implicitly_close_tags(parser, token, token_tag);
+    clear_active_formatting_elements(parser);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
+    if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
+        GUMBO_DOCTYPE_QUIRKS) {
+      maybe_implicitly_close_p_tag(parser, token);
+    }
+    insert_element_from_token(parser, token);
+    set_frameset_not_ok(parser);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
+    return true;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_AREA, GUMBO_TAG_BR,
+                    GUMBO_TAG_EMBED, GUMBO_TAG_IMG, GUMBO_TAG_IMAGE,
+                    GUMBO_TAG_KEYGEN, GUMBO_TAG_WBR, GUMBO_TAG_LAST)) {
+    bool success = true;
+    if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
+      success = false;
+      add_parse_error(parser, token);
+      token->v.start_tag.tag = GUMBO_TAG_IMG;
+    }
+    reconstruct_active_formatting_elements(parser);
+    GumboNode* node = insert_element_from_token(parser, token);
+    if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
+      success = false;
+      add_parse_error(parser, token);
+      node->v.element.tag = GUMBO_TAG_IMG;
+      node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
+    }
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    set_frameset_not_ok(parser);
+    return success;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
+    if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
+      // Must be before the element is inserted, as that takes ownership of the
+      // token's attribute vector.
+      set_frameset_not_ok(parser);
+    }
+    reconstruct_active_formatting_elements(parser);
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    return true;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_PARAM, GUMBO_TAG_SOURCE,
+                    GUMBO_TAG_TRACK, GUMBO_TAG_LAST)) {
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    set_frameset_not_ok(parser);
+    return result;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
+    add_parse_error(parser, token);
+    if (parser->_parser_state->_form_element != NULL) {
+      ignore_token(parser);
+      return false;
+    }
+    acknowledge_self_closing_tag(parser);
+    maybe_implicitly_close_p_tag(parser, token);
+    set_frameset_not_ok(parser);
+
+    GumboVector* token_attrs = &token->v.start_tag.attributes;
+    GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
+    GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
+    GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "isindex");
+
+    GumboNode* form = insert_element_of_tag_type(
+        parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
+    if (action_attr) {
+      gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
+    }
+    insert_element_of_tag_type(parser, GUMBO_TAG_HR,
+                               GUMBO_INSERTION_FROM_ISINDEX);
+    pop_current_node(parser);   // <hr>
+
+    insert_element_of_tag_type(parser, GUMBO_TAG_LABEL,
+                               GUMBO_INSERTION_FROM_ISINDEX);
+    TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
+    text_state->_start_original_text = token->original_text.data;
+    text_state->_start_position = token->position;
+    text_state->_type = GUMBO_NODE_TEXT;
+    if (prompt_attr) {
+      int prompt_attr_length = strlen(prompt_attr->value);
+      gumbo_string_buffer_destroy(parser, &text_state->_buffer);
+      text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
+      text_state->_buffer.length = prompt_attr_length;
+      text_state->_buffer.capacity = prompt_attr_length + 1;
+      gumbo_destroy_attribute(parser, prompt_attr);
+    } else {
+      GumboStringPiece prompt_text = GUMBO_STRING(
+          "This is a searchable index. Enter search keywords: ");
+      gumbo_string_buffer_append_string(
+          parser, &prompt_text, &text_state->_buffer);
+    }
+
+    GumboNode* input = insert_element_of_tag_type(
+        parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
+    for (int i = 0; i < token_attrs->length; ++i) {
+      GumboAttribute* attr = token_attrs->data[i];
+      if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
+        gumbo_vector_add(parser, attr, &input->v.element.attributes);
+      }
+      token_attrs->data[i] = NULL;
+    }
+
+    // All attributes have been successfully transferred and nulled out at this
+    // point, so the call to ignore_token will free the memory for it without
+    // touching the attributes.
+    ignore_token(parser);
+
+    GumboAttribute* name =
+        gumbo_parser_allocate(parser, sizeof(GumboAttribute));
+    GumboStringPiece name_str = GUMBO_STRING("name");
+    GumboStringPiece isindex_str = GUMBO_STRING("isindex");
+    name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
+    name->name = gumbo_copy_stringz(parser, "name");
+    name->value = gumbo_copy_stringz(parser, "isindex");
+    name->original_name = name_str;
+    name->original_value = isindex_str;
+    name->name_start = kGumboEmptySourcePosition;
+    name->name_end = kGumboEmptySourcePosition;
+    name->value_start = kGumboEmptySourcePosition;
+    name->value_end = kGumboEmptySourcePosition;
+    gumbo_vector_add(parser, name, &input->v.element.attributes);
+
+    pop_current_node(parser);   // <input>
+    pop_current_node(parser);   // <label>
+    insert_element_of_tag_type(
+        parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
+    pop_current_node(parser);   // <hr>
+    pop_current_node(parser);   // <form>
+    return false;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
+    parser->_parser_state->_ignore_next_linefeed = true;
+    set_frameset_not_ok(parser);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
+    bool result = maybe_implicitly_close_p_tag(parser, token);
+    reconstruct_active_formatting_elements(parser);
+    set_frameset_not_ok(parser);
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
+    return result;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
+    set_frameset_not_ok(parser);
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
+    run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
+    reconstruct_active_formatting_elements(parser);
+    insert_element_from_token(parser, token);
+    set_frameset_not_ok(parser);
+    GumboInsertionMode state = parser->_parser_state->_insertion_mode;
+    if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
+        state == GUMBO_INSERTION_MODE_IN_CAPTION ||
+        state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
+        state == GUMBO_INSERTION_MODE_IN_ROW ||
+        state == GUMBO_INSERTION_MODE_IN_CELL) {
+      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
+    } else {
+      set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
+    }
+    return true;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
+                    GUMBO_TAG_LAST)) {
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
+      pop_current_node(parser);
+    }
+    reconstruct_active_formatting_elements(parser);
+    insert_element_from_token(parser, token);
+    return true;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_RP, GUMBO_TAG_RT,
+                    GUMBO_TAG_LAST)) {
+    bool success = true;
+    if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
+      generate_implied_end_tags(parser, GUMBO_TAG_LAST);
+    }
+    if (!node_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
+      add_parse_error(parser, token);
+      success = false;
+    }
+    insert_element_from_token(parser, token);
+    return success;
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
+    add_parse_error(parser, token);
+    reconstruct_active_formatting_elements(parser);
+    insert_element_of_tag_type(
+        parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
+    pop_current_node(parser);
+    return false;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
+    reconstruct_active_formatting_elements(parser);
+    adjust_mathml_attributes(parser, token);
+    adjust_foreign_attributes(parser, token);
+    insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
+    if (token->v.start_tag.is_self_closing) {
+      pop_current_node(parser);
+      acknowledge_self_closing_tag(parser);
+    }
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
+    reconstruct_active_formatting_elements(parser);
+    adjust_svg_attributes(parser, token);
+    adjust_foreign_attributes(parser, token);
+    insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
+    if (token->v.start_tag.is_self_closing) {
+      pop_current_node(parser);
+      acknowledge_self_closing_tag(parser);
+    }
+    return true;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
+                    GUMBO_TAG_COLGROUP, GUMBO_TAG_FRAME, GUMBO_TAG_HEAD,
+                    GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
+                    GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
+                    GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (token->type == GUMBO_TOKEN_START_TAG) {
+    reconstruct_active_formatting_elements(parser);
+    insert_element_from_token(parser, token);
+    return true;
+  } else {
+    assert(token->type == GUMBO_TOKEN_END_TAG);
+    GumboTag end_tag = token->v.end_tag;
+    assert(state->_open_elements.length > 0);
+    assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
+    // Walk up the stack of open elements until we find one that either:
+    // a) Matches the tag name we saw
+    // b) Is in the "special" category.
+    // If we see a), implicitly close everything up to and including it.  If we
+    // see b), then record a parse error, don't close anything (except the
+    // implied end tags) and ignore the end tag token.
+    for (int i = state->_open_elements.length; --i >= 0; ) {
+      const GumboNode* node = state->_open_elements.data[i];
+      if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
+          node_tag_is(node, end_tag)) {
+        generate_implied_end_tags(parser, end_tag);
+        // TODO(jdtang): Do I need to add a parse error here?  The condition in
+        // the spec seems like it's the inverse of the loop condition above, and
+        // so would never fire.
+        while (node != pop_current_node(parser));  // Pop everything.
+        return true;
+      } else if (is_special_node(node)) {
+        add_parse_error(parser, token);
+        ignore_token(parser);
+        return false;
+      }
+    }
+    // <html> is in the special category, so we should never get here.
+    assert(0);
+    return false;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
+// Handler for the "text" insertion mode.  Character and whitespace tokens are
+// handed to insert_text_token(); every other token pops the current node and
+// restores the insertion mode saved in _original_insertion_mode.  Always
+// returns true.
+static bool handle_text(GumboParser* parser, GumboToken* token) {
+  switch (token->type) {
+    case GUMBO_TOKEN_CHARACTER:
+    case GUMBO_TOKEN_WHITESPACE:
+      insert_text_token(parser, token);
+      return true;
+    default:
+      break;
+  }
+
+  // Script handling here is deliberately bare-bones: none of the
+  // parser-pause/already-started/script-nesting flags or re-entrant tokenizer
+  // invocations are implemented.  The library is mostly intended for
+  // templating, refactoring, and static-analysis tools, so the script body is
+  // simply exposed as a text-node child of the <script> element.  That does
+  // not support document.write of partial HTML elements, but should be
+  // adequate for almost all other scripting support.
+  if (token->type == GUMBO_TOKEN_EOF) {
+    // EOF inside the text mode is a parse error; the token is reprocessed in
+    // the restored insertion mode.
+    add_parse_error(parser, token);
+    parser->_parser_state->_reprocess_current_token = true;
+  }
+  pop_current_node(parser);
+  set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
+  return true;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
+// Handler for the "in table" insertion mode.  Each recognised token class is
+// dealt with by its own guard clause below, in the same order as the spec
+// lists them; anything unrecognised is processed by handle_in_body() with
+// foster parenting switched on.  Branches that record a parse error return
+// false, except the fallback, which returns whatever handle_in_body() returns.
+static bool handle_in_table(GumboParser* parser, GumboToken* token) {
+  GumboParserState* parser_state = parser->_parser_state;
+
+  if (token->type == GUMBO_TOKEN_CHARACTER ||
+      token->type == GUMBO_TOKEN_WHITESPACE) {
+    // The spec's "pending table character tokens" list is simply the
+    // TextNodeBufferState.  Text tokens accumulate as usual; when
+    // handle_in_table_text flushes them it sets _foster_parent_insertions if
+    // the buffer holds any non-whitespace character.
+    assert(parser_state->_text_node._buffer.length == 0);
+    parser_state->_original_insertion_mode = parser_state->_insertion_mode;
+    parser_state->_reprocess_current_token = true;
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
+    return true;
+  }
+
+  if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  if (token->type == GUMBO_TOKEN_COMMENT) {
+    append_comment_node(parser, get_current_node(parser), token);
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
+    clear_stack_to_table_context(parser);
+    add_formatting_element(parser, &kActiveFormattingScopeMarker);
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
+    clear_stack_to_table_context(parser);
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
+    // A bare <col> gets an implied <colgroup>; the token is then reprocessed
+    // in the column-group mode.
+    clear_stack_to_table_context(parser);
+    insert_element_of_tag_type(
+        parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
+    parser_state->_reprocess_current_token = true;
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
+    return true;
+  }
+
+  if (tag_in(token, kStartTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
+             GUMBO_TAG_THEAD, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
+             GUMBO_TAG_LAST)) {
+    clear_stack_to_table_context(parser);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
+    if (!tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
+                GUMBO_TAG_LAST)) {
+      // <tbody>/<tfoot>/<thead>: insert the element itself.
+      insert_element_from_token(parser, token);
+    } else {
+      // <td>/<th>/<tr>: insert an implied <tbody> and reprocess the token.
+      insert_element_of_tag_type(
+          parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
+      parser_state->_reprocess_current_token = true;
+    }
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
+    // Nested <table> start tag: always a parse error.  If the open table
+    // could be closed the token is reprocessed, otherwise it is dropped.
+    add_parse_error(parser, token);
+    if (close_table(parser)) {
+      parser_state->_reprocess_current_token = true;
+    } else {
+      ignore_token(parser);
+    }
+    return false;
+  }
+
+  if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
+    if (close_table(parser)) {
+      return true;
+    }
+    add_parse_error(parser, token);
+    return false;
+  }
+
+  if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
+             GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
+             GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
+             GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
+             GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  if (tag_in(token, kStartTag, GUMBO_TAG_STYLE, GUMBO_TAG_SCRIPT,
+             GUMBO_TAG_LAST)) {
+    return handle_in_head(parser, token);
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
+      attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
+    // <input type="hidden"> is inserted and immediately popped, but is still
+    // recorded as a parse error.
+    add_parse_error(parser, token);
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    return false;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
+    add_parse_error(parser, token);
+    if (parser_state->_form_element) {
+      ignore_token(parser);
+      return false;
+    }
+    parser_state->_form_element = insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    return false;
+  }
+
+  if (token->type == GUMBO_TOKEN_EOF) {
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
+      return true;
+    }
+    add_parse_error(parser, token);
+    return false;
+  }
+
+  // Anything else: a parse error, processed with the in-body rules while
+  // foster parenting is enabled.
+  add_parse_error(parser, token);
+  parser_state->_foster_parent_insertions = true;
+  bool body_result = handle_in_body(parser, token);
+  parser_state->_foster_parent_insertions = false;
+  return body_result;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
+// Handler for the "in table text" insertion mode.
+//  - NULL tokens are a parse error and are ignored (returns false).
+//  - Character and whitespace tokens are handed to insert_text_token()
+//    (returns true).
+//  - Any other token ends the run of table text: the pending text buffer is
+//    flushed (with foster parenting enabled if it contains any byte that is
+//    not whitespace), the original insertion mode is restored, and the token
+//    is marked for reprocessing (returns true).
+static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
+  if (token->type == GUMBO_TOKEN_NULL) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (token->type == GUMBO_TOKEN_CHARACTER ||
+             token->type == GUMBO_TOKEN_WHITESPACE) {
+    insert_text_token(parser, token);
+    return true;
+  } else {
+    GumboParserState* state = parser->_parser_state;
+    GumboStringBuffer* buffer = &state->_text_node._buffer;
+    // Can't use strspn for this because GumboStringBuffers are not
+    // null-terminated.
+    // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
+    // of any one byte that is not whitespace means we flip the flag, so this
+    // loop is still valid.
+    for (size_t i = 0; i < buffer->length; ++i) {
+      // The buffer may hold UTF-8 bytes >= 0x80.  Where plain char is signed
+      // those are negative, and passing a negative value other than EOF to
+      // isspace() is undefined behaviour, so convert to unsigned char first.
+      const unsigned char byte = (unsigned char) buffer->data[i];
+      // isspace() accepts vertical tab, which is explicitly treated as
+      // non-whitespace here.
+      if (!isspace(byte) || byte == '\v') {
+        state->_foster_parent_insertions = true;
+        reconstruct_active_formatting_elements(parser);
+        break;
+      }
+    }
+    maybe_flush_text_node_buffer(parser);
+    state->_foster_parent_insertions = false;
+    state->_reprocess_current_token = true;
+    state->_insertion_mode = state->_original_insertion_mode;
+    return true;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
+// Handler for the "in caption" insertion mode.  Tokens that terminate the
+// caption close it and switch back to the in-table mode; a fixed set of stray
+// end tags is ignored with a parse error; everything else is delegated to
+// handle_in_body().
+static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
+  if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
+             GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
+             GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
+             GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
+      tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
+             GUMBO_TAG_LAST)) {
+    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    if (!tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
+      // Anything other than </caption> closes the caption implicitly: record
+      // a parse error and have the token reprocessed afterwards.
+      add_parse_error(parser, token);
+      parser->_parser_state->_reprocess_current_token = true;
+    }
+    generate_implied_end_tags(parser, GUMBO_TAG_LAST);
+
+    const bool caption_is_current =
+        node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
+    if (!caption_is_current) {
+      // Other elements are still open inside the caption: record a parse
+      // error and pop them until the <caption> is the current node.
+      add_parse_error(parser, token);
+      while (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
+        pop_current_node(parser);
+      }
+    }
+    pop_current_node(parser);  // The <caption> itself.
+    clear_active_formatting_elements(parser);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
+    return caption_is_current;
+  }
+
+  if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_COL,
+             GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML, GUMBO_TAG_TBODY,
+             GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
+             GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  return handle_in_body(parser, token);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
+// Handler for the "in column group" insertion mode.  Whitespace, comments,
+// <html>, <col> and </col> are handled directly; any other token closes the
+// current <colgroup> and returns to the in-table mode, unless the current
+// node is the root element.
+static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
+  if (token->type == GUMBO_TOKEN_WHITESPACE) {
+    insert_text_token(parser, token);
+    return true;
+  }
+  if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+  if (token->type == GUMBO_TOKEN_COMMENT) {
+    append_comment_node(parser, get_current_node(parser), token);
+    return true;
+  }
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+  if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    return true;
+  }
+  if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  if (get_current_node(parser) == parser->_output->root) {
+    // With the root element as current node, EOF is accepted silently; every
+    // other token is a parse error.
+    if (token->type == GUMBO_TOKEN_EOF) {
+      return true;
+    }
+    add_parse_error(parser, token);
+    return false;
+  }
+
+  // Close the <colgroup> and go back to the in-table mode.  Every token
+  // except </colgroup> itself is then reprocessed there.
+  assert(node_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP));
+  pop_current_node(parser);
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
+  if (!tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
+    parser->_parser_state->_reprocess_current_token = true;
+  }
+  return true;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
+// Handler for the "in table body" insertion mode.  Row and cell start tags
+// move to the in-row mode, section end tags and table-level tags close the
+// current section and return to the in-table mode, a fixed set of stray end
+// tags is ignored with a parse error, and everything else is delegated to
+// handle_in_table().
+static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
+  if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
+    clear_stack_to_table_body_context(parser);
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
+    return true;
+  }
+
+  if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
+    // A cell with no enclosing row: record a parse error, insert an implied
+    // <tr>, and reprocess the token in the in-row mode.
+    add_parse_error(parser, token);
+    clear_stack_to_table_body_context(parser);
+    insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
+    parser->_parser_state->_reprocess_current_token = true;
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
+    return false;
+  }
+
+  if (tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
+             GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
+    const GumboTag section_tag = token->v.end_tag;
+    if (!has_an_element_in_table_scope(parser, section_tag)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    clear_stack_to_table_body_context(parser);
+    pop_current_node(parser);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
+    return true;
+  }
+
+  if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
+             GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
+             GUMBO_TAG_THEAD, GUMBO_TAG_LAST) ||
+      tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
+    const bool no_section_in_scope =
+        !has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) &&
+        !has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) &&
+        !has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT);
+    if (no_section_in_scope) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    // Close the open section, then let the in-table mode handle the token.
+    clear_stack_to_table_body_context(parser);
+    pop_current_node(parser);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
+    parser->_parser_state->_reprocess_current_token = true;
+    return true;
+  }
+
+  if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
+             GUMBO_TAG_COL, GUMBO_TAG_TR, GUMBO_TAG_COLGROUP,
+             GUMBO_TAG_HTML, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  return handle_in_table(parser, token);
+}
+
+// Token handler for the "in row" insertion mode (inside an open <tr>):
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
+// Branches that record a parse error return false; tokens without a
+// dedicated rule are handed to handle_in_table.
+static bool handle_in_row(GumboParser* parser, GumboToken* token) {
+  // <th>/<td>: open the cell and push a scope marker onto the list of
+  // active formatting elements.
+  if (tag_in(token, kStartTag, GUMBO_TAG_TH, GUMBO_TAG_TD, GUMBO_TAG_LAST)) {
+    clear_stack_to_table_row_context(parser);
+    insert_element_from_token(parser, token);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
+    add_formatting_element(parser, &kActiveFormattingScopeMarker);
+    return true;
+  }
+
+  // Four spec clauses collapse into this branch; each says "act as if an end
+  // tag with the tag name "tr" had been seen".  They differ only in error
+  // handling and in whether the current token is reprocessed afterwards.
+  if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COLGROUP,
+             GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
+             GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
+      tag_in(token, kEndTag, GUMBO_TAG_TR, GUMBO_TAG_TABLE,
+             GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
+             GUMBO_TAG_LAST)) {
+    // A section end tag must itself be in table scope; for everything else
+    // the requirement is an open <tr>.
+    GumboTag required_tag = GUMBO_TAG_TR;
+    if (tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
+               GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
+      required_tag = token->v.end_tag;
+    }
+
+    if (!has_an_element_in_table_scope(parser, required_tag)) {
+      gumbo_debug("Bailing because there is no tag %s in table scope.\nOpen elements:",
+                 gumbo_normalized_tagname(required_tag));
+      const GumboVector* open_elements =
+          &parser->_parser_state->_open_elements;
+      for (int idx = 0; idx < open_elements->length; ++idx) {
+        const GumboNode* open_node = open_elements->data[idx];
+        gumbo_debug("%s\n", gumbo_normalized_tagname(open_node->v.element.tag));
+      }
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+
+    clear_stack_to_table_row_context(parser);
+    GumboNode* closed_row = pop_current_node(parser);
+    assert(node_tag_is(closed_row, GUMBO_TAG_TR));
+    AVOID_UNUSED_VARIABLE_WARNING(closed_row);
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
+    // Only a literal </tr> is consumed here; any other token is handled
+    // again in the new insertion mode.
+    if (!tag_is(token, kEndTag, GUMBO_TAG_TR)) {
+      parser->_parser_state->_reprocess_current_token = true;
+    }
+    return true;
+  }
+
+  // End tags with no meaning in this mode are dropped with a parse error.
+  if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
+             GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
+             GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  return handle_in_table(parser, token);
+}
+
+// Token handler for the "in cell" insertion mode (inside an open <td>/<th>):
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
+// Branches that record a parse error return false; tokens without a
+// dedicated rule are processed by handle_in_body.
+static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
+  if (tag_in(token, kEndTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
+    GumboTag token_tag = token->v.end_tag;
+    if (!has_an_element_in_table_scope(parser, token_tag)) {
+      // A stray </td> or </th> is a parse error and the token is ignored,
+      // the same way every other rejected end tag in this handler is.
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    return close_table_cell(parser, token, token_tag);
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
+                    GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
+                    GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
+                    GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
+    // A table-structure start tag implicitly ends the current cell; the
+    // token is then reprocessed outside of it.
+    gumbo_debug("Handling <td> in cell.\n");
+    if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
+        !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
+      gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    parser->_parser_state->_reprocess_current_token = true;
+    return close_current_cell(parser, token);
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
+                    GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
+                    GUMBO_TAG_LAST)) {
+    // End tags with no meaning inside a cell are dropped with a parse error.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (tag_in(token, kEndTag, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
+                    GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
+                    GUMBO_TAG_LAST)) {
+    // End tags of enclosing table structure close the cell first, provided
+    // the named element is actually in table scope, and are then reprocessed.
+    if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    parser->_parser_state->_reprocess_current_token = true;
+    return close_current_cell(parser, token);
+  } else {
+    return handle_in_body(parser, token);
+  }
+}
+
+// Token handler for the "in select" insertion mode (inside an open <select>):
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
+// Returns true when the token was handled cleanly and false when a parse
+// error was recorded.  <html> start tags are delegated to handle_in_body and
+// <script> start tags to handle_in_head.
+static bool handle_in_select(GumboParser* parser, GumboToken* token) {
+  if (token->type == GUMBO_TOKEN_NULL) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (token->type == GUMBO_TOKEN_CHARACTER ||
+             token->type == GUMBO_TOKEN_WHITESPACE) {
+    insert_text_token(parser, token);
+    return true;
+  } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  } else if (token->type == GUMBO_TOKEN_COMMENT) {
+    append_comment_node(parser, get_current_node(parser), token);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
+    // A new <option> implicitly closes the one that is still open.
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
+      pop_current_node(parser);
+    }
+    insert_element_from_token(parser, token);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
+    // A new <optgroup> implicitly closes an open <option> and then an open
+    // <optgroup>.
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
+      pop_current_node(parser);
+    }
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
+      pop_current_node(parser);
+    }
+    insert_element_from_token(parser, token);
+    return true;
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
+    GumboVector* open_elements = &parser->_parser_state->_open_elements;
+    // An <option> sitting directly inside an <optgroup> is closed first.
+    // The length check keeps the look at the second-from-top stack entry in
+    // bounds.
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
+        open_elements->length >= 2 &&
+        node_tag_is(open_elements->data[open_elements->length - 2],
+                    GUMBO_TAG_OPTGROUP)) {
+      pop_current_node(parser);
+    }
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
+      pop_current_node(parser);
+      return true;
+    } else {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
+      pop_current_node(parser);
+      return true;
+    } else {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+  } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
+    if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    close_current_select(parser);
+    return true;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
+    // A nested <select> is an error that acts like </select>.  As in the
+    // </select> branch above, the stack is only unwound when a <select> is
+    // actually in select scope.
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
+      close_current_select(parser);
+    }
+    return false;
+  } else if (tag_in(token, kStartTag, GUMBO_TAG_INPUT, GUMBO_TAG_KEYGEN,
+                    GUMBO_TAG_TEXTAREA, GUMBO_TAG_LAST)) {
+    // These controls cannot live inside a <select>: close it and reprocess
+    // the token, or drop the token when no <select> is in scope.
+    add_parse_error(parser, token);
+    if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
+      ignore_token(parser);
+    } else {
+      close_current_select(parser);
+      parser->_parser_state->_reprocess_current_token = true;
+    }
+    return false;
+  } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
+    return handle_in_head(parser, token);
+  } else if (token->type == GUMBO_TOKEN_EOF) {
+    // EOF is only clean when the current node is the root element.
+    if (get_current_node(parser) != parser->_output->root) {
+      add_parse_error(parser, token);
+      return false;
+    }
+    return true;
+  } else {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+}
+
+// Token handler for the "in select in table" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
+// Table-structure tags always record a parse error (return false); every
+// other token is handled by handle_in_select.
+static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
+  // Table-structure start tag: abandon the <select> and reprocess the token.
+  if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
+             GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
+             GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    close_current_select(parser);
+    parser->_parser_state->_reprocess_current_token = true;
+    return false;
+  }
+
+  // Table-structure end tag: same treatment, but only when the named element
+  // is in table scope; otherwise the token is dropped.
+  if (tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
+             GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
+             GUMBO_TAG_TR, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
+    add_parse_error(parser, token);
+    if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
+      ignore_token(parser);
+      return false;
+    }
+    close_current_select(parser);
+    reset_insertion_mode_appropriately(parser);
+    parser->_parser_state->_reprocess_current_token = true;
+    return false;
+  }
+
+  return handle_in_select(parser, token);
+}
+
+// Placeholder handler for the "in template" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
+// Not implemented: every token is reported as handled and nothing else is
+// done with it.
+static bool handle_in_template(GumboParser* parser, GumboToken* token) {
+  // TODO(jdtang): Implement this.
+  (void) parser;
+  (void) token;
+  return true;
+}
+
+// Token handler for the "after body" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
+// Branches that record a parse error return false.
+static bool handle_after_body(GumboParser* parser, GumboToken* token) {
+  // Whitespace and <html> start tags follow the "in body" rules.
+  if (token->type == GUMBO_TOKEN_WHITESPACE ||
+      tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+
+  // Comments are attached to the output's root node.
+  if (token->type == GUMBO_TOKEN_COMMENT) {
+    GumboNode* root = parser->_output->root;
+    assert(root != NULL);
+    append_comment_node(parser, root, token);
+    return true;
+  }
+
+  if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    add_parse_error(parser, token);
+    ignore_token(parser);
+    return false;
+  }
+
+  // </html>: switch modes and record where the <html> element ended.
+  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
+    // TODO(jdtang): Handle fragment parsing algorithm case.
+    GumboParserState* state = parser->_parser_state;
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
+    GumboNode* html = state->_open_elements.data[0];
+    assert(node_tag_is(html, GUMBO_TAG_HTML));
+    record_end_of_element(state->_current_token, &html->v.element);
+    return true;
+  }
+
+  if (token->type == GUMBO_TOKEN_EOF) {
+    return true;
+  }
+
+  // Anything else: parse error, return to "in body" mode and reprocess the
+  // token there.
+  add_parse_error(parser, token);
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
+  parser->_parser_state->_reprocess_current_token = true;
+  return false;
+}
+
+// Token handler for the "in frameset" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
+// Branches that record a parse error return false.
+static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
+  // Tokens that are decided by their type alone.
+  switch (token->type) {
+    case GUMBO_TOKEN_WHITESPACE:
+      insert_text_token(parser, token);
+      return true;
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_current_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_DOCTYPE:
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
+    insert_element_from_token(parser, token);
+    return true;
+  }
+
+  if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
+    // </frameset> is rejected while the current node is the <html> element.
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    }
+    pop_current_node(parser);
+    // TODO(jdtang): Add a condition to ignore this for the fragment parsing
+    // algorithm.
+    if (!node_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
+      set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
+    }
+    return true;
+  }
+
+  // <frame> is inserted and popped again right away.
+  if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
+    insert_element_from_token(parser, token);
+    pop_current_node(parser);
+    acknowledge_self_closing_tag(parser);
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
+    return handle_in_head(parser, token);
+  }
+
+  // EOF is only clean when the current node is the <html> element.
+  if (token->type == GUMBO_TOKEN_EOF) {
+    if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
+      return true;
+    }
+    add_parse_error(parser, token);
+    return false;
+  }
+
+  add_parse_error(parser, token);
+  ignore_token(parser);
+  return false;
+}
+
+// Token handler for the "after frameset" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
+// Branches that record a parse error return false.
+static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
+  // Tokens that are decided by their type alone.
+  switch (token->type) {
+    case GUMBO_TOKEN_WHITESPACE:
+      insert_text_token(parser, token);
+      return true;
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_current_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_DOCTYPE:
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+
+  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
+    set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
+    return handle_in_head(parser, token);
+  }
+
+  if (token->type == GUMBO_TOKEN_EOF) {
+    return true;
+  }
+
+  add_parse_error(parser, token);
+  ignore_token(parser);
+  return false;
+}
+
+// Token handler for the "after after body" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
+// Returns false only for the fallback branch, which records a parse error.
+static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
+  switch (token->type) {
+    case GUMBO_TOKEN_COMMENT:
+      // Comments this late are attached to the document node itself.
+      append_comment_node(parser, get_document_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_DOCTYPE:
+    case GUMBO_TOKEN_WHITESPACE:
+      return handle_in_body(parser, token);
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+
+  if (token->type == GUMBO_TOKEN_EOF) {
+    return true;
+  }
+
+  // Anything else: parse error, return to "in body" mode and reprocess the
+  // token there.
+  add_parse_error(parser, token);
+  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
+  parser->_parser_state->_reprocess_current_token = true;
+  return false;
+}
+
+// Token handler for the "after after frameset" insertion mode:
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
+// Returns false only for the fallback branch, which records a parse error.
+static bool handle_after_after_frameset(
+    GumboParser* parser, GumboToken* token) {
+  switch (token->type) {
+    case GUMBO_TOKEN_COMMENT:
+      // Comments this late are attached to the document node itself.
+      append_comment_node(parser, get_document_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_DOCTYPE:
+    case GUMBO_TOKEN_WHITESPACE:
+      return handle_in_body(parser, token);
+    default:
+      break;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
+    return handle_in_body(parser, token);
+  }
+
+  if (token->type == GUMBO_TOKEN_EOF) {
+    return true;
+  }
+
+  if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
+    return handle_in_head(parser, token);
+  }
+
+  add_parse_error(parser, token);
+  ignore_token(parser);
+  return false;
+}
+
+// Function pointers for each insertion mode.  Keep in sync with insertion_mode.h:
+// handle_html_content indexes this table with the numeric value of the parser's current insertion mode.
+typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);  // Signature shared by every per-mode handler listed below.
+static const TokenHandler kTokenHandlers[] = {  // One entry per insertion mode; entry order matters.
+  handle_initial,
+  handle_before_html,
+  handle_before_head,
+  handle_in_head,
+  handle_in_head_noscript,
+  handle_after_head,
+  handle_in_body,
+  handle_text,
+  handle_in_table,
+  handle_in_table_text,
+  handle_in_caption,
+  handle_in_column_group,
+  handle_in_table_body,
+  handle_in_row,
+  handle_in_cell,
+  handle_in_select,
+  handle_in_select_in_table,
+  handle_in_template,  // Stub: currently accepts every token without acting on it.
+  handle_after_body,
+  handle_in_frameset,
+  handle_after_frameset,
+  handle_after_after_body,
+  handle_after_after_frameset
+};
+
+// Looks up the handler registered in kTokenHandlers for the parser's current
+// insertion mode, runs it on `token`, and returns the handler's result.
+static bool handle_html_content(GumboParser* parser, GumboToken* token) {
+  const unsigned int mode_index =
+      (unsigned int) parser->_parser_state->_insertion_mode;
+  const TokenHandler handler = kTokenHandlers[mode_index];
+  return handler(parser, token);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
+static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {  // Called by handle_token for tokens that do not go through the HTML insertion-mode handlers.
+  switch (token->type) {
+    case GUMBO_TOKEN_NULL:
+      add_parse_error(parser, token);
+      token->type = GUMBO_TOKEN_CHARACTER;  // The NULL token is rewritten in place as a replacement character.
+      token->v.character = kUtf8ReplacementChar;
+      insert_text_token(parser, token);
+      return false;  // Text was inserted, but a parse error was recorded.
+    case GUMBO_TOKEN_WHITESPACE:
+      insert_text_token(parser, token);
+      return true;
+    case GUMBO_TOKEN_CHARACTER:
+      insert_text_token(parser, token);
+      set_frameset_not_ok(parser);
+      return true;
+    case GUMBO_TOKEN_COMMENT:
+      append_comment_node(parser, get_current_node(parser), token);
+      return true;
+    case GUMBO_TOKEN_DOCTYPE:
+      add_parse_error(parser, token);
+      ignore_token(parser);
+      return false;
+    default:
+      // Fall through to the if-statements below.
+      break;
+  }
+  // Order matters for these clauses.
+  if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
+             GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BODY, GUMBO_TAG_BR,
+             GUMBO_TAG_CENTER, GUMBO_TAG_CODE, GUMBO_TAG_DD, GUMBO_TAG_DIV,
+             GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EM, GUMBO_TAG_EMBED,
+             GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
+             GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD, GUMBO_TAG_HR,
+             GUMBO_TAG_I, GUMBO_TAG_IMG, GUMBO_TAG_LI, GUMBO_TAG_LISTING,
+             GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NOBR, GUMBO_TAG_OL,
+             GUMBO_TAG_P, GUMBO_TAG_PRE, GUMBO_TAG_RUBY, GUMBO_TAG_S,
+             GUMBO_TAG_SMALL, GUMBO_TAG_SPAN, GUMBO_TAG_STRONG,
+             GUMBO_TAG_STRIKE, GUMBO_TAG_SUB, GUMBO_TAG_SUP,
+             GUMBO_TAG_TABLE, GUMBO_TAG_TT, GUMBO_TAG_U, GUMBO_TAG_UL,
+             GUMBO_TAG_VAR, GUMBO_TAG_LAST) ||
+     (tag_is(token, kStartTag, GUMBO_TAG_FONT) && (
+         token_has_attribute(token, "color") ||
+         token_has_attribute(token, "face") ||
+         token_has_attribute(token, "size")))) {
+    add_parse_error(parser, token);
+    do {  // Pop until the current node is an integration point or an HTML-namespace element.
+      pop_current_node(parser);
+    } while(!(is_mathml_integration_point(get_current_node(parser)) ||
+              is_html_integration_point(get_current_node(parser)) ||
+              get_current_node(parser)->v.element.tag_namespace ==
+              GUMBO_NAMESPACE_HTML));
+    parser->_parser_state->_reprocess_current_token = true;  // The same token is handled again after the stack has been unwound.
+    return false;
+  } else if (token->type == GUMBO_TOKEN_START_TAG) {
+    const GumboNamespaceEnum current_namespace =
+        get_current_node(parser)->v.element.tag_namespace;  // The new element is inserted in the current node's namespace.
+    if (current_namespace == GUMBO_NAMESPACE_MATHML) {
+      adjust_mathml_attributes(parser, token);
+    }
+    if (current_namespace == GUMBO_NAMESPACE_SVG) {
+      // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
+      // function.
+      adjust_svg_attributes(parser, token);
+    }
+    adjust_foreign_attributes(parser, token);
+    insert_foreign_element(parser, token, current_namespace);
+    if (token->v.start_tag.is_self_closing) {
+      pop_current_node(parser);
+      acknowledge_self_closing_tag(parser);
+    }
+    return true;
+  // </script> tags are handled like any other end tag, putting the script's
+  // text into a text node child and closing the current node.
+  } else {
+    assert(token->type == GUMBO_TOKEN_END_TAG);
+    GumboNode* node = get_current_node(parser);
+    assert(node != NULL);
+    GumboStringPiece token_tagname = token->original_text;  // Tag names are compared using the original source text of token and node.
+    GumboStringPiece node_tagname = node->v.element.original_tag;
+    gumbo_tag_from_original_text(&token_tagname);
+    gumbo_tag_from_original_text(&node_tagname);
+
+    bool is_success = true;  // Cleared when the end tag's name differs from the current node's name.
+    if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
+      add_parse_error(parser, token);
+      is_success = false;
+    }
+    int i = parser->_parser_state->_open_elements.length;  // The --i in the loop header turns this into the index of the last open element.
+    for( --i; i > 0; ) {  // i is decremented inside the body, after the name comparison.
+      // Here we move up the stack until we find an HTML element (in which
+      // case we do nothing) or we find the element that we're about to
+      // close (in which case we pop everything we've seen until that
+      // point.)
+      gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
+                  node_tagname.data, i);
+      if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
+        gumbo_debug("Matches.\n");
+        while (pop_current_node(parser) != node) {
+          // Pop all the nodes below the current one.  Node is guaranteed to
+          // be an element on the stack of open elements (set below), so
+          // this loop is guaranteed to terminate.
+        }
+        return is_success;
+      }
+      --i;
+      node = parser->_parser_state->_open_elements.data[i];  // Step to the next entry further down the stack of open elements.
+      if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
+        // Must break before gumbo_tag_from_original_text to avoid passing
+        // parser-inserted nodes through.
+        break;
+      }
+      node_tagname = node->v.element.original_tag;
+      gumbo_tag_from_original_text(&node_tagname);
+    }
+    assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
+    // We can't call handle_token directly because the current node is still in
+    // the SVG namespace, so it would re-enter this and result in infinite
+    // recursion.
+    return handle_html_content(parser, token) && is_success;  // A name mismatch recorded above still makes the overall result false.
+  }
+}
+
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
+static bool handle_token(GumboParser* parser, GumboToken* token) {  // Top-level dispatch: sends one token to the HTML handlers or to the foreign-content handler.
+  if (parser->_parser_state->_ignore_next_linefeed &&
+      token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
+    parser->_parser_state->_ignore_next_linefeed = false;
+    ignore_token(parser);  // Drop the one linefeed that was flagged to be skipped.
+    return true;
+  }
+  // This needs to be reset both here and in the conditional above to catch both
+  // the case where the next token is not whitespace (so we don't ignore
+  // whitespace in the middle of <pre> tags) and where there are multiple
+  // whitespace tokens (so we don't ignore the second one).
+  parser->_parser_state->_ignore_next_linefeed = false;
+
+  if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
+    parser->_parser_state->_closed_body_tag = true;  // Recorded in parser state; not read in this function.
+  }
+  if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
+    parser->_parser_state->_closed_html_tag = true;  // Recorded in parser state; not read in this function.
+  }
+
+  const GumboNode* current_node = get_current_node(parser);
+  assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT);
+  if (current_node) {
+    gumbo_debug("Current node: <%s>.\n",
+                gumbo_normalized_tagname(current_node->v.element.tag));
+  }
+  if (!current_node ||  // No current node: use the HTML handlers (this test also guards the dereferences below).
+      current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
+      (is_mathml_integration_point(current_node) &&
+       (token->type == GUMBO_TOKEN_CHARACTER ||
+        token->type == GUMBO_TOKEN_WHITESPACE ||
+        token->type == GUMBO_TOKEN_NULL ||
+        (token->type == GUMBO_TOKEN_START_TAG &&
+         !tag_in(token, kStartTag, GUMBO_TAG_MGLYPH, GUMBO_TAG_MALIGNMARK,
+                GUMBO_TAG_LAST)))) ||
+      (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
+       node_tag_is(current_node, GUMBO_TAG_ANNOTATION_XML) &&
+       tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
+      (is_html_integration_point(current_node) && (
+          token->type == GUMBO_TOKEN_START_TAG ||
+          token->type == GUMBO_TOKEN_CHARACTER ||
+          token->type == GUMBO_TOKEN_NULL ||
+          token->type == GUMBO_TOKEN_WHITESPACE)) ||
+      token->type == GUMBO_TOKEN_EOF) {  // EOF always goes through the HTML handlers.
+    return handle_html_content(parser, token);
+  } else {
+    return handle_in_foreign_content(parser, token);
+  }
+}
+
+// Convenience entry point: parses the NUL-terminated string `buffer` with
+// kGumboDefaultOptions and returns the result of gumbo_parse_with_options.
+GumboOutput* gumbo_parse(const char* buffer) {
+  const size_t buffer_length = strlen(buffer);
+  return gumbo_parse_with_options(
+      &kGumboDefaultOptions, buffer, buffer_length);
+}
+
+// Parses `length` bytes starting at `buffer` using `options` and returns the
+// resulting GumboOutput.  Tokens are lexed and passed to handle_token until
+// an EOF token has been handled with no reprocessing pending, or until the
+// first error when options->stop_on_first_error is set.  The tokenizer and
+// parser state are destroyed before returning; only the output survives.
+GumboOutput* gumbo_parse_with_options(
+    const GumboOptions* options, const char* buffer, size_t length) {
+  GumboParser parser;
+  parser._options = options;
+  output_init(&parser);
+  gumbo_tokenizer_state_init(&parser, buffer, length);
+  parser_state_init(&parser);
+
+  GumboParserState* state = parser._parser_state;
+  // "%.*s" takes its precision as an int, so the size_t length is converted
+  // explicitly instead of being passed through the varargs as-is.
+  gumbo_debug("Parsing %.*s.\n", (int) length, buffer);
+
+  // Sanity check so that infinite loops die with an assertion failure instead
+  // of hanging the process before we ever get an error.
+  int loop_count = 0;
+
+  GumboToken token;
+  bool has_error = false;
+  do {
+    if (state->_reprocess_current_token) {
+      // A handler asked to see the same token again; do not lex a new one.
+      state->_reprocess_current_token = false;
+    } else {
+      GumboNode* current_node = get_current_node(&parser);
+      gumbo_tokenizer_set_is_current_node_foreign(
+          &parser, current_node &&
+          current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
+      has_error = !gumbo_lex(&parser, &token) || has_error;
+    }
+    // Human-readable token description, used only by the debug message.
+    const char* token_type = "text";
+    switch (token.type) {
+      case GUMBO_TOKEN_DOCTYPE:
+        token_type = "doctype";
+        break;
+      case GUMBO_TOKEN_START_TAG:
+        token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
+        break;
+      case GUMBO_TOKEN_END_TAG:
+        token_type = gumbo_normalized_tagname(token.v.end_tag);
+        break;
+      case GUMBO_TOKEN_COMMENT:
+        token_type = "comment";
+        break;
+      default:
+        break;
+    }
+    gumbo_debug("Handling %s token @%d:%d in state %d.\n",
+               (char*) token_type, token.position.line, token.position.column,
+               state->_insertion_mode);
+
+    state->_current_token = &token;
+    // Starts out false only for self-closing start tags; the check after
+    // handle_token reports the tag if the flag is still false by then.
+    state->_self_closing_flag_acknowledged =
+        !(token.type == GUMBO_TOKEN_START_TAG &&
+          token.v.start_tag.is_self_closing);
+
+    has_error = !handle_token(&parser, &token) || has_error;
+
+    // Check for memory leaks when ownership is transferred from start tag
+    // tokens to nodes.
+    assert(state->_reprocess_current_token ||
+           token.type != GUMBO_TOKEN_START_TAG ||
+           token.v.start_tag.attributes.data == NULL);
+
+    if (!state->_self_closing_flag_acknowledged) {
+      GumboError* error = add_parse_error(&parser, &token);
+      if (error) {
+        error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
+      }
+    }
+
+    ++loop_count;
+    assert(loop_count < 1000000000);
+
+  } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
+           !(options->stop_on_first_error && has_error));
+
+  finish_parsing(&parser);
+  // For API uniformity reasons, if the doctype still has nulls, convert them to
+  // empty strings.
+  GumboDocument* doc_type = &parser._output->document->v.document;
+  if (doc_type->name == NULL) {
+    doc_type->name = gumbo_copy_stringz(&parser, "");
+  }
+  if (doc_type->public_identifier == NULL) {
+    doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
+  }
+  if (doc_type->system_identifier == NULL) {
+    doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
+  }
+
+  parser_state_destroy(&parser);
+  gumbo_tokenizer_state_destroy(&parser);
+  return parser._output;
+}
+
+// Destroys `node` via destroy_node, using the allocator that comes with
+// `options`.
+void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
+  // destroy_node wants a GumboParser, but the allocator travels with the
+  // options object, so a scratch parser with only _options set is passed.
+  GumboParser scratch;
+  scratch._options = options;
+
+  destroy_node(&scratch, node);
+}
+
+// Releases everything reachable from `output`: the document node, every
+// recorded error, the storage of the errors vector, and finally the output
+// struct itself.
+void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
+  // The allocator travels with the options object, so a scratch parser with
+  // only _options set is passed to the destroy routines.
+  GumboParser scratch;
+  scratch._options = options;
+
+  destroy_node(&scratch, output->document);
+
+  GumboVector* errors = &output->errors;
+  for (int idx = 0; idx < errors->length; ++idx) {
+    gumbo_error_destroy(&scratch, errors->data[idx]);
+  }
+  gumbo_vector_destroy(&scratch, errors);
+
+  gumbo_parser_deallocate(&scratch, output);
+}

+ 57 - 0
gumbo/parser.h

@@ -0,0 +1,57 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// Contains the definition of the top-level GumboParser structure that's
+// threaded through basically every internal function in the library.
+
+#ifndef GUMBO_PARSER_H_
+#define GUMBO_PARSER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParserState;
+struct GumboInternalOutput;
+struct GumboInternalOptions;
+struct GumboInternalTokenizerState;
+
+// An overarching struct that's threaded through (nearly) all functions in the
+// library, OOP-style.  This gives each function access to the options and
+// output, along with any internal state needed for the parse.
+typedef struct GumboInternalParser {
+  // Settings for this parse run.
+  const struct GumboInternalOptions* _options;  // Read-only through this pointer (const-qualified).
+
+  // Output for the parse.
+  struct GumboInternalOutput* _output;
+
+  // The internal tokenizer state, defined as a pointer to avoid a cyclic
+  // dependency on html5tokenizer.h.  The main parse routine is responsible for
+  // initializing this on parse start, and destroying it on parse end.
+  // End-users will never see a non-garbage value in this pointer.
+  struct GumboInternalTokenizerState* _tokenizer_state;
+
+  // The internal parser state.  Initialized on parse start and destroyed on
+  // parse end; end-users will never see a non-garbage value in this pointer.
+  struct GumboInternalParserState* _parser_state;
+} GumboParser;  // Every member is a pointer to a struct that is only forward-declared in this header.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_PARSER_H_

+ 106 - 0
gumbo/string_buffer.c

@@ -0,0 +1,106 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "string_buffer.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "string_piece.h"
+#include "util.h"
+
+struct GumboInternalParser;
+
+// Capacity, in bytes, of the backing store that gumbo_string_buffer_init
+// allocates for a new buffer.
+static const size_t kDefaultStringBufferSize = 10;
+
+// Makes sure |buffer| can hold |additional_chars| more bytes beyond its current
+// length.  The capacity is doubled until it is large enough; if that changes
+// the capacity, a new backing store is allocated, the existing bytes are
+// copied into it, and the old store is released.
+static void maybe_resize_string_buffer(
+    struct GumboInternalParser* parser, size_t additional_chars,
+    GumboStringBuffer* buffer) {
+  const size_t required = buffer->length + additional_chars;
+  size_t grown = buffer->capacity;
+  for (; grown < required; grown *= 2) {
+    // Keep doubling; the loop header does all the work.
+  }
+  if (grown == buffer->capacity) {
+    // Already big enough: leave the existing allocation untouched.
+    return;
+  }
+  char* replacement = gumbo_parser_allocate(parser, grown);
+  memcpy(replacement, buffer->data, buffer->length);
+  gumbo_parser_deallocate(parser, buffer->data);
+  buffer->data = replacement;
+  buffer->capacity = grown;
+}
+
+// Puts |output| into its initial state: zero length and a freshly allocated
+// backing store of kDefaultStringBufferSize bytes.  Whatever output->data held
+// before the call is overwritten without being released.
+void gumbo_string_buffer_init(
+    struct GumboInternalParser* parser, GumboStringBuffer* output) {
+  const size_t initial_capacity = kDefaultStringBufferSize;
+  output->length = 0;
+  output->capacity = initial_capacity;
+  output->data = gumbo_parser_allocate(parser, initial_capacity);
+}
+
+// Grows |output| if necessary so that its capacity is at least |min_capacity|
+// bytes.  The length and the stored bytes are left unchanged.
+void gumbo_string_buffer_reserve(
+    struct GumboInternalParser* parser, size_t min_capacity,
+    GumboStringBuffer* output) {
+  // Unsigned arithmetic: when min_capacity is below the current length this
+  // difference wraps around, but maybe_resize_string_buffer adds the length
+  // back (wrapping again), so the size it aims for is exactly min_capacity.
+  const size_t shortfall = min_capacity - output->length;
+  maybe_resize_string_buffer(parser, shortfall, output);
+}
+
+// Encodes codepoint |c| as UTF-8 and appends the resulting 1-4 bytes to
+// |output|, growing the buffer first if needed.
+void gumbo_string_buffer_append_codepoint(
+    struct GumboInternalParser* parser, int c, GumboStringBuffer* output) {
+  // continuation_count is the number of bytes that follow the lead byte, i.e.
+  // one less than the total encoded size; lead_marker holds the high bits that
+  // tag the lead byte for that size.
+  int continuation_count;
+  int lead_marker;
+  if (c > 0xffff) {
+    continuation_count = 3;
+    lead_marker = 0xf0;
+  } else if (c > 0x7ff) {
+    continuation_count = 2;
+    lead_marker = 0xe0;
+  } else if (c > 0x7f) {
+    continuation_count = 1;
+    lead_marker = 0xc0;
+  } else {
+    continuation_count = 0;
+    lead_marker = 0;
+  }
+  maybe_resize_string_buffer(parser, continuation_count + 1, output);
+
+  // Lead byte: marker bits plus the most significant payload bits.
+  output->data[output->length++] =
+      lead_marker | (c >> (continuation_count * 6));
+
+  // Continuation bytes, most significant 6-bit group first.
+  int remaining = continuation_count;
+  while (remaining-- > 0) {
+    output->data[output->length++] = 0x80 | (0x3f & (c >> (remaining * 6)));
+  }
+}
+
+// Appends the str->length bytes starting at str->data to the end of |output|,
+// growing the buffer first if needed.
+void gumbo_string_buffer_append_string(
+    struct GumboInternalParser* parser, GumboStringPiece* str,
+    GumboStringBuffer* output) {
+  const size_t piece_length = str->length;
+  maybe_resize_string_buffer(parser, piece_length, output);
+  // Compute the write position only after the resize, since the resize may
+  // have replaced output->data.
+  char* const tail = output->data + output->length;
+  memcpy(tail, str->data, piece_length);
+  output->length += piece_length;
+}
+
+// Returns a newly allocated, nul-terminated copy of the bytes currently held
+// in |input|.  The buffer itself is not modified.
+char* gumbo_string_buffer_to_string(
+    struct GumboInternalParser* parser, GumboStringBuffer* input) {
+  const size_t byte_count = input->length;
+  // One extra byte for the terminator, which the buffer does not store.
+  char* copy = gumbo_parser_allocate(parser, byte_count + 1);
+  memcpy(copy, input->data, byte_count);
+  copy[byte_count] = '\0';
+  return copy;
+}
+
+// Releases the backing store of |buffer| through gumbo_parser_deallocate.
+// The struct's fields are not reset, so buffer->data is left dangling until
+// the buffer is re-initialized.
+void gumbo_string_buffer_destroy(
+    struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
+  gumbo_parser_deallocate(parser, buffer->data);
+}

+ 81 - 0
gumbo/string_buffer.h

@@ -0,0 +1,81 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+#ifndef GUMBO_STRING_BUFFER_H_
+#define GUMBO_STRING_BUFFER_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "gumbo.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParser;
+
+// A struct representing a mutable, growable string.  This consists of a
+// heap-allocated buffer that may grow (by doubling) as necessary.  When
+// converting to a string, this allocates a new buffer that is only as long as
+// it needs to be.  Note that the internal buffer here is *not* nul-terminated,
+// so be sure not to use ordinary string manipulation functions on it.
+typedef struct {
+  // A pointer to the beginning of the heap-allocated buffer.  Note that
+  // gumbo_string_buffer_init allocates this buffer while setting length to
+  // zero, so a zero length does not imply a NULL pointer here.
+  char* data;
+
+  // The length of the string fragment, in bytes.  May be zero.
+  size_t length;
+
+  // The capacity of the buffer, in bytes.
+  size_t capacity;
+} GumboStringBuffer;
+
+// Initializes a new GumboStringBuffer.
+void gumbo_string_buffer_init(
+    struct GumboInternalParser* parser, GumboStringBuffer* output);
+
+// Ensures that the buffer contains at least a certain amount of space.  Most
+// useful with snprintf and the other length-delimited string functions, which
+// may want to write directly into the buffer.
+void gumbo_string_buffer_reserve(
+    struct GumboInternalParser* parser, size_t min_capacity,
+    GumboStringBuffer* output);
+
+// Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
+// This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
+// value of the codepoint.
+void gumbo_string_buffer_append_codepoint(
+    struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
+
+// Appends a string onto the end of the GumboStringBuffer.
+void gumbo_string_buffer_append_string(
+    struct GumboInternalParser* parser, GumboStringPiece* str,
+    GumboStringBuffer* output);
+
+// Converts this string buffer to a nul-terminated char*, allocating a new
+// buffer for it.
+char* gumbo_string_buffer_to_string(
+    struct GumboInternalParser* parser, GumboStringBuffer* input);
+
+// Deallocates this GumboStringBuffer.
+void gumbo_string_buffer_destroy(
+    struct GumboInternalParser* parser, GumboStringBuffer* buffer);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_STRING_BUFFER_H_

+ 49 - 0
gumbo/string_piece.c

@@ -0,0 +1,49 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "string_piece.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "util.h"
+
+struct GumboInternalParser;
+
+// The canonical empty string piece: a NULL data pointer with length zero.
+const GumboStringPiece kGumboEmptyString = { NULL, 0 };
+
+// Returns true iff the two string pieces have the same length and identical
+// bytes.  Two zero-length pieces always compare equal.
+bool gumbo_string_equals(
+    const GumboStringPiece* str1, const GumboStringPiece* str2) {
+  if (str1->length != str2->length) {
+    return false;
+  }
+  // An empty piece may carry a NULL data pointer (see kGumboEmptyString), and
+  // passing NULL to memcmp is undefined even when the count is zero, so settle
+  // the empty case without touching the pointers.
+  if (str1->length == 0) {
+    return true;
+  }
+  return memcmp(str1->data, str2->data, str1->length) == 0;
+}
+
+// Returns true iff the two string pieces have the same length and compare
+// equal under strncasecmp (i.e. ignoring case).  Two zero-length pieces always
+// compare equal.
+bool gumbo_string_equals_ignore_case(
+    const GumboStringPiece* str1, const GumboStringPiece* str2) {
+  if (str1->length != str2->length) {
+    return false;
+  }
+  // An empty piece may carry a NULL data pointer (see kGumboEmptyString);
+  // avoid handing NULL to strncasecmp by settling the empty case here.
+  if (str1->length == 0) {
+    return true;
+  }
+  return strncasecmp(str1->data, str2->data, str1->length) == 0;
+}
+
+// Deep-copies |source| into |dest|: allocates source->length bytes through
+// gumbo_parser_allocate, copies the characters over, and stores the new
+// pointer and length in |dest|.  Whatever dest->data held before is
+// overwritten without being released.
+void gumbo_string_copy(
+    struct GumboInternalParser* parser, GumboStringPiece* dest,
+    const GumboStringPiece* source) {
+  dest->length = source->length;
+  char* buffer = gumbo_parser_allocate(parser, source->length);
+  // An empty source may carry a NULL data pointer (see kGumboEmptyString), and
+  // memcpy requires valid pointers even for a zero count, so only copy when
+  // there is something to copy.
+  if (source->length > 0) {
+    memcpy(buffer, source->data, source->length);
+  }
+  dest->data = buffer;
+}

+ 39 - 0
gumbo/string_piece.h

@@ -0,0 +1,39 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#ifndef GUMBO_STRING_PIECE_H_
+#define GUMBO_STRING_PIECE_H_
+
+#include "gumbo.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParser;
+
+// Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
+// destination and copying over the characters from source.  Dest should be
+// empty, with no buffer allocated; otherwise, this leaks it.
+void gumbo_string_copy(
+    struct GumboInternalParser* parser, GumboStringPiece* dest,
+    const GumboStringPiece* source);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_STRING_PIECE_H_

+ 225 - 0
gumbo/tag.c

@@ -0,0 +1,225 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "gumbo.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <strings.h>    // For strcasecmp.
+
+// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
+// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
+// most common tag names first, or to putting them in alphabetical order and
+// using a binary search.
+// Table of normalized tag names, indexed by GumboTag value:
+// gumbo_normalized_tagname returns kGumboTagNames[tag], and gumbo_tag_enum
+// scans this table linearly to map a name back to its index.
+const char* kGumboTagNames[] = {
+  "html",
+  "head",
+  "title",
+  "base",
+  "link",
+  "meta",
+  "style",
+  "script",
+  "noscript",
+  "template",
+  "body",
+  "article",
+  "section",
+  "nav",
+  "aside",
+  "h1",
+  "h2",
+  "h3",
+  "h4",
+  "h5",
+  "h6",
+  "hgroup",
+  "header",
+  "footer",
+  "address",
+  "p",
+  "hr",
+  "pre",
+  "blockquote",
+  "ol",
+  "ul",
+  "li",
+  "dl",
+  "dt",
+  "dd",
+  "figure",
+  "figcaption",
+  "main",
+  "div",
+  "a",
+  "em",
+  "strong",
+  "small",
+  "s",
+  "cite",
+  "q",
+  "dfn",
+  "abbr",
+  "data",
+  "time",
+  "code",
+  "var",
+  "samp",
+  "kbd",
+  "sub",
+  "sup",
+  "i",
+  "b",
+  "u",
+  "mark",
+  "ruby",
+  "rt",
+  "rp",
+  "bdi",
+  "bdo",
+  "span",
+  "br",
+  "wbr",
+  "ins",
+  "del",
+  "image",
+  "img",
+  "iframe",
+  "embed",
+  "object",
+  "param",
+  "video",
+  "audio",
+  "source",
+  "track",
+  "canvas",
+  "map",
+  "area",
+  "math",
+  "mi",
+  "mo",
+  "mn",
+  "ms",
+  "mtext",
+  "mglyph",
+  "malignmark",
+  "annotation-xml",
+  "svg",
+  "foreignobject",
+  "desc",
+  "table",
+  "caption",
+  "colgroup",
+  "col",
+  "tbody",
+  "thead",
+  "tfoot",
+  "tr",
+  "td",
+  "th",
+  "form",
+  "fieldset",
+  "legend",
+  "label",
+  "input",
+  "button",
+  "select",
+  "datalist",
+  "optgroup",
+  "option",
+  "textarea",
+  "keygen",
+  "output",
+  "progress",
+  "meter",
+  "details",
+  "summary",
+  "menu",
+  "menuitem",
+  "applet",
+  "acronym",
+  "bgsound",
+  "dir",
+  "frame",
+  "frameset",
+  "noframes",
+  "isindex",
+  "listing",
+  "xmp",
+  "nextid",
+  "noembed",
+  "plaintext",
+  "rb",
+  "strike",
+  "basefont",
+  "big",
+  "blink",
+  "center",
+  "font",
+  "marquee",
+  "multicol",
+  "nobr",
+  "spacer",
+  "tt",
+  "",                   // TAG_UNKNOWN
+  "",                   // TAG_LAST
+};
+
+// Looks up the normalized name for |tag| in kGumboTagNames.  The returned
+// pointer refers to the table entry itself; nothing is allocated.  In builds
+// with assertions enabled, a tag above GUMBO_TAG_LAST aborts.
+const char* gumbo_normalized_tagname(GumboTag tag) {
+  assert(tag <= GUMBO_TAG_LAST);
+  const char* const normalized = kGumboTagNames[tag];
+  return normalized;
+}
+
+// TODO(jdtang): Add test for this.
+// Narrows |text|, which must span a complete tag from '<' through '>', down to
+// just the tag name, by adjusting text->data and text->length in place.
+//
+// - If text->data is NULL, |text| is left untouched.
+// - For an end tag ("</name>"), the leading "</" and trailing ">" are dropped.
+// - For a start tag, the leading "<" and trailing ">" are dropped, and the
+//   result is then cut at the first whitespace character or '/'.
+//
+// The shape of the input is only checked by assertions.
+void gumbo_tag_from_original_text(GumboStringPiece* text) {
+  if (text->data == NULL) {
+    return;
+  }
+
+  assert(text->length >= 2);
+  assert(text->data[0] == '<');
+  assert(text->data[text->length - 1] == '>');
+  if (text->data[1] == '/') {
+    // End tag.
+    assert(text->length >= 3);
+    text->data += 2;    // Move past </
+    text->length -= 3;
+  } else {
+    // Start tag.
+    text->data += 1;    // Move past <
+    text->length -= 2;
+    // strnchr is apparently not a standard C library function, so I loop
+    // explicitly looking for whitespace or other illegal tag characters.
+    for (const char* c = text->data; c != text->data + text->length; ++c) {
+      // The <ctype.h> functions require a value representable as unsigned char
+      // (or EOF).  Bytes >= 0x80 are negative where char is signed, so convert
+      // before calling isspace.
+      if (isspace((unsigned char) *c) || *c == '/') {
+        text->length = c - text->data;
+        break;
+      }
+    }
+  }
+}
+
+// Maps |tagname| to its GumboTag by comparing it, ignoring case, against each
+// kGumboTagNames entry below GUMBO_TAG_LAST in order.  Returns the index of
+// the first match, or GUMBO_TAG_UNKNOWN if nothing matches.
+GumboTag gumbo_tag_enum(const char* tagname) {
+  int candidate = 0;
+  while (candidate < GUMBO_TAG_LAST) {
+    // TODO(jdtang): strcasecmp is non-portable, so if we want to support
+    // non-GCC compilers, we'll need some #ifdef magic.  This source already has
+    // pretty significant issues with MSVC6 anyway.
+    if (strcasecmp(tagname, kGumboTagNames[candidate]) == 0) {
+      return candidate;
+    }
+    ++candidate;
+  }
+  return GUMBO_TAG_UNKNOWN;
+}

+ 40 - 0
gumbo/token_type.h

@@ -0,0 +1,40 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#ifndef GUMBO_TOKEN_TYPE_H_
+#define GUMBO_TOKEN_TYPE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// An enum representing the type of token.
+// The character-level kinds (WHITESPACE, CHARACTER, NULL, EOF) are chosen per
+// codepoint by get_char_token_type in tokenizer.c.
+typedef enum {
+  GUMBO_TOKEN_DOCTYPE,
+  GUMBO_TOKEN_START_TAG,
+  GUMBO_TOKEN_END_TAG,
+  GUMBO_TOKEN_COMMENT,
+  GUMBO_TOKEN_WHITESPACE,   // Tab, newline, carriage return, form feed, space.
+  GUMBO_TOKEN_CHARACTER,    // Any codepoint not covered by the other kinds.
+  GUMBO_TOKEN_NULL,         // Codepoint 0.
+  GUMBO_TOKEN_EOF           // Codepoint -1.
+} GumboTokenType;
+
+#ifdef __cplusplus
+}  // extern C
+#endif
+
+#endif  // GUMBO_TOKEN_TYPE_H_

+ 2978 - 0
gumbo/tokenizer.c

@@ -0,0 +1,2978 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// Coding conventions specific to this file:
+//
+// 1. Functions that fill in a token should be named emit_*, and should be
+// followed immediately by a return from the tokenizer (true if no error
+// occurred, false if an error occurred).  Sometimes the emit functions
+// themselves return a boolean so that they can be combined with the return
+// statement; in this case, they should match this convention.
+// 2. Functions that shuffle data from temporaries to final API structures
+// should be named finish_*, and be called just before the tokenizer exits the
+// state that accumulates the temporary.
+// 3. All internal data structures should be kept in an initialized state from
+// tokenizer creation onwards, ready to accept input.  When a buffer's flushed
+// and reset, it should be deallocated and immediately reinitialized.
+// 4. Make sure there are appropriate break statements following each state.
+// 5. Assertions on the state of the temporary and tag buffers are usually a
+// good idea, and should go at the entry point of each state when added.
+// 6. Statement order within states goes:
+//    1. Add parse errors, if appropriate.
+//    2. Call finish_* functions to build up tag state.
+//    3. Switch to new state.  Set _reconsume flag if appropriate.
+//    4. Perform any other temporary buffer manipulation.
+//    5. Emit tokens.
+//    6. Return/break.
+// This order ensures that we can verify that every emit is followed by a
+// return, ensures that the correct state is recorded with any parse errors, and
+// prevents parse error position from being messed up by possible mark/resets in
+// temporary buffer manipulation.
+
+
+#include "tokenizer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "attribute.h"
+#include "char_ref.h"
+#include "error.h"
+#include "gumbo.h"
+#include "parser.h"
+#include "string_buffer.h"
+#include "string_piece.h"
+#include "token_type.h"
+#include "tokenizer_states.h"
+#include "utf8.h"
+#include "util.h"
+#include "vector.h"
+
+// Compared against _script_data_buffer to determine if we're in double-escaped
+// script mode.  The length (6) counts the bytes of "script" without the
+// terminating nul.
+const GumboStringPiece kScriptTag = { "script", 6 };
+
+// An enum for the return value of each individual state.
+// Tells the tokenizer's driver what to do after a state has processed the
+// current character.
+typedef enum {
+  RETURN_ERROR,         // Return false (error) from the tokenizer.
+  RETURN_SUCCESS,       // Return true (success) from the tokenizer.
+  NEXT_CHAR             // Proceed to the next character and continue lexing.
+} StateResult;
+
+// This is a struct containing state necessary to build up a tag token,
+// character by character.
+typedef struct GumboInternalTagState {
+  // A buffer to accumulate characters for various GumboStringPiece fields.
+  GumboStringBuffer _buffer;
+
+  // A pointer to the start of the original text corresponding to the contents
+  // of the buffer.
+  const char* _original_text;
+
+  // The current tag enum, computed once the tag name state has finished so that
+  // the buffer can be re-used for building up attributes.
+  GumboTag _tag;
+
+  // The starting location of the text in the buffer.
+  GumboSourcePosition _start_pos;
+
+  // The current list of attributes.  This is copied (and ownership of its data
+  // transferred) to the GumboStartTag token upon completion of the tag.  New
+  // attributes are added as soon as their attribute name state is complete, and
+  // values are filled in by operating on
+  // _attributes.data[_attributes.length - 1].
+  GumboVector /* GumboAttribute */ _attributes;
+
+  // If true, the next attribute value to be finished should be dropped.  This
+  // happens if a duplicate attribute name is encountered - we want to consume
+  // the attribute value, but shouldn't overwrite the existing value.
+  bool _drop_next_attr_value;
+
+  // The state that caused the tokenizer to switch into a character reference in
+  // attribute value state.  This is used to set the additional allowed
+  // character, and is switched back to on completion.  Initialized as the
+  // tokenizer enters the character reference state.
+  GumboTokenizerEnum _attr_value_state;
+
+  // The last start tag to have been emitted by the tokenizer.  This is
+  // necessary to check for appropriate end tags.
+  GumboTag _last_start_tag;
+
+  // If true, then this is a start tag.  If false, it's an end tag.  This is
+  // necessary to generate the appropriate token type at tag-closing time.
+  bool _is_start_tag;
+
+  // If true, then this tag is "self-closing" and doesn't have an end tag.
+  bool _is_self_closing;
+} GumboTagState;
+
+// This is the main tokenizer state struct, containing all state used in
+// tokenizing the input stream.
+typedef struct GumboInternalTokenizerState {
+  // The current lexer state.  Starts in GUMBO_LEX_DATA.
+  GumboTokenizerEnum _state;
+
+  // A flag indicating whether the current input character needs to be
+  // reconsumed in another state, or whether the next input character should be
+  // read for the next iteration of the state loop.  This is set when the spec
+  // reads "Reconsume the current input character in..."
+  bool _reconsume_current_input;
+
+  // A flag indicating whether the current node is a foreign element.  This is
+  // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
+  // markup declaration state.
+  bool _is_current_node_foreign;
+
+  // Certain states (notably character references) may emit two character tokens
+  // at once, but the contract for lex() fills in only one token at a time.  The
+  // extra character is buffered here, and then this is checked on entry to
+  // lex().  If a character is stored here, it's immediately emitted and control
+  // returns from the lexer.  kGumboNoChar is used to represent 'no character
+  // stored.'
+  //
+  // Note that characters emitted through this mechanism will have their source
+  // position marked as the character under the mark, i.e. multiple characters
+  // may be emitted with the same position.  This is desirable for character
+  // references, but unsuitable for many other cases.  Use the _temporary_buffer
+  // mechanism if the buffered characters must have their original positions in
+  // the document.
+  int _buffered_emit_char;
+
+  // A temporary buffer to accumulate characters, as described by the "temporary
+  // buffer" phrase in the tokenizer spec.  We use this in a somewhat unorthodox
+  // way: we record the specific character to go into the buffer, which may
+  // sometimes be a lowercased version of the actual input character.  However,
+  // we *also* use utf8iterator_mark() to record the position at tag start.
+  // When we start flushing the temporary buffer, we set _temporary_buffer_emit
+  // to the start of it, and then increment it for each call to the tokenizer.
+  // We also call utf8iterator_reset(), and utf8iterator_next() through the
+  // input stream, so that tokens emitted by emit_char have the correct position
+  // and original text.
+  GumboStringBuffer _temporary_buffer;
+
+  // The current cursor position we're emitting from within
+  // _temporary_buffer.data.  NULL whenever we're not flushing the buffer.
+  const char* _temporary_buffer_emit;
+
+  // The temporary buffer is also used by the spec to check whether we should
+  // enter the script data double escaped state, but we can't use the same
+  // buffer for both because we have to flush out "<s" as emits while still
+  // maintaining the context that will eventually become "script".  This is a
+  // separate buffer that's used in place of the temporary buffer for states
+  // that may enter the script data double escape start state.
+  GumboStringBuffer _script_data_buffer;
+
+  // Pointer to the beginning of the current token in the original buffer; used
+  // to record the original text.
+  const char* _token_start;
+
+  // GumboSourcePosition recording the source location of the start of the
+  // current token.
+  GumboSourcePosition _token_start_pos;
+
+  // Current tag state.
+  GumboTagState _tag_state;
+
+  // Doctype state.  We use the temporary buffer to accumulate characters (it's
+  // not used for anything else in the doctype states), and then freshly
+  // allocate the strings in the doctype token, then copy it over on emit.
+  GumboTokenDocType _doc_type_state;
+
+  // The UTF8Iterator over the tokenizer input.
+  Utf8Iterator _input;
+} GumboTokenizerState;
+
+// Adds a parse error of the given |type| to the parser's error list.  The
+// error records the input iterator's current position, a pointer to the
+// current character in the original text, the current codepoint, and a coarse
+// category derived from the current lexer state.  Does nothing if
+// gumbo_add_error returns NULL.
+static void add_parse_error(GumboParser* parser, GumboErrorType type) {
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+  utf8iterator_get_position(&tokenizer->_input, &error->position);
+  error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
+  error->type = type;
+  error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
+  // Collapse the fine-grained lexer state into the coarser error-state enum.
+  // There is no default case: for a state not listed here,
+  // error->v.tokenizer.state is left exactly as gumbo_add_error provided it.
+  switch (tokenizer->_state) {
+    case GUMBO_LEX_DATA:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
+      break;
+    case GUMBO_LEX_CHAR_REF_IN_DATA:
+    case GUMBO_LEX_CHAR_REF_IN_RCDATA:
+    case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
+      break;
+    case GUMBO_LEX_RCDATA:
+    case GUMBO_LEX_RCDATA_LT:
+    case GUMBO_LEX_RCDATA_END_TAG_OPEN:
+    case GUMBO_LEX_RCDATA_END_TAG_NAME:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
+      break;
+    case GUMBO_LEX_RAWTEXT:
+    case GUMBO_LEX_RAWTEXT_LT:
+    case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
+    case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
+      break;
+    case GUMBO_LEX_PLAINTEXT:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
+      break;
+    case GUMBO_LEX_SCRIPT:
+    case GUMBO_LEX_SCRIPT_LT:
+    case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
+    case GUMBO_LEX_SCRIPT_END_TAG_NAME:
+    case GUMBO_LEX_SCRIPT_ESCAPED_START:
+    case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
+    case GUMBO_LEX_SCRIPT_ESCAPED:
+    case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
+    case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
+    case GUMBO_LEX_SCRIPT_ESCAPED_LT:
+    case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
+    case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
+    case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
+    case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
+    case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
+    case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
+    case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
+    case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
+      break;
+    case GUMBO_LEX_TAG_OPEN:
+    case GUMBO_LEX_END_TAG_OPEN:
+    case GUMBO_LEX_TAG_NAME:
+    case GUMBO_LEX_BEFORE_ATTR_NAME:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
+      break;
+    case GUMBO_LEX_SELF_CLOSING_START_TAG:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
+      break;
+    case GUMBO_LEX_ATTR_NAME:
+    case GUMBO_LEX_AFTER_ATTR_NAME:
+    case GUMBO_LEX_BEFORE_ATTR_VALUE:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
+      break;
+    case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
+    case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
+    case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
+    case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
+      break;
+    case GUMBO_LEX_BOGUS_COMMENT:
+    case GUMBO_LEX_COMMENT_START:
+    case GUMBO_LEX_COMMENT_START_DASH:
+    case GUMBO_LEX_COMMENT:
+    case GUMBO_LEX_COMMENT_END_DASH:
+    case GUMBO_LEX_COMMENT_END:
+    case GUMBO_LEX_COMMENT_END_BANG:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
+      break;
+    case GUMBO_LEX_MARKUP_DECLARATION:
+    case GUMBO_LEX_DOCTYPE:
+    case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
+    case GUMBO_LEX_DOCTYPE_NAME:
+    case GUMBO_LEX_AFTER_DOCTYPE_NAME:
+    case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
+    case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
+    case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
+    case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
+    case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
+    case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
+    case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
+    case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
+    case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
+    case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
+    case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
+    case GUMBO_LEX_BOGUS_DOCTYPE:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
+      break;
+    case GUMBO_LEX_CDATA:
+      error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
+      break;
+  }
+}
+
+// Returns true iff |c| is an ASCII letter (A-Z or a-z).
+static bool is_alpha(int c) {
+  // The ranges are spelled out rather than using the ISO C isupper/islower
+  // functions because those depend on the program's locale, while the HTML5
+  // spec's behavior is the same in every locale.
+  const bool is_upper = c >= 'A' && c <= 'Z';
+  const bool is_lower = c >= 'a' && c <= 'z';
+  return is_upper || is_lower;
+}
+
+// Maps ASCII uppercase letters to their lowercase counterparts; every other
+// value is returned unchanged.
+static int ensure_lowercase(int c) {
+  if (c < 'A' || c > 'Z') {
+    return c;
+  }
+  // ASCII lowercase letters sit 0x20 above the uppercase ones.
+  return c + 0x20;
+}
+
+// Classifies codepoint |c| into the token type used when it is emitted as a
+// character token.
+static GumboTokenType get_char_token_type(int c) {
+  if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ') {
+    return GUMBO_TOKEN_WHITESPACE;
+  }
+  if (c == 0) {
+    gumbo_debug("Emitted null byte.\n");
+    return GUMBO_TOKEN_NULL;
+  }
+  if (c == -1) {
+    return GUMBO_TOKEN_EOF;
+  }
+  return GUMBO_TOKEN_CHARACTER;
+}
+
+// Starts recording characters in the temporary buffer.
+// Because this needs to reset the utf8iterator_mark to the beginning of the
+// text that will eventually be emitted, it needs to be called a couple of
+// states before the spec says "Set the temporary buffer to the empty string".
+// In general, this should be called whenever there's a transition to a
+// "less-than sign state".  The initial < and possibly / then need to be
+// appended to the temporary buffer, their presence needs to be accounted for in
+// states that compare the temporary buffer against a literal value, and
+// spec stanzas that say "emit a < and / character token along with a character
+// token for each character in the temporary buffer" need to be adjusted to
+// account for the presence of the < and / inside the temporary buffer.
+static void clear_temporary_buffer(GumboParser* parser) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  // Clearing while the buffer is being flushed is not allowed.
+  assert(!state->_temporary_buffer_emit);
+  // Mark the current input position before touching the buffers.
+  utf8iterator_mark(&state->_input);
+
+  // Reset the temporary buffer by destroying and re-initializing it.
+  GumboStringBuffer* temp = &state->_temporary_buffer;
+  gumbo_string_buffer_destroy(parser, temp);
+  gumbo_string_buffer_init(parser, temp);
+
+  // The spec treats the temporary buffer and the script data buffer as one
+  // object, so the script data buffer is reset in the same way.
+  GumboStringBuffer* script = &state->_script_data_buffer;
+  gumbo_string_buffer_destroy(parser, script);
+  gumbo_string_buffer_init(parser, script);
+}
+
+// Appends a codepoint to the temporary buffer.
+static void append_char_to_temporary_buffer(
+    GumboParser* parser, int codepoint) {
+  // The codepoint is UTF-8 encoded onto the tokenizer's temporary buffer.
+  GumboStringBuffer* temp = &parser->_tokenizer_state->_temporary_buffer;
+  gumbo_string_buffer_append_codepoint(parser, codepoint, temp);
+}
+
+// Checks to see if the temporary buffer equals a certain string.
+// Make sure this remains side-effect free; it's used in assertions.
+#ifndef NDEBUG
+// Returns true iff the temporary buffer holds exactly the bytes of the
+// nul-terminated string |text| (terminator excluded).  Only compiled when
+// assertions are enabled, and has no side effects.
+static bool temporary_buffer_equals(
+    GumboParser* parser, const char* text) {
+  const GumboStringBuffer* buffer =
+      &parser->_tokenizer_state->_temporary_buffer;
+  // TODO(jdtang): See if the extra strlen is a performance problem, and replace
+  // it with an explicit sizeof(literal) if necessary.  I don't think it will
+  // be, as this is only used in a couple of rare states.
+  //
+  // Kept as size_t so the comparison with buffer->length is between values of
+  // the same unsigned type, with no narrowing of the strlen result.
+  const size_t text_len = strlen(text);
+  return text_len == buffer->length &&
+      memcmp(buffer->data, text, text_len) == 0;
+}
+#endif
+
+// Resets the tokenizer's doctype scratch state: all flags false, all string
+// pointers NULL.
+static void doc_type_state_init(GumboParser* parser) {
+  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;
+
+  doctype->force_quirks = false;
+  doctype->has_public_identifier = false;
+  doctype->has_system_identifier = false;
+
+  // The strings start out as NULL so that nothing is leaked if no doctype
+  // token is ever seen.  Once a doctype token does show up, they are replaced
+  // with freshly-allocated empty strings so client code gets a uniform
+  // interface and need not check for NULL.  Ownership is transferred to the
+  // doctype token when it's emitted.
+  doctype->name = NULL;
+  doctype->public_identifier = NULL;
+  doctype->system_identifier = NULL;
+}
+
+// Re-anchors the start of the next token (both its original-text pointer and
+// its source position) at the iterator's current location.  Needed because
+// [CDATA[ sections may contain text that the tokenizer ignores.
+static void reset_token_start_point(GumboTokenizerState* tokenizer) {
+  const char* here = utf8iterator_get_char_pointer(&tokenizer->_input);
+  tokenizer->_token_start = here;
+  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
+}
+
+// Re-anchors the tag buffer's start position and original-text pointer at the
+// iterator's current location.  Attribute names and values may have whitespace
+// preceding them, so the real starting point cannot be assumed to be where the
+// previous use of the tag buffer ended.
+static void reset_tag_buffer_start_point(GumboParser* parser) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  GumboTagState* tag = &state->_tag_state;
+
+  utf8iterator_get_position(&state->_input, &tag->_start_pos);
+  tag->_original_text = utf8iterator_get_char_pointer(&state->_input);
+}
+
+// Hands the temporary buffer's contents to *output as a string obtained from
+// gumbo_string_buffer_to_string, then empties the temporary buffer.
+static void finish_temporary_buffer(GumboParser* parser, const char** output) {
+  GumboStringBuffer* temp = &parser->_tokenizer_state->_temporary_buffer;
+  const char* contents = gumbo_string_buffer_to_string(parser, temp);
+  *output = contents;
+  clear_temporary_buffer(parser);
+}
+
+// Steps the iterator past the end of the token (unless the current input is
+// flagged for reconsumption) and then fills in the token's position and
+// original_text.  The tokenizer is assumed to return right after every emit so
+// the tree-construction stage can read the token; advancing here is therefore
+// safe, because the advance at the bottom of the state machine loop is
+// bypassed.
+//
+// This moves the iterator and resets the token start point, so record any
+// other data the token needs before calling it.
+static void finish_token(GumboParser* parser, GumboToken* token) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  if (!state->_reconsume_current_input) {
+    utf8iterator_next(&state->_input);
+  }
+
+  // Remember where this token began before the start point is moved.
+  const char* text_start = state->_token_start;
+  token->position = state->_token_start_pos;
+  token->original_text.data = text_start;
+
+  // The new start point doubles as the end of this token's original text.
+  reset_token_start_point(state);
+  token->original_text.length = state->_token_start - text_start;
+
+  if (token->original_text.length > 0 &&
+      text_start[token->original_text.length - 1] == '\r') {
+    // The UTF8 iterator ignores carriage returns in the input stream, so the
+    // next token may begin one past a \r.  The pointer subtraction above then
+    // attributes that \r to this token's original text; trim it off.
+    token->original_text.length -= 1;
+  }
+}
+
+// Stores the temporary buffer's contents as the doctype public identifier,
+// releasing whatever identifier was stored before, and sets
+// has_public_identifier to true.
+static void finish_doctype_public_id(GumboParser* parser) {
+  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;
+
+  // Free the previous string before the pointer is overwritten.
+  gumbo_parser_deallocate(parser, (void*) doctype->public_identifier);
+  finish_temporary_buffer(parser, &doctype->public_identifier);
+  doctype->has_public_identifier = true;
+}
+
+// Stores the temporary buffer's contents as the doctype system identifier,
+// releasing whatever identifier was stored before, and sets
+// has_system_identifier to true.
+static void finish_doctype_system_id(GumboParser* parser) {
+  GumboTokenDocType* doctype = &parser->_tokenizer_state->_doc_type_state;
+
+  // Free the previous string before the pointer is overwritten.
+  gumbo_parser_deallocate(parser, (void*) doctype->system_identifier);
+  finish_temporary_buffer(parser, &doctype->system_identifier);
+  doctype->has_system_identifier = true;
+}
+
+// Fills `output` with a character token for codepoint `c` (the token type is
+// chosen by get_char_token_type) and finishes the token.
+static void emit_char(GumboParser* parser, int c, GumboToken* output) {
+  output->v.character = c;
+  output->type = get_char_token_type(c);
+  finish_token(parser, output);
+}
+
+// Writes a replacement character token and records a parse error.
+// Always returns RETURN_ERROR, per gumbo_lex return value.
+static StateResult emit_replacement_char(
+    GumboParser* parser, GumboToken* output) {
+  // In all cases, this is because of a null byte in the input stream.
+  add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+  emit_char(parser, kUtf8ReplacementChar, output);
+  return RETURN_ERROR;
+}
+
+// Emits the end-of-file marker (codepoint -1) as a character token.
+// Always returns RETURN_SUCCESS.
+static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
+  const int eof_codepoint = -1;
+  emit_char(parser, eof_codepoint, output);
+  return RETURN_SUCCESS;
+}
+
+// Writes the current input character out as a character token.
+// Always returns RETURN_SUCCESS.
+//
+// The return type is StateResult (it used to be declared bool): the function
+// returns a StateResult constant and the state handlers return its result
+// directly as their own StateResult, so the declaration now says what is
+// actually produced instead of relying on an implicit conversion through bool.
+static StateResult emit_current_char(GumboParser* parser, GumboToken* output) {
+  emit_char(
+      parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
+  return RETURN_SUCCESS;
+}
+
+// Emits a doctype token whose payload is a copy of the tokenizer's pending
+// doctype record, then resets that record for the next doctype.
+static void emit_doctype(GumboParser* parser, GumboToken* output) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+
+  output->type = GUMBO_TOKEN_DOCTYPE;
+  output->v.doc_type = state->_doc_type_state;
+  finish_token(parser, output);
+
+  // The token now holds the string pointers; start over with NULLs.
+  doc_type_state_init(parser);
+}
+
+// In debug builds, resets the attribute vector to kGumboEmptyVector so that
+// tag creation can assert its data pointer is NULL and thereby catch leaked
+// attribute vectors.  In NDEBUG builds this does nothing.
+static void mark_tag_state_as_empty(GumboTagState* tag_state) {
+#ifdef NDEBUG
+  // Nothing to reset in release builds.
+  (void) tag_state;
+#else
+  tag_state->_attributes = kGumboEmptyVector;
+#endif
+}
+
+// Writes out the current tag as a start or end tag token.
+//
+// For a start tag, the tag enum, the attribute vector and the self-closing flag
+// are copied into the token, and the tag is remembered as the last start tag.
+// For an end tag, only the tag enum goes into the token; the attributes
+// collected for it are destroyed here.  In both cases the tag buffer is
+// destroyed and the token is finished.
+//
+// Always returns RETURN_SUCCESS.
+static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
+  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
+  if (tag_state->_is_start_tag) {
+    output->type = GUMBO_TOKEN_START_TAG;
+    output->v.start_tag.tag = tag_state->_tag;
+    output->v.start_tag.attributes = tag_state->_attributes;
+    output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
+    tag_state->_last_start_tag = tag_state->_tag;
+    mark_tag_state_as_empty(tag_state);
+    gumbo_debug("Emitted start tag %s.\n",
+               gumbo_normalized_tagname(tag_state->_tag));
+  } else {
+    output->type = GUMBO_TOKEN_END_TAG;
+    output->v.end_tag = tag_state->_tag;
+    // In end tags, ownership of the attributes vector is not transferred to the
+    // token, but it's still initialized as normal, so it must be manually
+    // deallocated.  There may also be attributes to destroy, in certain broken
+    // cases like </div</th> (the "th" is an attribute there).
+    // The index is unsigned so it is compared with the vector length without a
+    // signed/unsigned mismatch.
+    for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
+      gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
+    }
+    gumbo_parser_deallocate(parser, tag_state->_attributes.data);
+    mark_tag_state_as_empty(tag_state);
+    gumbo_debug("Emitted end tag %s.\n",
+               gumbo_normalized_tagname(tag_state->_tag));
+  }
+  gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
+  finish_token(parser, output);
+  // A '*' precision must be passed as an int, so the length is cast
+  // explicitly rather than handed to the variadic call in its own type.
+  gumbo_debug("Original text = %.*s.\n",
+              (int) output->original_text.length, output->original_text.data);
+  assert(output->original_text.length >= 2);
+  assert(output->original_text.data[0] == '<');
+  assert(output->original_text.data[output->original_text.length - 1] == '>');
+  return RETURN_SUCCESS;
+}
+
+// Discards a tag that was started speculatively.  Some states begin a tag
+// before knowing whether it will be emitted as a tag token or as a run of
+// character tokens; when it turns out to be the latter, the attributes and the
+// tag buffer collected so far must be freed here to avoid a memory leak.
+static void abandon_current_tag(GumboParser* parser) {
+  GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+
+  // Destroy each attribute, then the array that held them.
+  int index = 0;
+  while (index < tag->_attributes.length) {
+    gumbo_destroy_attribute(parser, tag->_attributes.data[index]);
+    ++index;
+  }
+  gumbo_parser_deallocate(parser, tag->_attributes.data);
+  mark_tag_state_as_empty(tag);
+
+  gumbo_string_buffer_destroy(parser, &tag->_buffer);
+  gumbo_debug("Abandoning current tag.\n");
+}
+
+// Runs consume_char_ref and turns its result into an emitted character token
+// plus the matching tokenizer state changes.  Returns RETURN_ERROR if
+// consume_char_ref reported a parse error, RETURN_SUCCESS otherwise.
+//
+// NOTE(review): is_in_attribute is accepted but not forwarded; a literal
+// `false` is passed to consume_char_ref.  Confirm that is intended.
+static StateResult emit_char_ref(
+    GumboParser* parser, int additional_allowed_char,
+    bool is_in_attribute, GumboToken* output) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  OneOrTwoCodepoints decoded;
+  bool ok = consume_char_ref(
+      parser, &state->_input, additional_allowed_char, false, &decoded);
+
+  if (decoded.first == kGumboNoChar) {
+    // Not a character reference after all: emit the ampersand itself.
+    emit_char(parser, '&', output);
+  } else {
+    // consume_char_ref leaves the iterator on the character after the
+    // reference, so it must not be advanced again before the next token is
+    // read.
+    state->_reconsume_current_input = true;
+    emit_char(parser, decoded.first, output);
+    // Hold the second codepoint (if any) for a later emit.
+    state->_buffered_emit_char = decoded.second;
+  }
+
+  if (!ok) {
+    return RETURN_ERROR;
+  }
+  return RETURN_SUCCESS;
+}
+
+// Emits a comment token.  Comment data is accumulated in the temporary buffer;
+// here it is moved into the 'text' field of the GumboToken union and the
+// temporary buffer is cleared.  Always returns RETURN_SUCCESS.
+static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
+  const char** text_slot = &output->v.text;
+
+  output->type = GUMBO_TOKEN_COMMENT;
+  finish_temporary_buffer(parser, text_slot);
+  finish_token(parser, output);
+  return RETURN_SUCCESS;
+}
+
+// Checks whether a flush of the temporary buffer is in progress and, if so,
+// fills the output token with the next buffered character.
+// Returns true if a character was emitted and the tokenizer should return
+// immediately; returns false if there is nothing (left) to flush and normal
+// operation should resume.
+static bool maybe_emit_from_temporary_buffer(
+    GumboParser* parser, GumboToken* output) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  GumboStringBuffer* temp = &state->_temporary_buffer;
+  const char* cursor = state->_temporary_buffer_emit;
+
+  bool flushing = cursor != NULL && cursor < temp->data + temp->length;
+  if (!flushing) {
+    // No flush pending, or the flush just ran off the end of the buffer.
+    state->_temporary_buffer_emit = NULL;
+    return false;
+  }
+
+  assert(*cursor == utf8iterator_current(&state->_input));
+  // emit_char also advances the input stream, which requires some juggling of
+  // _reconsume_current_input.  The flag must *never* be set while emitting
+  // from the temporary buffer, because those characters have already been
+  // advanced past.  Its previous value is restored afterwards, however, so
+  // that when the *next* character comes around again the tokenizer still
+  // knows not to advance past it.
+  bool reconsume_before = state->_reconsume_current_input;
+  state->_reconsume_current_input = false;
+  emit_char(parser, *cursor, output);
+  state->_temporary_buffer_emit += 1;
+  state->_reconsume_current_input = reconsume_before;
+  return true;
+}
+
+// Starts flushing the temporary buffer as character tokens.
+// The input iterator is reset to the start of the last tag,
+// _temporary_buffer_emit is pointed at the beginning of the buffer, and the
+// first buffered character (if there is one) is emitted.  Returns true if a
+// character was emitted, false otherwise.
+static bool emit_temporary_buffer(
+    GumboParser* parser, GumboToken* output) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  const char* first = state->_temporary_buffer.data;
+
+  assert(first);
+  utf8iterator_reset(&state->_input);
+  state->_temporary_buffer_emit = first;
+  return maybe_emit_from_temporary_buffer(parser, output);
+}
+
+// Adds a codepoint to the current tag buffer.  When
+// reinitilize_position_on_first is true and the buffer is still empty, the tag
+// buffer start point is first moved to the current position.  The only reason
+// to pass false is when original_text should include a character (such as an
+// opening quote) that is not part of the value itself.
+static void append_char_to_tag_buffer(GumboParser* parser, int codepoint,
+                                      bool reinitilize_position_on_first) {
+  GumboStringBuffer* tag_buffer =
+      &parser->_tokenizer_state->_tag_state._buffer;
+
+  if (reinitilize_position_on_first && tag_buffer->length == 0) {
+    reset_tag_buffer_start_point(parser);
+  }
+  gumbo_string_buffer_append_codepoint(parser, codepoint, tag_buffer);
+}
+
+// (Re-)initializes the tag buffer and moves its original_text pointer and
+// _start_pos field to the current input position.  Any previous buffer
+// contents are not released here.
+static void initialize_tag_buffer(GumboParser* parser) {
+  GumboStringBuffer* tag_buffer =
+      &parser->_tokenizer_state->_tag_state._buffer;
+
+  gumbo_string_buffer_init(parser, tag_buffer);
+  reset_tag_buffer_start_point(parser);
+}
+
+// Prepares tag_state for a new tag beginning at the current input character,
+// recording the opening position and original text.  `is_start_tag` says
+// whether the tag is a start tag (true) or an end tag (false).  The current
+// character must be alphabetic; its lowercase form becomes the first character
+// of the tag buffer.
+static void start_new_tag(GumboParser* parser, bool is_start_tag) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+  GumboTagState* tag = &state->_tag_state;
+
+  int first = utf8iterator_current(&state->_input);
+  assert(is_alpha(first));
+  first = ensure_lowercase(first);
+  assert(is_alpha(first));
+
+  initialize_tag_buffer(parser);
+  gumbo_string_buffer_append_codepoint(parser, first, &tag->_buffer);
+
+  // The previous tag must have handed off or freed its attribute vector.
+  assert(tag->_attributes.data == NULL);
+  gumbo_vector_init(parser, 4, &tag->_attributes);
+
+  tag->_drop_next_attr_value = false;
+  tag->_is_start_tag = is_start_tag;
+  tag->_is_self_closing = false;
+  gumbo_debug("Starting new tag.\n");
+}
+
+// Stores in *output the string produced from the tag buffer's current contents
+// by gumbo_string_buffer_to_string.  The tag buffer itself is left untouched.
+static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
+  GumboStringBuffer* tag_buffer =
+      &parser->_tokenizer_state->_tag_state._buffer;
+  *output = gumbo_string_buffer_to_string(parser, tag_buffer);
+}
+
+// Fills in:
+// * The original_text GumboStringPiece with the portion of the original
+// buffer that corresponds to the tag buffer (from the tag buffer's recorded
+// start pointer up to the iterator's current pointer).
+// * The start_pos GumboSourcePosition with the start position of the tag
+// buffer.
+// * The end_pos GumboSourcePosition with the current source position.
+static void copy_over_original_tag_text(
+    GumboParser* parser, GumboStringPiece* original_text,
+    GumboSourcePosition* start_pos, GumboSourcePosition* end_pos) {
+  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+  GumboTagState* tag_state = &tokenizer->_tag_state;
+
+  original_text->data = tag_state->_original_text;
+  original_text->length =
+      utf8iterator_get_char_pointer(&tokenizer->_input) -
+      tag_state->_original_text;
+  // The length check keeps an empty span from indexing one byte before the
+  // start of the text; finish_token guards its \r trimming the same way.
+  if (original_text->length > 0 &&
+      original_text->data[original_text->length - 1] == '\r') {
+    // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
+    // appended to the end of original text even when it's really the first part
+    // of the next character.  If we detect this situation, shrink the length of
+    // the original text by 1 to remove the carriage return.
+    --original_text->length;
+  }
+  *start_pos = tag_state->_start_pos;
+  utf8iterator_get_position(&tokenizer->_input, end_pos);
+}
+
+// Frees the tag buffer's current storage and then initializes the buffer
+// again, which also re-anchors its start point at the current position.
+static void reinitialize_tag_buffer(GumboParser* parser) {
+  GumboStringBuffer* tag_buffer =
+      &parser->_tokenizer_state->_tag_state._buffer;
+
+  gumbo_parser_deallocate(parser, tag_buffer->data);
+  initialize_tag_buffer(parser);
+}
+
+// Converts the tag buffer's contents into a tag enum (via gumbo_tag_enum),
+// stores it in the tag state, and resets the tag buffer for its next use.
+static void finish_tag_name(GumboParser* parser) {
+  GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+  const char* name;
+
+  // The string copy is only needed long enough to look up the enum.
+  copy_over_tag_buffer(parser, &name);
+  tag->_tag = gumbo_tag_enum(name);
+  reinitialize_tag_buffer(parser);
+  gumbo_parser_deallocate(parser, (void*) name);
+}
+
+// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
+static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
+                                     int original_index, int new_index) {
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
+  error->type = GUMBO_ERR_DUPLICATE_ATTR;
+  error->position = tag_state->_start_pos;
+  error->original_text = tag_state->_original_text;
+  error->v.duplicate_attr.original_index = original_index;
+  error->v.duplicate_attr.new_index = new_index;
+  copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
+  reinitialize_tag_buffer(parser);
+}
+
+// Creates a new attribute in the current tag, copying the current tag buffer to
+// the attribute's name.  The attribute's value starts out as the empty string
+// (following the "Boolean attributes" section of the spec) and is only
+// overwritten on finish_attribute_value().  If the attribute has already been
+// specified, the new attribute is dropped, a parse error is added, the value
+// that follows is flagged to be dropped as well, and the function returns
+// false.  Otherwise, this returns true.
+//
+// The placeholder value's original text and positions are recorded in
+// original_value / value_start / value_end.  (Previously the second call wrote
+// name_start / name_end again, so this function never set value_start and
+// value_end for an attribute that never receives a value.)
+static bool finish_attribute_name(GumboParser* parser) {
+  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+  GumboTagState* tag_state = &tokenizer->_tag_state;
+  // May've been set by a previous attribute without a value; reset it here.
+  tag_state->_drop_next_attr_value = false;
+  assert(tag_state->_attributes.data);
+  assert(tag_state->_attributes.capacity);
+
+  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
+  for (unsigned int i = 0; i < attributes->length; ++i) {
+    GumboAttribute* attr = attributes->data[i];
+    if (strlen(attr->name) == tag_state->_buffer.length &&
+        memcmp(attr->name, tag_state->_buffer.data,
+               tag_state->_buffer.length) == 0) {
+      // Identical attribute; bail.
+      add_duplicate_attr_error(
+          parser, attr->name, (int) i, (int) attributes->length);
+      tag_state->_drop_next_attr_value = true;
+      return false;
+    }
+  }
+
+  GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
+  attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
+  copy_over_tag_buffer(parser, &attr->name);
+  copy_over_original_tag_text(parser, &attr->original_name,
+                              &attr->name_start, &attr->name_end);
+  attr->value = gumbo_copy_stringz(parser, "");
+  // Until a real value arrives, the value's span mirrors the name's span.
+  copy_over_original_tag_text(parser, &attr->original_value,
+                              &attr->value_start, &attr->value_end);
+  gumbo_vector_add(parser, attr, attributes);
+  reinitialize_tag_buffer(parser);
+  return true;
+}
+
+// Finishes an attribute value.  This sets the value of the most recently added
+// attribute to the current contents of the tag buffer, records the value's
+// original text and positions, and resets the tag buffer.
+//
+// If the preceding attribute name was flagged as a duplicate, the value is
+// discarded instead.  The tag buffer is reset on that path as well, so the
+// discarded value's characters do not stay in the buffer for whatever is
+// appended to it next.
+static void finish_attribute_value(GumboParser* parser) {
+  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
+  if (tag_state->_drop_next_attr_value) {
+    // Duplicate attribute name detected in an earlier state, so we have to
+    // ignore the value.
+    tag_state->_drop_next_attr_value = false;
+    reinitialize_tag_buffer(parser);
+    return;
+  }
+
+  GumboAttribute* attr =
+      tag_state->_attributes.data[tag_state->_attributes.length - 1];
+  // Release the placeholder value before it is replaced.
+  gumbo_parser_deallocate(parser, (void*) attr->value);
+  copy_over_tag_buffer(parser, &attr->value);
+  copy_over_original_tag_text(parser, &attr->original_value,
+                              &attr->value_start, &attr->value_end);
+  reinitialize_tag_buffer(parser);
+}
+
+// Returns true if the end tag currently being built names the same tag as the
+// last start tag emitted.  Must only be called while building an end tag.
+static bool is_appropriate_end_tag(GumboParser* parser) {
+  GumboTagState* tag = &parser->_tokenizer_state->_tag_state;
+  assert(!tag->_is_start_tag);
+
+  // gumbo_tag_enum needs a NUL-terminated string, so append a terminator, but
+  // undo the length bump in case the buffer still has to be dumped as
+  // character tokens.
+  gumbo_string_buffer_append_codepoint(parser, '\0', &tag->_buffer);
+  tag->_buffer.length -= 1;
+
+  if (tag->_last_start_tag == GUMBO_TAG_LAST) {
+    return false;
+  }
+  return tag->_last_start_tag == gumbo_tag_enum(tag->_buffer.data);
+}
+
+// Allocates the tokenizer state for `parser`, attaches it to the parser, and
+// initializes it to read `text_length` bytes starting at `text`, beginning in
+// the data state.
+void gumbo_tokenizer_state_init(
+    GumboParser* parser, const char* text, size_t text_length) {
+  GumboTokenizerState* state =
+      gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
+  // The state must be attached before gumbo_tokenizer_set_state is called,
+  // because that function writes through parser->_tokenizer_state.
+  parser->_tokenizer_state = state;
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+
+  state->_reconsume_current_input = false;
+  state->_is_current_node_foreign = false;
+  state->_tag_state._last_start_tag = GUMBO_TAG_LAST;
+  state->_buffered_emit_char = kGumboNoChar;
+
+  gumbo_string_buffer_init(parser, &state->_temporary_buffer);
+  state->_temporary_buffer_emit = NULL;
+
+  mark_tag_state_as_empty(&state->_tag_state);
+
+  gumbo_string_buffer_init(parser, &state->_script_data_buffer);
+
+  // The first token starts at the very beginning of the input.
+  state->_token_start = text;
+  utf8iterator_init(parser, text, text_length, &state->_input);
+  utf8iterator_get_position(&state->_input, &state->_token_start_pos);
+
+  doc_type_state_init(parser);
+}
+
+// Releases the tokenizer state attached to `parser`: both string buffers and
+// then the state structure itself.  The pending doctype strings must already
+// be NULL (asserted in debug builds).
+void gumbo_tokenizer_state_destroy(GumboParser* parser) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+
+  assert(state->_doc_type_state.name == NULL);
+  assert(state->_doc_type_state.public_identifier == NULL);
+  assert(state->_doc_type_state.system_identifier == NULL);
+
+  gumbo_string_buffer_destroy(parser, &state->_temporary_buffer);
+  gumbo_string_buffer_destroy(parser, &state->_script_data_buffer);
+  gumbo_parser_deallocate(parser, state);
+}
+
+// Switches the tokenizer's state machine to `state`.
+void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
+  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+  tokenizer->_state = state;
+}
+
+// Records whether the current node is foreign, emitting a debug message when
+// the value actually changes.
+void gumbo_tokenizer_set_is_current_node_foreign(
+    GumboParser* parser, bool is_foreign) {
+  GumboTokenizerState* state = parser->_tokenizer_state;
+
+  if (state->_is_current_node_foreign != is_foreign) {
+    const char* label = is_foreign ? "true" : "false";
+    gumbo_debug("Toggling is_current_node_foreign to %s.\n", label);
+  }
+  state->_is_current_node_foreign = is_foreign;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
+// Data state: '&' and '<' switch to the character-reference and tag-open
+// states, a NUL is emitted as-is together with a parse error, and every other
+// character is emitted as a character token.
+static StateResult handle_data_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '&') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
+    // The char_ref machinery expects to sit on the & so it can mark that
+    // position and come back to it if the text turns out not to be a char
+    // ref; hence the & is reconsumed.
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  if (c == '<') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, '<');
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+    emit_char(parser, c, output);
+    return RETURN_ERROR;
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
+// Returns to the data state and emits the character reference (or a literal
+// ampersand) found at the current position.
+static StateResult handle_char_ref_in_data_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+  StateResult result = emit_char_ref(parser, ' ', false, output);
+  return result;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
+// RCDATA state: '&' and '<' switch to the RCDATA character-reference and
+// less-than states, NUL becomes a replacement character, -1 becomes EOF, and
+// every other character is emitted as a character token.
+static StateResult handle_rcdata_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '&') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  if (c == '<') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, '<');
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  if (c == -1) {
+    return emit_eof(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
+// Returns to the RCDATA state and emits the character reference (or a literal
+// ampersand) found at the current position.
+static StateResult handle_char_ref_in_rcdata_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+  StateResult result = emit_char_ref(parser, ' ', false, output);
+  return result;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
+// RAWTEXT state: '<' switches to the RAWTEXT less-than state, NUL becomes a
+// replacement character, -1 becomes EOF, and every other character is emitted
+// as a character token.
+static StateResult handle_rawtext_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '<') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, '<');
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  if (c == -1) {
+    return emit_eof(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
+// Script data state: '<' switches to the script less-than state, NUL becomes a
+// replacement character, -1 becomes EOF, and every other character is emitted
+// as a character token.
+static StateResult handle_script_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '<') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, '<');
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  if (c == -1) {
+    return emit_eof(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
+// PLAINTEXT state: NUL becomes a replacement character, -1 becomes EOF, and
+// every other character is emitted as a character token.
+static StateResult handle_plaintext_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  if (c == -1) {
+    return emit_eof(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
+// Tag open state, entered with "<" in the temporary buffer.  '!', '/' and '?'
+// lead to the markup-declaration, end-tag-open and bogus-comment states; a
+// letter starts a new start tag; anything else is a parse error and the
+// buffered "<" is flushed as character data.
+static StateResult handle_tag_open_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "<"));
+
+  if (c == '!') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
+    clear_temporary_buffer(parser);
+    return NEXT_CHAR;
+  }
+  if (c == '/') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
+    append_char_to_temporary_buffer(parser, '/');
+    return NEXT_CHAR;
+  }
+  if (c == '?') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, '?');
+    add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
+    return NEXT_CHAR;
+  }
+  if (is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
+    start_new_tag(parser, true);
+    return NEXT_CHAR;
+  }
+
+  // Not a tag after all: report it and flush the buffered "<".
+  add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+  emit_temporary_buffer(parser, output);
+  return RETURN_ERROR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
+// End tag open state, entered with "</" in the temporary buffer.  '>' and EOF
+// are parse errors that return to the data state; a letter starts a new end
+// tag; anything else is a parse error that begins a bogus comment.
+static StateResult handle_end_tag_open_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "</"));
+
+  if (c == '>') {
+    add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    return NEXT_CHAR;
+  }
+  if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    return emit_temporary_buffer(parser, output);
+  }
+
+  if (is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
+    start_new_tag(parser, false);
+  } else {
+    add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, c);
+  }
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
+// Tag name state: whitespace, '/' and '>' finish the tag name and move on (the
+// '>' case emits the tag); NUL appends a replacement character with a parse
+// error; EOF abandons the tag; anything else is lowercased and appended to the
+// tag buffer.
+static StateResult handle_tag_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+    finish_tag_name(parser);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+    return NEXT_CHAR;
+  } else if (c == '/') {
+    finish_tag_name(parser);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+    return NEXT_CHAR;
+  } else if (c == '>') {
+    finish_tag_name(parser);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    return emit_current_tag(parser, output);
+  } else if (c == '\0') {
+    add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+    append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+    return NEXT_CHAR;
+  } else if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_TAG_EOF);
+    abandon_current_tag(parser);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    return NEXT_CHAR;
+  }
+
+  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
+// RCDATA less-than state, entered with "<" in the temporary buffer.  A '/'
+// moves on to the RCDATA end-tag-open state; anything else goes back to RCDATA,
+// reconsuming the current character and flushing the buffered "<".
+static StateResult handle_rcdata_lt_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "<"));
+
+  if (c != '/') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+    tokenizer->_reconsume_current_input = true;
+    return emit_temporary_buffer(parser, output);
+  }
+
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
+  append_char_to_temporary_buffer(parser, '/');
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
+// RCDATA end-tag-open state, entered with "</" in the temporary buffer.
+// A letter starts a new end tag (the letter is also kept in the temporary
+// buffer so the text can still be flushed as characters) and moves to the
+// RCDATA end-tag-name state.  Anything else returns to RCDATA and flushes the
+// temporary buffer.
+//
+// Returns NEXT_CHAR in the first case and the result of emit_temporary_buffer
+// in the second.  The former trailing `return true;` was unreachable, since
+// both branches return, and has been removed.
+static StateResult handle_rcdata_end_tag_open_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "</"));
+  if (is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
+    start_new_tag(parser, false);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
+// Accumulates the name of a candidate RCDATA end tag.  Letters extend both the
+// tag buffer (lowercased) and the temporary buffer (verbatim).  A terminator
+// (whitespace, '/' or '>') is only honoured when is_appropriate_end_tag()
+// accepts the tag; otherwise the tag is abandoned and the temporary buffer is
+// emitted from the RCDATA state.
+static StateResult handle_rcdata_end_tag_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(tokenizer->_temporary_buffer.length >= 2);
+  if (is_alpha(c)) {
+    append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  }
+  if (is_appropriate_end_tag(parser)) {
+    if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+      return NEXT_CHAR;
+    }
+    if (c == '/') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+      return NEXT_CHAR;
+    }
+    if (c == '>') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_current_tag(parser, output);
+    }
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
+  abandon_current_tag(parser);
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
+// Handles the character after a "<" seen in RAWTEXT.  A '/' moves on to the
+// end-tag-open state; anything else flushes the temporary buffer and asks for
+// the current character to be reconsumed in RAWTEXT.
+static StateResult handle_rawtext_lt_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "<"));
+  if (c == '/') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
+    append_char_to_temporary_buffer(parser, '/');
+    return NEXT_CHAR;
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
+  tokenizer->_reconsume_current_input = true;
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
+// Handles the character after "</" seen in RAWTEXT.  A letter starts a
+// candidate end tag; anything else returns to RAWTEXT and emits the temporary
+// buffer.
+static StateResult handle_rawtext_end_tag_open_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "</"));
+  if (!is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
+    return emit_temporary_buffer(parser, output);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
+  start_new_tag(parser, false);
+  append_char_to_temporary_buffer(parser, c);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
+// Accumulates the name of a candidate RAWTEXT end tag.  Letters extend both
+// the tag buffer (lowercased) and the temporary buffer (verbatim).  A
+// terminator (whitespace, '/' or '>') is only honoured when
+// is_appropriate_end_tag() accepts the tag; otherwise the tag is abandoned and
+// the temporary buffer is emitted from the RAWTEXT state.
+static StateResult handle_rawtext_end_tag_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(tokenizer->_temporary_buffer.length >= 2);
+  // The tag buffer is passed as (length, data), so it must be printed with a
+  // precision ("%.*s").  "%*s" only sets a minimum field width and would not
+  // bound the read.  NOTE(review): assumes gumbo_debug is printf-style.
+  gumbo_debug("Last end tag: %.*s\n", (int) tokenizer->_tag_state._buffer.length,
+             tokenizer->_tag_state._buffer.data);
+  if (is_alpha(c)) {
+    append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  } else if (is_appropriate_end_tag(parser)) {
+    gumbo_debug("Is an appropriate end tag.\n");
+    switch (c) {
+      case '\t':
+      case '\n':
+      case '\f':
+      case ' ':
+        finish_tag_name(parser);
+        gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+        return NEXT_CHAR;
+      case '/':
+        finish_tag_name(parser);
+        gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+        return NEXT_CHAR;
+      case '>':
+        finish_tag_name(parser);
+        gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+        return emit_current_tag(parser, output);
+    }
+  }
+  // Not a usable end tag: drop it and emit what was buffered as text.
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
+  abandon_current_tag(parser);
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
+// Handles the character after a "<" seen in script data.  '/' opens a
+// candidate end tag, '!' begins the escape-start sequence (the temporary
+// buffer is emitted), and anything else emits the temporary buffer and
+// reconsumes the character in the script state.
+static StateResult handle_script_lt_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "<"));
+  switch (c) {
+    case '/':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
+      append_char_to_temporary_buffer(parser, '/');
+      return NEXT_CHAR;
+    case '!':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
+      append_char_to_temporary_buffer(parser, '!');
+      return emit_temporary_buffer(parser, output);
+    default:
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+      tokenizer->_reconsume_current_input = true;
+      return emit_temporary_buffer(parser, output);
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
+// Handles the character after "</" seen in script data.  A letter starts a
+// candidate end tag; anything else returns to the script state and emits the
+// temporary buffer.
+static StateResult handle_script_end_tag_open_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "</"));
+  if (!is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+    return emit_temporary_buffer(parser, output);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
+  start_new_tag(parser, false);
+  append_char_to_temporary_buffer(parser, c);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
+// Accumulates the name of a candidate script end tag.  Letters extend both the
+// tag buffer (lowercased) and the temporary buffer (verbatim).  A terminator
+// (whitespace, '/' or '>') is only honoured when is_appropriate_end_tag()
+// accepts the tag; otherwise the tag is abandoned and the temporary buffer is
+// emitted from the script state.
+static StateResult handle_script_end_tag_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(tokenizer->_temporary_buffer.length >= 2);
+  if (is_alpha(c)) {
+    append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  }
+  if (is_appropriate_end_tag(parser)) {
+    if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+      return NEXT_CHAR;
+    }
+    if (c == '/') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+      return NEXT_CHAR;
+    }
+    if (c == '>') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_current_tag(parser, output);
+    }
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+  abandon_current_tag(parser);
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
+// First character of the script escape-start sequence: a '-' is emitted and
+// advances to the escape-start-dash state; anything else is reconsumed in the
+// plain script state.
+static StateResult handle_script_escaped_start_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c != '-') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
+// Second character of the script escape-start sequence: a '-' is emitted and
+// enters the escaped-dash-dash state; anything else is reconsumed in the plain
+// script state.
+static StateResult handle_script_escaped_start_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c != '-') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
+// Script-data-escaped state.  '-' and '<' change state, NUL is replaced, EOF
+// (-1) records a parse error and emits EOF, and every other character is
+// emitted unchanged.
+static StateResult handle_script_escaped_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '-') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
+    return emit_current_char(parser, output);
+  }
+  if (c == '<') {
+    // Start buffering a possible tag; the buffer begins with this '<'.
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
+    clear_temporary_buffer(parser);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+    return emit_eof(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
+// Script-data-escaped-dash state (one '-' seen).  A second '-' and '<' have
+// dedicated transitions, EOF (-1) is a parse error that returns to the data
+// state, and everything else falls back to the escaped state (NUL is emitted
+// as a replacement character).
+static StateResult handle_script_escaped_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
+      return emit_current_char(parser, output);
+    case '<':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
+      clear_temporary_buffer(parser);
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return NEXT_CHAR;
+    default:
+      break;
+  }
+  // NUL and ordinary characters share the same transition.
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
+// Script-data-escaped-dash-dash state ("--" seen).  Further dashes are emitted
+// without a state change, '>' returns to the script state, '<' starts
+// buffering a possible tag, EOF (-1) is a parse error that returns to the data
+// state, and everything else falls back to the escaped state (NUL is emitted
+// as a replacement character).
+static StateResult handle_script_escaped_dash_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      return emit_current_char(parser, output);
+    case '<':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
+      clear_temporary_buffer(parser);
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+      return emit_current_char(parser, output);
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return NEXT_CHAR;
+    default:
+      break;
+  }
+  // NUL and ordinary characters share the same transition.
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
+// Handles the character after a "<" seen in escaped script data.  '/' opens a
+// candidate end tag; a letter starts collecting a name in the script data
+// buffer (lowercased) for the double-escape check and emits the temporary
+// buffer; anything else returns to the escaped state and emits the buffer.
+static StateResult handle_script_escaped_lt_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "<"));
+  assert(!tokenizer->_script_data_buffer.length);
+  if (c == '/') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  }
+  if (!is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+    return emit_temporary_buffer(parser, output);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
+  append_char_to_temporary_buffer(parser, c);
+  gumbo_string_buffer_append_codepoint(
+      parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
+// Handles the character after "</" seen in escaped script data.  A letter
+// starts a candidate end tag; anything else returns to the escaped state and
+// emits the temporary buffer.
+static StateResult handle_script_escaped_end_tag_open_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(temporary_buffer_equals(parser, "</"));
+  if (!is_alpha(c)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+    return emit_temporary_buffer(parser, output);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
+  start_new_tag(parser, false);
+  append_char_to_temporary_buffer(parser, c);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
+// Accumulates the name of a candidate end tag inside escaped script data.
+// Letters extend both the tag buffer (lowercased) and the temporary buffer
+// (verbatim).  A terminator (whitespace, '/' or '>') is only honoured when
+// is_appropriate_end_tag() accepts the tag; otherwise the tag is abandoned and
+// the temporary buffer is emitted from the escaped state.
+static StateResult handle_script_escaped_end_tag_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(tokenizer->_temporary_buffer.length >= 2);
+  if (is_alpha(c)) {
+    append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+    append_char_to_temporary_buffer(parser, c);
+    return NEXT_CHAR;
+  }
+  if (is_appropriate_end_tag(parser)) {
+    if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+      return NEXT_CHAR;
+    }
+    if (c == '/') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+      return NEXT_CHAR;
+    }
+    if (c == '>') {
+      finish_tag_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_current_tag(parser, output);
+    }
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+  abandon_current_tag(parser);
+  return emit_temporary_buffer(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
+// Collects the name that follows "<" in escaped script data.  On a terminator
+// (whitespace, '/' or '>') the collected name is compared with kScriptTag: a
+// match enters the double-escaped state, otherwise the escaped state.  Letters
+// are appended (lowercased) and emitted; any other character is reconsumed in
+// the escaped state.
+static StateResult handle_script_double_escaped_start_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
+      c == '/' || c == '>') {
+    const bool is_script_tag = gumbo_string_equals(
+        &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer);
+    gumbo_tokenizer_set_state(
+        parser, is_script_tag ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
+                              : GUMBO_LEX_SCRIPT_ESCAPED);
+    return emit_current_char(parser, output);
+  }
+  if (is_alpha(c)) {
+    gumbo_string_buffer_append_codepoint(
+        parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
+    return emit_current_char(parser, output);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
+  tokenizer->_reconsume_current_input = true;
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
+// Script-data-double-escaped state.  '-' and '<' are emitted and change state,
+// NUL is replaced, EOF (-1) is a parse error that returns to the data state,
+// and every other character is emitted unchanged.
+static StateResult handle_script_double_escaped_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '-') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
+    return emit_current_char(parser, output);
+  }
+  if (c == '<') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
+    return emit_current_char(parser, output);
+  }
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    return NEXT_CHAR;
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
+// Script-data-double-escaped-dash state (one '-' seen).  A second '-' and '<'
+// are emitted with dedicated transitions, EOF (-1) is a parse error that
+// returns to the data state, and everything else falls back to the
+// double-escaped state (NUL is emitted as a replacement character).
+static StateResult handle_script_double_escaped_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
+      return emit_current_char(parser, output);
+    case '<':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
+      return emit_current_char(parser, output);
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return NEXT_CHAR;
+    default:
+      break;
+  }
+  // NUL and ordinary characters share the same transition.
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
+// Script-data-double-escaped-dash-dash state ("--" seen).  Further dashes are
+// emitted without a state change, '<' and '>' are emitted with dedicated
+// transitions, EOF (-1) is a parse error that returns to the data state, and
+// everything else falls back to the double-escaped state (NUL is emitted as a
+// replacement character).
+static StateResult handle_script_double_escaped_dash_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      return emit_current_char(parser, output);
+    case '<':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
+      return emit_current_char(parser, output);
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
+      return emit_current_char(parser, output);
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return NEXT_CHAR;
+    default:
+      break;
+  }
+  // NUL and ordinary characters share the same transition.
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+  if (c == '\0') {
+    return emit_replacement_char(parser, output);
+  }
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
+// Handles the character after a "<" seen in double-escaped script data.  A '/'
+// resets the script data buffer (destroy + init), emits the character and
+// enters the double-escape-end state; anything else is reconsumed in the
+// double-escaped state.
+static StateResult handle_script_double_escaped_lt_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c != '/') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
+  gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
+  gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
+  return emit_current_char(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
+// Collects the name that follows "</" in double-escaped script data.  On a
+// terminator (whitespace, '/' or '>') the collected name is compared with
+// kScriptTag: a match drops back to the escaped state, otherwise the
+// double-escaped state is kept.  Letters are appended (lowercased) and
+// emitted; any other character is reconsumed in the double-escaped state.
+static StateResult handle_script_double_escaped_end_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
+      c == '/' || c == '>') {
+    const bool is_script_tag = gumbo_string_equals(
+        &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer);
+    gumbo_tokenizer_set_state(
+        parser, is_script_tag ? GUMBO_LEX_SCRIPT_ESCAPED
+                              : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+    return emit_current_char(parser, output);
+  }
+  if (is_alpha(c)) {
+    gumbo_string_buffer_append_codepoint(
+        parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
+    return emit_current_char(parser, output);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
+  tokenizer->_reconsume_current_input = true;
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
+// Before-attribute-name state: skips whitespace, handles '/', '>' and EOF
+// (-1), and otherwise starts a new attribute name in the tag buffer.  The
+// characters '"', '\'', '<' and '=' are recorded as parse errors but still
+// start a name.
+static StateResult handle_before_attr_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '/':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_current_tag(parser, output);
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+      // The replacement character is the first character of the attribute
+      // name, so it belongs in the tag buffer (as in the default branch and in
+      // handle_attr_name_state), not in the temporary buffer.
+      append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      abandon_current_tag(parser);
+      return NEXT_CHAR;
+    case '"':
+    case '\'':
+    case '<':
+    case '=':
+      add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
+      // Fall through.
+    default:
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+      append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
+// Attribute-name state.  Whitespace, '/', '=' and '>' finish the name and move
+// on; NUL is replaced with kUtf8ReplacementChar; EOF (-1) abandons the tag;
+// '"', '\'' and '<' are parse errors but are still appended to the name like
+// any other character (lowercased).
+static StateResult handle_attr_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\f':
+      finish_attribute_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
+      return NEXT_CHAR;
+    case '/':
+      finish_attribute_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+      return NEXT_CHAR;
+    case '=':
+      finish_attribute_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
+      return NEXT_CHAR;
+    case '>':
+      finish_attribute_name(parser);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_current_tag(parser, output);
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+      return NEXT_CHAR;
+    case -1:
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      abandon_current_tag(parser);
+      add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
+      return NEXT_CHAR;
+    case '"':
+    case '\'':
+    case '<':
+      add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
+      break;
+    default:
+      break;
+  }
+  // Invalid-but-tolerated and ordinary characters both extend the name.
+  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
+// After-attribute-name state: skips whitespace, handles '/', '=', '>' and EOF
+// (-1), and otherwise starts a new attribute name in the tag buffer.  The
+// characters '"', '\'' and '<' are recorded as parse errors but still start a
+// name.
+static StateResult handle_after_attr_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '/':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+      return NEXT_CHAR;
+    case '=':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_current_tag(parser, output);
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+      // The replacement character is the first character of the attribute
+      // name, so it belongs in the tag buffer (as in the default branch and in
+      // handle_attr_name_state), not in the temporary buffer.
+      append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      abandon_current_tag(parser);
+      return NEXT_CHAR;
+    case '"':
+    case '\'':
+    case '<':
+      add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
+      // Fall through.
+    default:
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
+      append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
+// Before-attribute-value state.  Skips whitespace, selects the quoted or
+// unquoted value state from the first significant character, and records parse
+// errors for NUL, EOF (-1), '>', '<', '=' and '`'.  A '>' here emits the
+// current tag but reports RETURN_ERROR.
+static StateResult handle_before_attr_value_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\f':
+      return NEXT_CHAR;
+    case '"':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
+      reset_tag_buffer_start_point(parser);
+      return NEXT_CHAR;
+    case '\'':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
+      reset_tag_buffer_start_point(parser);
+      return NEXT_CHAR;
+    case '&':
+      // Let the unquoted-value state see the '&' itself.
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
+      tokenizer->_reconsume_current_input = true;
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
+      append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      abandon_current_tag(parser);
+      tokenizer->_reconsume_current_input = true;
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_current_tag(parser, output);
+      return RETURN_ERROR;
+    case '<':
+    case '=':
+    case '`':
+      add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
+      break;
+    default:
+      break;
+  }
+  // Invalid-but-tolerated and ordinary characters both start an unquoted value.
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
+  append_char_to_tag_buffer(parser, c, true);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
+// Double-quoted attribute value state.  '"' closes the value, '&' hands off to
+// the character-reference state (remembering the current state to return to),
+// NUL is replaced, EOF (-1) abandons the tag, and everything else is appended
+// to the tag buffer.
+static StateResult handle_attr_value_double_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '"') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
+    return NEXT_CHAR;
+  }
+  if (c == '&') {
+    tokenizer->_tag_state._attr_value_state = tokenizer->_state;
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+    append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
+    return NEXT_CHAR;
+  }
+  if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    abandon_current_tag(parser);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  append_char_to_tag_buffer(parser, c, false);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
+// Single-quoted attribute value state.  '\'' closes the value, '&' hands off
+// to the character-reference state (remembering the current state to return
+// to), NUL is replaced, EOF (-1) abandons the tag, and everything else is
+// appended to the tag buffer.
+static StateResult handle_attr_value_single_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c == '\'') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
+    return NEXT_CHAR;
+  }
+  if (c == '&') {
+    tokenizer->_tag_state._attr_value_state = tokenizer->_state;
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  if (c == '\0') {
+    add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+    append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
+    return NEXT_CHAR;
+  }
+  if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    abandon_current_tag(parser);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  append_char_to_tag_buffer(parser, c, false);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
+// Unquoted attribute value state.  Whitespace and '>' finish the value, '&'
+// hands off to the character-reference state, NUL is replaced, EOF (-1)
+// abandons the tag, and '<', '=', '"', '\'' and '`' are parse errors that are
+// still appended to the value like any other character.
+static StateResult handle_attr_value_unquoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\f':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+      finish_attribute_value(parser);
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      finish_attribute_value(parser);
+      return emit_current_tag(parser, output);
+    case '&':
+      tokenizer->_tag_state._attr_value_state = tokenizer->_state;
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
+      tokenizer->_reconsume_current_input = true;
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_reconsume_current_input = true;
+      abandon_current_tag(parser);
+      return NEXT_CHAR;
+    case '<':
+    case '=':
+    case '"':
+    case '\'':
+    case '`':
+      add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
+      break;
+    default:
+      break;
+  }
+  // Invalid-but-tolerated and ordinary characters both extend the value.
+  append_char_to_tag_buffer(parser, c, true);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
+// Resolves a character reference met inside an attribute value.  The extra
+// allowed character passed to consume_char_ref() depends on which value state
+// was saved in _attr_value_state; the decoded code point(s), or a literal '&'
+// when nothing was decoded, are appended to the tag buffer before returning to
+// the saved state.
+static StateResult handle_char_ref_in_attr_value_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  OneOrTwoCodepoints char_ref;
+  // ' ' is only a placeholder for the "cannot happen" branch below.
+  int allowed_char = ' ';
+  bool is_unquoted = false;
+  if (tokenizer->_tag_state._attr_value_state ==
+      GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED) {
+    allowed_char = '"';
+  } else if (tokenizer->_tag_state._attr_value_state ==
+             GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED) {
+    allowed_char = '\'';
+  } else if (tokenizer->_tag_state._attr_value_state ==
+             GUMBO_LEX_ATTR_VALUE_UNQUOTED) {
+    allowed_char = '>';
+    is_unquoted = true;
+  } else {
+    // Only the three attribute-value states are expected to lead here.
+    assert(0);
+  }
+
+  // The status of consume_char_ref() is deliberately ignored: there is no
+  // convenient way to signal a parse error that occurs in the middle of a
+  // multi-state token without an extra flag in the tokenizer state.
+  consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
+  if (char_ref.first == kGumboNoChar) {
+    append_char_to_tag_buffer(parser, '&', is_unquoted);
+  } else {
+    tokenizer->_reconsume_current_input = true;
+    append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
+    if (char_ref.second != kGumboNoChar) {
+      append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
+    }
+  }
+  gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
+// State entered after the closing quote of an attribute value.  The value is
+// finished first; whitespace, '/' and '>' then continue the tag normally, EOF
+// (-1) abandons it, and any other character is a parse error that is
+// reconsumed in the before-attribute-name state.
+static StateResult handle_after_attr_value_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  finish_attribute_value(parser);
+  if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+    return NEXT_CHAR;
+  }
+  if (c == '/') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
+    return NEXT_CHAR;
+  }
+  if (c == '>') {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    return emit_current_tag(parser, output);
+  }
+  if (c == -1) {
+    add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    abandon_current_tag(parser);
+    tokenizer->_reconsume_current_input = true;
+    return NEXT_CHAR;
+  }
+  add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+  tokenizer->_reconsume_current_input = true;
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
+// Handles the character that follows '/' inside a start tag.
+//   '>'   -> marks the tag self-closing, switches to GUMBO_LEX_DATA and
+//            returns the result of emit_current_tag()
+//   -1    -> parse error; the tag is abandoned, GUMBO_LEX_DATA
+//   other -> parse error; the character is reconsumed in
+//            GUMBO_LEX_BEFORE_ATTR_NAME
+static StateResult handle_self_closing_start_tag_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_tag_state._is_self_closing = true;
+      return emit_current_tag(parser, output);
+    case -1:
+      // Note: this branch does not set _reconsume_current_input.
+      add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      abandon_current_tag(parser);
+      return NEXT_CHAR;
+    default:
+      add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
+      tokenizer->_reconsume_current_input = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
+// Collects a bogus comment in one call: every character up to (but not
+// including) the next '>' or -1 is appended to the temporary buffer, with
+// '\0' replaced by 0xFFFD.  The input iterator is advanced directly inside
+// the loop.  Afterwards the tokenizer returns to GUMBO_LEX_DATA and the
+// result of emit_comment() is returned.
+static StateResult handle_bogus_comment_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  int ch;
+  for (ch = c; ch != -1 && ch != '>';
+       ch = utf8iterator_current(&tokenizer->_input)) {
+    append_char_to_temporary_buffer(parser, ch == '\0' ? 0xFFFD : ch);
+    utf8iterator_next(&tokenizer->_input);
+  }
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+  return emit_comment(parser, output);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
+// Decides what kind of construct follows "<!".  The candidates are tried in
+// order against the input at the current position:
+//   "--"       -> GUMBO_LEX_COMMENT_START
+//   "DOCTYPE"  -> GUMBO_LEX_DOCTYPE (doctype strings are allocated here)
+//   "[CDATA["  -> GUMBO_LEX_CDATA, only when _is_current_node_foreign is set
+//   otherwise  -> parse error, GUMBO_LEX_BOGUS_COMMENT with a cleared
+//                 temporary buffer
+// Every branch sets _reconsume_current_input and the function always returns
+// NEXT_CHAR.  The `c` and `output` parameters are not used.
+// NOTE(review): the last argument of utf8iterator_maybe_consume_match() is
+// presumably a case-sensitivity flag (false only for "DOCTYPE") -- confirm
+// against its declaration.
+static StateResult handle_markup_declaration_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (utf8iterator_maybe_consume_match(
+      &tokenizer->_input, "--", sizeof("--") - 1, true)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
+    tokenizer->_reconsume_current_input = true;
+  } else if (utf8iterator_maybe_consume_match(
+      &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
+    tokenizer->_reconsume_current_input = true;
+    // If we get here, we know we'll eventually emit a doctype token, so now is
+    // the time to initialize the doctype strings.  (Not in doctype_state_init,
+    // since then they'll leak if ownership never gets transferred to the
+    // doctype token.)
+    tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
+    tokenizer->_doc_type_state.public_identifier =
+        gumbo_copy_stringz(parser, "");
+    tokenizer->_doc_type_state.system_identifier =
+        gumbo_copy_stringz(parser, "");
+  } else if (tokenizer->_is_current_node_foreign &&
+             utf8iterator_maybe_consume_match(
+                &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
+    tokenizer->_reconsume_current_input = true;
+  } else {
+    add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
+    gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
+    tokenizer->_reconsume_current_input = true;
+    clear_temporary_buffer(parser);
+  }
+  return NEXT_CHAR;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
+// Handles the first character after "<!--".
+//   '-'   -> GUMBO_LEX_COMMENT_START_DASH
+//   '\0'  -> parse error; kUtf8ReplacementChar is buffered, GUMBO_LEX_COMMENT
+//   '>'   -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   -1    -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> the character is buffered, GUMBO_LEX_COMMENT
+static StateResult handle_comment_start_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      // "<!-->": the comment is emitted, but the call reports an error.
+      add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    default:
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
+// Handles the character after "<!---" (a comment opener followed by a dash).
+//   '-'   -> GUMBO_LEX_COMMENT_END
+//   '\0'  -> parse error; '-' and kUtf8ReplacementChar are buffered,
+//            GUMBO_LEX_COMMENT
+//   '>'   -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   -1    -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> '-' and the character are buffered, GUMBO_LEX_COMMENT
+static StateResult handle_comment_start_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      // The pending dash turned out to be comment text, so buffer it first.
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    default:
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
+// Handles one character of comment text.
+//   '-'   -> GUMBO_LEX_COMMENT_END_DASH (the dash is not buffered here)
+//   '\0'  -> parse error; kUtf8ReplacementChar is buffered
+//   -1    -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> the character is buffered
+static StateResult handle_comment_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    default:
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
+// Handles the character after a single '-' seen inside comment text.
+//   '-'   -> GUMBO_LEX_COMMENT_END
+//   '\0'  -> parse error; '-' and kUtf8ReplacementChar are buffered,
+//            GUMBO_LEX_COMMENT
+//   -1    -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> '-' and the character are buffered, GUMBO_LEX_COMMENT
+static StateResult handle_comment_end_dash_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    default:
+      // The lone dash was ordinary text: buffer it before the new character.
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
+// Handles the character after "--" seen inside comment text.
+//   '>'   -> GUMBO_LEX_DATA, returning the result of emit_comment()
+//   '\0'  -> parse error; "--" and kUtf8ReplacementChar are buffered,
+//            GUMBO_LEX_COMMENT
+//   '!'   -> parse error; GUMBO_LEX_COMMENT_END_BANG
+//   '-'   -> parse error; one more '-' is buffered, state unchanged
+//   -1    -> parse error (GUMBO_ERR_COMMENT_EOF); comment emitted,
+//            GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> parse error; "--" and the character are buffered,
+//            GUMBO_LEX_COMMENT
+static StateResult handle_comment_end_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_comment(parser, output);
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '!':
+      add_parse_error(parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
+      return NEXT_CHAR;
+    case '-':
+      add_parse_error(parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
+      append_char_to_temporary_buffer(parser, '-');
+      return NEXT_CHAR;
+    case -1:
+      // End of input inside a comment: report it with the same error code
+      // the other comment states use for -1, not as a NUL-byte error.
+      add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    default:
+      add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
+// Handles the character after "--!" seen inside comment text.
+//   '-'   -> "--!" is buffered, GUMBO_LEX_COMMENT_END_DASH
+//   '>'   -> GUMBO_LEX_DATA, returning the result of emit_comment()
+//   '\0'  -> parse error; "--!" and kUtf8ReplacementChar are buffered,
+//            GUMBO_LEX_COMMENT
+//   -1    -> parse error; comment emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> "--!" and the character are buffered, GUMBO_LEX_COMMENT
+static StateResult handle_comment_end_bang_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '-':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '!');
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      return emit_comment(parser, output);
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '!');
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_comment(parser, output);
+      return RETURN_ERROR;
+    default:
+      // "--!" did not end the comment, so it becomes part of the text.
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '-');
+      append_char_to_temporary_buffer(parser, '!');
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
+// Handles the character that follows the "DOCTYPE" keyword.  The temporary
+// buffer is asserted to be empty on entry.
+//   whitespace -> GUMBO_LEX_BEFORE_DOCTYPE_NAME
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, the character is reconsumed
+//                 in GUMBO_LEX_BEFORE_DOCTYPE_NAME
+static StateResult handle_doctype_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  assert(!tokenizer->_temporary_buffer.length);
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
+      tokenizer->_reconsume_current_input = true;
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
+// Skips whitespace before the doctype name and starts collecting the name.
+//   whitespace -> ignored
+//   '\0'       -> parse error; force_quirks set, kUtf8ReplacementChar is
+//                 buffered, GUMBO_LEX_DOCTYPE_NAME
+//   '>'        -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> force_quirks cleared, ensure_lowercase(c) is buffered,
+//                 GUMBO_LEX_DOCTYPE_NAME
+static StateResult handle_before_doctype_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
+      tokenizer->_doc_type_state.force_quirks = true;
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // First character of the name; this also resets any force_quirks
+      // value set by an earlier state.
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
+      tokenizer->_doc_type_state.force_quirks = false;
+      append_char_to_temporary_buffer(parser, ensure_lowercase(c));
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
+// Collects the doctype name in the temporary buffer.
+//   whitespace -> name finished, GUMBO_LEX_AFTER_DOCTYPE_NAME
+//   '>'        -> name finished, doctype emitted, GUMBO_LEX_DATA,
+//                 RETURN_SUCCESS
+//   '\0'       -> parse error; kUtf8ReplacementChar is buffered
+//   -1         -> parse error; force_quirks set, name finished, doctype
+//                 emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> force_quirks cleared, ensure_lowercase(c) is buffered
+// Wherever the name is finished, the previous _doc_type_state.name is
+// deallocated before finish_temporary_buffer() is handed its address
+// (presumably to store the newly built string there -- confirm against
+// finish_temporary_buffer).
+static StateResult handle_doctype_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
+      gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
+      finish_temporary_buffer(
+          parser, &tokenizer->_doc_type_state.name);
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
+      finish_temporary_buffer(
+          parser, &tokenizer->_doc_type_state.name);
+      emit_doctype(parser, output);
+      return RETURN_SUCCESS;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
+      finish_temporary_buffer(
+          parser, &tokenizer->_doc_type_state.name);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // Re-selecting GUMBO_LEX_DOCTYPE_NAME keeps the tokenizer in this state.
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
+      tokenizer->_doc_type_state.force_quirks = false;
+      append_char_to_temporary_buffer(parser, ensure_lowercase(c));
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
+// Handles what follows the doctype name.
+//   whitespace -> ignored
+//   '>'        -> doctype emitted, GUMBO_LEX_DATA, RETURN_SUCCESS
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> the input is matched against "PUBLIC", then "SYSTEM";
+//                 a match selects the corresponding keyword state with
+//                 _reconsume_current_input set; no match is a parse error
+//                 that sets force_quirks and selects GUMBO_LEX_BOGUS_DOCTYPE
+static StateResult handle_after_doctype_name_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_doctype(parser, output);
+      return RETURN_SUCCESS;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // NOTE(review): the trailing `false` is presumably "not case
+      // sensitive" -- confirm against utf8iterator_maybe_consume_match.
+      if (utf8iterator_maybe_consume_match(
+          &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
+        gumbo_tokenizer_set_state(
+            parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
+        tokenizer->_reconsume_current_input = true;
+      } else if (utf8iterator_maybe_consume_match(
+          &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
+        gumbo_tokenizer_set_state(
+            parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
+        tokenizer->_reconsume_current_input = true;
+      } else {
+        add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
+        gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+        tokenizer->_doc_type_state.force_quirks = true;
+      }
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
+// Handles the character that follows the "PUBLIC" keyword in a doctype.
+//   whitespace -> GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID
+//   '"' / '\'' -> parse error; the matching quoted public-id state is
+//                 selected (the temporary buffer is asserted to be empty)
+//   '>'        -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, GUMBO_LEX_BOGUS_DOCTYPE
+// The default branch deliberately does not emit: handle_bogus_doctype_state
+// emits the doctype when it reaches '>' or -1, so emitting here as well
+// would produce two doctype tokens for one declaration.
+static StateResult handle_after_doctype_public_keyword_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
+      return NEXT_CHAR;
+    case '"':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
+      return NEXT_CHAR;
+    case '\'':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // Leave emission to the bogus-doctype state (see header comment).
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
+// Skips whitespace between "PUBLIC" and the public identifier.
+//   whitespace -> ignored
+//   '"' / '\'' -> the matching quoted public-id state is selected (the
+//                 temporary buffer is asserted to be empty)
+//   '>'        -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, GUMBO_LEX_BOGUS_DOCTYPE
+// The default branch deliberately does not emit: handle_bogus_doctype_state
+// emits the doctype when it reaches '>' or -1, so emitting here as well
+// would produce two doctype tokens for one declaration.
+static StateResult handle_before_doctype_public_id_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '"':
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
+      return NEXT_CHAR;
+    case '\'':
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // Leave emission to the bogus-doctype state (see header comment).
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
+// Collects a double-quoted public identifier in the temporary buffer.
+//   '"'   -> public id finished, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID
+//   '\0'  -> parse error; kUtf8ReplacementChar is buffered
+//   '>'   -> parse error; force_quirks set, public id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   -1    -> parse error; force_quirks set, public id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> the character is buffered
+static StateResult handle_doctype_public_id_double_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '"':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
+      finish_doctype_public_id(parser);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      // Identifier was never closed: keep what was collected, force quirks.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_public_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_public_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
+// Collects a single-quoted public identifier in the temporary buffer.
+//   '\''  -> public id finished, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID
+//   '\0'  -> parse error; kUtf8ReplacementChar is buffered
+//   '>'   -> parse error; force_quirks set, public id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   -1    -> parse error; force_quirks set, public id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> the character is buffered
+static StateResult handle_doctype_public_id_single_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\'':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
+      finish_doctype_public_id(parser);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      // Identifier was never closed: keep what was collected, force quirks.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_public_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_public_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
+// Handles the character that follows the closing quote of the public id.
+//   whitespace -> GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID
+//   '>'        -> doctype emitted, GUMBO_LEX_DATA, RETURN_SUCCESS
+//   '"' / '\'' -> parse error; the matching quoted system-id state is
+//                 selected (the temporary buffer is asserted to be empty)
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, GUMBO_LEX_BOGUS_DOCTYPE
+static StateResult handle_after_doctype_public_id_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_doctype(parser, output);
+      return RETURN_SUCCESS;
+    case '"':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+      return NEXT_CHAR;
+    case '\'':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+      return NEXT_CHAR;
+    case -1:
+      // End of input: emit the doctype collected so far with quirks forced,
+      // the same way every other DOCTYPE state handles -1, instead of
+      // silently dropping the token.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // No emit here: the bogus-doctype state emits on '>' or -1.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
+// Skips whitespace between the public and system identifiers.
+//   whitespace -> ignored
+//   '>'        -> doctype emitted, GUMBO_LEX_DATA, RETURN_SUCCESS
+//   '"' / '\'' -> the matching quoted system-id state is selected (the
+//                 temporary buffer is asserted to be empty)
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, GUMBO_LEX_BOGUS_DOCTYPE
+// The default branch deliberately does not emit: handle_bogus_doctype_state
+// emits the doctype when it reaches '>' or -1, so emitting here as well
+// would produce two doctype tokens for one declaration.
+static StateResult handle_between_doctype_public_system_id_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_doctype(parser, output);
+      return RETURN_SUCCESS;
+    case '"':
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+      return NEXT_CHAR;
+    case '\'':
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+      return NEXT_CHAR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // Leave emission to the bogus-doctype state (see header comment).
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
+// Handles the character that follows the "SYSTEM" keyword in a doctype.
+//   whitespace -> GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID
+//   '"' / '\'' -> parse error; the matching quoted system-id state is
+//                 selected (the temporary buffer is asserted to be empty)
+//   '>'        -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, GUMBO_LEX_BOGUS_DOCTYPE
+//                 (nothing is emitted in this branch)
+static StateResult handle_after_doctype_system_keyword_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
+      return NEXT_CHAR;
+    case '"':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+      return NEXT_CHAR;
+    case '\'':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
+// Skips whitespace between "SYSTEM" and the system identifier.
+//   whitespace -> ignored
+//   '"' / '\'' -> the matching quoted system-id state is selected (the
+//                 temporary buffer is asserted to be empty)
+//   '>'        -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; force_quirks set, GUMBO_LEX_BOGUS_DOCTYPE
+//                 (nothing is emitted in this branch)
+static StateResult handle_before_doctype_system_id_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '"':
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
+      return NEXT_CHAR;
+    case '\'':
+      assert(temporary_buffer_equals(parser, ""));
+      gumbo_tokenizer_set_state(
+          parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      tokenizer->_doc_type_state.force_quirks = true;
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
+// Collects a double-quoted system identifier in the temporary buffer.
+//   '"'   -> system id finished, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID
+//   '\0'  -> parse error; kUtf8ReplacementChar is buffered
+//   '>'   -> parse error; force_quirks set, system id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   -1    -> parse error; force_quirks set, system id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> the character is buffered
+static StateResult handle_doctype_system_id_double_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '"':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
+      finish_doctype_system_id(parser);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      // Identifier was never closed: keep what was collected, force quirks.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_system_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_system_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
+// Collects a single-quoted system identifier in the temporary buffer.
+//   '\''  -> system id finished, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID
+//   '\0'  -> parse error; kUtf8ReplacementChar is buffered
+//   '>'   -> parse error; force_quirks set, system id finished, doctype
+//            emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   -1    -> parse error (GUMBO_ERR_DOCTYPE_EOF); force_quirks set, system
+//            id finished, doctype emitted, GUMBO_LEX_DATA, RETURN_ERROR
+//   other -> the character is buffered
+static StateResult handle_doctype_system_id_single_quoted_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\'':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
+      finish_doctype_system_id(parser);
+      return NEXT_CHAR;
+    case '\0':
+      add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
+      append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
+      return NEXT_CHAR;
+    case '>':
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_system_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    case -1:
+      // End of input gets the EOF error code, matching the double-quoted
+      // handler and every other DOCTYPE state.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      finish_doctype_system_id(parser);
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      append_char_to_temporary_buffer(parser, c);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
+// Handles what follows the closing quote of the system identifier.
+//   whitespace -> ignored
+//   '>'        -> doctype emitted, GUMBO_LEX_DATA, RETURN_SUCCESS
+//   -1         -> parse error; force_quirks set, doctype emitted,
+//                 GUMBO_LEX_DATA, RETURN_ERROR
+//   other      -> parse error; GUMBO_LEX_BOGUS_DOCTYPE
+static StateResult handle_after_doctype_system_id_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  switch (c) {
+    case '\t':
+    case '\n':
+    case '\f':
+    case ' ':
+      return NEXT_CHAR;
+    case '>':
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      emit_doctype(parser, output);
+      return RETURN_SUCCESS;
+    case -1:
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+      tokenizer->_doc_type_state.force_quirks = true;
+      emit_doctype(parser, output);
+      return RETURN_ERROR;
+    default:
+      // Unlike the other bogus-doctype transitions in this file, this one
+      // leaves force_quirks untouched.
+      add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
+      gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
+      return NEXT_CHAR;
+  }
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
+// Discards the remainder of a malformed doctype.  Every character other than
+// '>' and -1 is skipped; on '>' or -1 the tokenizer returns to
+// GUMBO_LEX_DATA, the doctype is emitted and RETURN_ERROR is returned.
+static StateResult handle_bogus_doctype_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  if (c != '>' && c != -1) {
+    // Still inside the bogus doctype: keep skipping.
+    return NEXT_CHAR;
+  }
+
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+  emit_doctype(parser, output);
+  return RETURN_ERROR;
+}
+
+// Tokenizer handler for the CDATA section state:
+// http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
+//
+// Characters are emitted one at a time until the "]]>" terminator (or end of
+// input) is reached, which sends the tokenizer back to the data state.
+static StateResult handle_cdata_state(
+    GumboParser* parser, GumboTokenizerState* tokenizer,
+    int c, GumboToken* output) {
+  // The terminator match is only attempted when we are not at EOF; a
+  // successful match consumes the terminator from the input.
+  const bool section_ended = (c == -1) || utf8iterator_maybe_consume_match(
+      &tokenizer->_input, "]]>", sizeof("]]>") - 1, true);
+  if (!section_ended) {
+    return emit_current_char(parser, output);
+  }
+  tokenizer->_reconsume_current_input = true;
+  reset_token_start_point(tokenizer);
+  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+  return NEXT_CHAR;
+}
+
+// Signature shared by every tokenizer state handler: the parser, its
+// tokenizer state, the current code point (-1 at end of input) and the token
+// to fill in.
+typedef StateResult (*GumboLexerStateFunction)(
+    GumboParser*, GumboTokenizerState*, int, GumboToken*);
+
+// One handler per tokenizer state.  gumbo_lex() indexes this table directly
+// with the current state value, so the entries must stay in exactly the same
+// order as the GumboTokenizerEnum enumerators.  The table is never modified
+// at run time, hence const.
+static const GumboLexerStateFunction dispatch_table[] = {
+  handle_data_state,
+  handle_char_ref_in_data_state,
+  handle_rcdata_state,
+  handle_char_ref_in_rcdata_state,
+  handle_rawtext_state,
+  handle_script_state,
+  handle_plaintext_state,
+  handle_tag_open_state,
+  handle_end_tag_open_state,
+  handle_tag_name_state,
+  handle_rcdata_lt_state,
+  handle_rcdata_end_tag_open_state,
+  handle_rcdata_end_tag_name_state,
+  handle_rawtext_lt_state,
+  handle_rawtext_end_tag_open_state,
+  handle_rawtext_end_tag_name_state,
+  handle_script_lt_state,
+  handle_script_end_tag_open_state,
+  handle_script_end_tag_name_state,
+  handle_script_escaped_start_state,
+  handle_script_escaped_start_dash_state,
+  handle_script_escaped_state,
+  handle_script_escaped_dash_state,
+  handle_script_escaped_dash_dash_state,
+  handle_script_escaped_lt_state,
+  handle_script_escaped_end_tag_open_state,
+  handle_script_escaped_end_tag_name_state,
+  handle_script_double_escaped_start_state,
+  handle_script_double_escaped_state,
+  handle_script_double_escaped_dash_state,
+  handle_script_double_escaped_dash_dash_state,
+  handle_script_double_escaped_lt_state,
+  handle_script_double_escaped_end_state,
+  handle_before_attr_name_state,
+  handle_attr_name_state,
+  handle_after_attr_name_state,
+  handle_before_attr_value_state,
+  handle_attr_value_double_quoted_state,
+  handle_attr_value_single_quoted_state,
+  handle_attr_value_unquoted_state,
+  handle_char_ref_in_attr_value_state,
+  handle_after_attr_value_quoted_state,
+  handle_self_closing_start_tag_state,
+  handle_bogus_comment_state,
+  handle_markup_declaration_state,
+  handle_comment_start_state,
+  handle_comment_start_dash_state,
+  handle_comment_state,
+  handle_comment_end_dash_state,
+  handle_comment_end_state,
+  handle_comment_end_bang_state,
+  handle_doctype_state,
+  handle_before_doctype_name_state,
+  handle_doctype_name_state,
+  handle_after_doctype_name_state,
+  handle_after_doctype_public_keyword_state,
+  handle_before_doctype_public_id_state,
+  handle_doctype_public_id_double_quoted_state,
+  handle_doctype_public_id_single_quoted_state,
+  handle_after_doctype_public_id_state,
+  handle_between_doctype_public_system_id_state,
+  handle_after_doctype_system_keyword_state,
+  handle_before_doctype_system_id_state,
+  handle_doctype_system_id_double_quoted_state,
+  handle_doctype_system_id_single_quoted_state,
+  handle_after_doctype_system_id_state,
+  handle_bogus_doctype_state,
+  handle_cdata_state
+};
+
+// Lexes the next token into *output.  Returns true when a buffered token is
+// flushed or a state handler reports RETURN_SUCCESS, and false when a state
+// handler reports RETURN_ERROR.
+bool gumbo_lex(GumboParser* parser, GumboToken* output) {
+  // The spec imposes three requirements:
+  //
+  // 1. The parser handles each token immediately when it is emitted.
+  // 2. Some states (eg. CDATA, or various error conditions) emit several
+  // tokens without leaving the state.
+  // 3. The tokenizer frequently reconsumes the same character in a different
+  // state.
+  //
+  // Consequently every piece of state lives in the GumboTokenizer struct
+  // rather than in locals of this function.  We can then return with one
+  // token and re-enter right away in the same state, on the same input, to
+  // produce the next one.  The emit_* functions are in charge of moving things
+  // forward (eg. flushing the chardata buffer, reading the next input
+  // character) so that this cannot loop forever.
+  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
+
+  // A buffered character, if any, is emitted before new input is examined.
+  if (tokenizer->_buffered_emit_char != kGumboNoChar) {
+    tokenizer->_reconsume_current_input = true;
+    emit_char(parser, tokenizer->_buffered_emit_char, output);
+    // The input was deliberately not advanced above; clear the flag again so
+    // that the *next* character is not consumed twice.
+    tokenizer->_reconsume_current_input = false;
+    tokenizer->_buffered_emit_char = kGumboNoChar;
+    return true;
+  }
+
+  if (maybe_emit_from_temporary_buffer(parser, output)) {
+    return true;
+  }
+
+  for (;;) {
+    assert(!tokenizer->_temporary_buffer_emit);
+    assert(tokenizer->_buffered_emit_char == kGumboNoChar);
+    const int c = utf8iterator_current(&tokenizer->_input);
+    gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
+    const StateResult result =
+        dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
+    // The reconsume flag has to be cleared before any return below, otherwise
+    // certain states could loop forever.
+    const bool should_advance = !tokenizer->_reconsume_current_input;
+    tokenizer->_reconsume_current_input = false;
+
+    switch (result) {
+      case RETURN_SUCCESS:
+        return true;
+      case RETURN_ERROR:
+        return false;
+      default:
+        // No token yet: keep feeding characters to the state machine.
+        break;
+    }
+
+    if (should_advance) {
+      utf8iterator_next(&tokenizer->_input);
+    }
+  }
+}
+
+// Releases the memory owned by a token's payload through the parser's
+// deallocator.  The GumboToken struct itself is not freed, and a NULL token is
+// ignored.  Only doctype, start-tag and comment tokens own anything.
+void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
+  if (!token) {
+    return;
+  }
+
+  if (token->type == GUMBO_TOKEN_DOCTYPE) {
+    GumboTokenDocType* doctype = &token->v.doc_type;
+    gumbo_parser_deallocate(parser, (void*) doctype->name);
+    gumbo_parser_deallocate(parser, (void*) doctype->public_identifier);
+    gumbo_parser_deallocate(parser, (void*) doctype->system_identifier);
+  } else if (token->type == GUMBO_TOKEN_START_TAG) {
+    GumboVector* attributes = &token->v.start_tag.attributes;
+    for (int i = 0; i < attributes->length; ++i) {
+      GumboAttribute* attr = attributes->data[i];
+      // A slot may have been nulled out if this token was merged with another.
+      if (attr) {
+        gumbo_destroy_attribute(parser, attr);
+      }
+    }
+    gumbo_parser_deallocate(parser, (void*) attributes->data);
+  } else if (token->type == GUMBO_TOKEN_COMMENT) {
+    gumbo_parser_deallocate(parser, (void*) token->v.text);
+  }
+}

+ 123 - 0
gumbo/tokenizer.h

@@ -0,0 +1,123 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// This contains an implementation of a tokenizer for HTML5.  It consumes a
+// buffer of UTF-8 characters, and then emits a stream of tokens.
+
+#ifndef GUMBO_TOKENIZER_H_
+#define GUMBO_TOKENIZER_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "gumbo.h"
+#include "token_type.h"
+#include "tokenizer_states.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalParser;
+
+// Struct containing all information pertaining to doctype tokens.
+typedef struct GumboInternalTokenDocType {
+  const char* name;               // Released by gumbo_token_destroy().
+  const char* public_identifier;  // Released by gumbo_token_destroy().
+  const char* system_identifier;  // Released by gumbo_token_destroy().
+  bool force_quirks;  // Presumably the spec's force-quirks flag - confirm.
+  // There's no way to tell a 0-length public or system ID apart from the
+  // absence of a public or system ID, but they're handled differently by the
+  // spec, so we need bool flags for them.
+  bool has_public_identifier;
+  bool has_system_identifier;
+} GumboTokenDocType;
+
+// Struct containing all information pertaining to start tag tokens.
+typedef struct GumboInternalTokenStartTag {
+  GumboTag tag;
+  GumboVector /* GumboAttribute */ attributes;  // Entries may be NULL; freed by gumbo_token_destroy().
+  bool is_self_closing;
+} GumboTokenStartTag;
+
+// A data structure representing a single token in the input stream.  This
+// contains an enum for the type, the source position, a GumboStringPiece
+// pointing to the original text, and then a union for any parsed data.
+typedef struct GumboInternalToken {
+  GumboTokenType type;  // Selects which member of 'v' is meaningful.
+  GumboSourcePosition position;
+  GumboStringPiece original_text;
+  union {
+    GumboTokenDocType doc_type;    // For GUMBO_TOKEN_DOCTYPE.
+    GumboTokenStartTag start_tag;  // For GUMBO_TOKEN_START_TAG.
+    GumboTag end_tag;              // Presumably for end tag tokens - confirm.
+    const char* text;    // For comments (GUMBO_TOKEN_COMMENT).
+    int character;      // For character, whitespace, null, and EOF tokens.
+  } v;
+} GumboToken;
+
+// Initializes the tokenizer state within the GumboParser object, setting up a
+// parse of the specified text.
+void gumbo_tokenizer_state_init(
+    struct GumboInternalParser* parser, const char* text, size_t text_length);
+
+// Destroys the tokenizer state within the GumboParser object, freeing any
+// dynamically-allocated structures within it.
+void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
+
+// Sets the tokenizer state to the specified value.  This is needed by some
+// parser states, which alter the state of the tokenizer in response to tags
+// seen.
+void gumbo_tokenizer_set_state(
+    struct GumboInternalParser* parser, GumboTokenizerEnum state);
+
+// Flags whether the current node is a foreign content element.  This is
+// necessary for the markup declaration open state, where the tokenizer must be
+// aware of the state of the parser to properly tokenize bad comment tags.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
+void gumbo_tokenizer_set_is_current_node_foreign(
+    struct GumboInternalParser* parser, bool is_foreign);
+
+// Lexes a single token from the specified buffer, filling the output with the
+// parsed GumboToken data structure.  Returns true for a successful
+// tokenization, false if a parse error occurs.
+//
+// Example:
+//   struct GumboInternalParser parser;
+//   GumboToken output;
+//   gumbo_tokenizer_state_init(&parser, text, strlen(text));
+//   while (gumbo_lex(&parser, &output)) {
+//     ...do stuff with output.
+//     gumbo_token_destroy(&parser, &output);
+//   }
+//   gumbo_tokenizer_state_destroy(&parser);
+bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
+
+// Frees the internally-allocated pointers within a GumboToken.  Note that this
+// doesn't free the token itself, since oftentimes it will be allocated on the
+// stack.  A simple call to free() (or GumboParser->deallocator, if
+// appropriate) can handle that.
+//
+// Note that if you are handing over ownership of the internal strings to some
+// other data structure - for example, a parse tree - these do not need to be
+// freed.
+void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_TOKENIZER_H_

+ 103 - 0
gumbo/tokenizer_states.h

@@ -0,0 +1,103 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// This contains the list of states used in the tokenizer.  Although at first
+// glance it seems like these could be kept internal to the tokenizer, several
+// of the actions in the parser require that it reach into the tokenizer and
+// reset the tokenizer state.  For that to work, it needs to have the
+// definitions of individual states available.
+//
+// This may also be useful for providing more detailed error messages for parse
+// errors, as we can match up states and inputs in a table without having to
+// clutter the tokenizer code with lots of precise error messages.
+
+#ifndef GUMBO_TOKENIZER_STATES_H_
+#define GUMBO_TOKENIZER_STATES_H_
+
+// The enumerator values are used as indices into the tokenizer's dispatch
+// table (one handler per state, same order), so any change here must be mirrored there.
+typedef enum {
+  GUMBO_LEX_DATA,
+  GUMBO_LEX_CHAR_REF_IN_DATA,
+  GUMBO_LEX_RCDATA,
+  GUMBO_LEX_CHAR_REF_IN_RCDATA,
+  GUMBO_LEX_RAWTEXT,
+  GUMBO_LEX_SCRIPT,
+  GUMBO_LEX_PLAINTEXT,
+  GUMBO_LEX_TAG_OPEN,
+  GUMBO_LEX_END_TAG_OPEN,
+  GUMBO_LEX_TAG_NAME,
+  GUMBO_LEX_RCDATA_LT,
+  GUMBO_LEX_RCDATA_END_TAG_OPEN,
+  GUMBO_LEX_RCDATA_END_TAG_NAME,
+  GUMBO_LEX_RAWTEXT_LT,
+  GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
+  GUMBO_LEX_RAWTEXT_END_TAG_NAME,
+  GUMBO_LEX_SCRIPT_LT,
+  GUMBO_LEX_SCRIPT_END_TAG_OPEN,
+  GUMBO_LEX_SCRIPT_END_TAG_NAME,
+  GUMBO_LEX_SCRIPT_ESCAPED_START,
+  GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
+  GUMBO_LEX_SCRIPT_ESCAPED,
+  GUMBO_LEX_SCRIPT_ESCAPED_DASH,
+  GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
+  GUMBO_LEX_SCRIPT_ESCAPED_LT,
+  GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
+  GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
+  GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
+  GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
+  GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
+  GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
+  GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
+  GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
+  GUMBO_LEX_BEFORE_ATTR_NAME,
+  GUMBO_LEX_ATTR_NAME,
+  GUMBO_LEX_AFTER_ATTR_NAME,
+  GUMBO_LEX_BEFORE_ATTR_VALUE,
+  GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
+  GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
+  GUMBO_LEX_ATTR_VALUE_UNQUOTED,
+  GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
+  GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
+  GUMBO_LEX_SELF_CLOSING_START_TAG,
+  GUMBO_LEX_BOGUS_COMMENT,
+  GUMBO_LEX_MARKUP_DECLARATION,
+  GUMBO_LEX_COMMENT_START,
+  GUMBO_LEX_COMMENT_START_DASH,
+  GUMBO_LEX_COMMENT,
+  GUMBO_LEX_COMMENT_END_DASH,
+  GUMBO_LEX_COMMENT_END,
+  GUMBO_LEX_COMMENT_END_BANG,
+  GUMBO_LEX_DOCTYPE,
+  GUMBO_LEX_BEFORE_DOCTYPE_NAME,
+  GUMBO_LEX_DOCTYPE_NAME,
+  GUMBO_LEX_AFTER_DOCTYPE_NAME,
+  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
+  GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
+  GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
+  GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
+  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
+  GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
+  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
+  GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
+  GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
+  GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
+  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
+  GUMBO_LEX_BOGUS_DOCTYPE,
+  GUMBO_LEX_CDATA  // Last state; new states also need a new dispatch table entry.
+} GumboTokenizerEnum;
+
+#endif  // GUMBO_TOKENIZER_STATES_H_

+ 269 - 0
gumbo/utf8.c

@@ -0,0 +1,269 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "utf8.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>    // For strncasecmp.
+
+#include "error.h"
+#include "gumbo.h"
+#include "parser.h"
+#include "util.h"
+#include "vector.h"
+
+const int kUtf8ReplacementChar = 0xFFFD;  // U+FFFD; substituted for undecodable input by read_char().
+
+// Reference material:
+// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
+// RFC 3629: http://tools.ietf.org/html/rfc3629
+// HTML5 Unicode handling:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
+
+// Adds a decoding error to the parser's error list, based on the current state
+// of the Utf8Iterator.
+static void add_error(Utf8Iterator* iter, GumboErrorType type) {
+  GumboParser* parser = iter->_parser;
+
+  GumboError* error = gumbo_add_error(parser);
+  if (!error) {
+    return;
+  }
+  error->type = type;
+  error->position = iter->_pos;
+  error->original_text = iter->_start;
+
+  // At the point the error is recorded, the code point hasn't been computed
+  // yet (and can't be, because it's invalid), so we need to build up the raw
+  // hex value from the bytes under the cursor.
+  uint64_t code_point = 0;
+  for (int i = 0; i < iter->_width; ++i) {
+    code_point = (code_point << 8) | (unsigned char) iter->_start[i];
+  }
+  error->v.codepoint = code_point;
+}
+
+// Reads the next UTF-8 character in the iter.
+// This assumes that iter->_start points to the beginning of the character.
+// When this method returns, iter->_width and iter->_current will be set
+// appropriately, and any decoding problems will have been recorded on the
+// parser's error list.  Undecodable or forbidden sequences yield
+// kUtf8ReplacementChar, and a carriage return (alone or followed by a line
+// feed) yields '\n'.
+static void read_char(Utf8Iterator* iter) {
+  unsigned char c;
+  unsigned char mask = '\0';
+  int is_bad_char = false;
+
+  c = (unsigned char) *iter->_start;
+  if (c < 0x80) {
+    // Valid one-byte sequence.
+    iter->_width = 1;
+    mask = 0xFF;
+  } else if (c < 0xC0) {
+    // Continuation character not following a multibyte sequence.
+    // The HTML5 spec here says to consume the byte and output a replacement
+    // character.
+    iter->_width = 1;
+    is_bad_char = true;
+  } else if (c < 0xE0) {
+    iter->_width = 2;
+    mask = 0x1F;                // 00011111 in binary.
+    if (c < 0xC2) {
+      // Overlong encoding; error according to UTF8/HTML5 spec.
+      is_bad_char = true;
+    }
+  } else if (c < 0xF0) {
+    iter->_width = 3;
+    mask = 0xF;                 // 00001111 in binary.
+  } else if (c < 0xF5) {
+    iter->_width = 4;
+    mask = 0x7;                 // 00000111 in binary.
+  } else if (c < 0xF8) {
+    // The following cases are all errors, but we need to handle them separately
+    // so that we consume the proper number of bytes from the input stream
+    // before replacing them with the replacement char.  The HTML5 spec
+    // specifies that we should consume the shorter of the length specified by
+    // the first bit or the run leading up to the first non-continuation
+    // character.
+    iter->_width = 5;
+    is_bad_char = true;
+  } else if (c < 0xFC) {
+    iter->_width = 6;
+    is_bad_char = true;
+  } else if (c < 0xFE) {
+    iter->_width = 7;
+    is_bad_char = true;
+  } else {
+    iter->_width = 1;
+    is_bad_char = true;
+  }
+
+  // Check to make sure we have enough bytes left in the iter to read all that
+  // we want.  If not, we record a truncation error, mark this as a bad
+  // character, and adjust the current width so that it consumes the rest of the
+  // iter.
+  //
+  // The comparison is done on the number of remaining bytes rather than on
+  // _start + _width, because that sum may point more than one element past the
+  // end of the buffer, which is undefined pointer arithmetic in C.
+  uint64_t code_point = c & mask;
+  if (iter->_width > iter->_end - iter->_start) {
+    iter->_width = (int) (iter->_end - iter->_start);
+    add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
+    is_bad_char = true;
+  }
+
+  // Now we decode continuation bytes, shift them appropriately, and build up
+  // the appropriate code point.
+  assert(iter->_width < 8);
+  for (int i = 1; i < iter->_width; ++i) {
+    c = (unsigned char) iter->_start[i];
+    if (c < 0x80 || c > 0xBF) {
+      // Per HTML5 spec, we don't include the invalid continuation char in the
+      // run that we consume here.
+      iter->_width = i;
+      is_bad_char = true;
+      break;
+    }
+    code_point = (code_point << 6) | (c & ~0x80);
+  }
+  if (code_point > 0x10FFFF) is_bad_char = true;
+
+  // If we had a decode error, set the current code point to the replacement
+  // character and record the error.  Ditto if we have a code point that is
+  // explicitly on the list of characters prohibited by the HTML5 spec, such as
+  // control characters.
+  if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
+    add_error(iter, GUMBO_ERR_UTF8_INVALID);
+    code_point = kUtf8ReplacementChar;
+  }
+
+  // This is the special handling for carriage returns that is mandated by the
+  // HTML5 spec.  Since we're looking for particular 7-bit literal characters,
+  // we operate in terms of chars and only need a check for iter overrun,
+  // instead of having to read in a full next code point.
+  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
+  if (code_point == '\r') {
+    const char* next = iter->_start + iter->_width;
+    if (next < iter->_end && *next == '\n') {
+      // Advance the iter, as if the carriage return didn't exist.
+      ++iter->_start;
+      // Preserve the true offset, since other tools that look at it may be
+      // unaware of HTML5's rules for converting \r into \n.
+      ++iter->_pos.offset;
+    }
+    code_point = '\n';
+  }
+
+  // At this point the code point is either a valid character or the
+  // replacement character, so we set it, and we're done.
+  iter->_current = code_point;
+}
+
+// Moves the stored source position past the current code point: the byte
+// offset grows by the code point's width, a newline starts a new line at
+// column 1, a tab jumps according to the configured tab stop, and any other
+// character advances by one column.
+static void update_position(Utf8Iterator* iter) {
+  GumboSourcePosition* pos = &iter->_pos;
+  pos->offset += iter->_width;
+  switch (iter->_current) {
+    case '\n':
+      ++pos->line;
+      pos->column = 1;
+      break;
+    case '\t': {
+      const int tab_stop = iter->_parser->_options->tab_stop;
+      pos->column = ((pos->column / tab_stop) + 1) * tab_stop;
+      break;
+    }
+    default:
+      ++pos->column;
+      break;
+  }
+}
+
+// Reports whether a code point is one the HTML5 spec forbids in the input:
+// the control ranges 0x01-0x08, 0x0B, 0x0E-0x1F and 0x7F-0x9F, the range
+// 0xFDD0-0xFDEF, and every value whose low 16 bits are 0xFFFE or 0xFFFF.
+// NUL (0) is not reported by this function.
+bool utf8_is_invalid_code_point(int c) {
+  if (c >= 0x1 && c <= 0x8) {
+    return true;
+  }
+  if (c == 0xB) {
+    return true;
+  }
+  if (c >= 0xE && c <= 0x1F) {
+    return true;
+  }
+  if (c >= 0x7F && c <= 0x9F) {
+    return true;
+  }
+  if (c >= 0xFDD0 && c <= 0xFDEF) {
+    return true;
+  }
+  const int low16 = c & 0xFFFF;
+  return low16 == 0xFFFE || low16 == 0xFFFF;
+}
+
+// Prepares iter to walk the source_length bytes starting at source.  The
+// position starts at line 1, column 1, offset 0, and the mark starts at the
+// beginning of the input, so utf8iterator_reset() and
+// utf8iterator_fill_error_at_mark() are well-defined even before the first
+// utf8iterator_mark() call.  The first code point is decoded right away; for
+// empty input the current code point is -1.
+void utf8iterator_init(
+    GumboParser* parser, const char* source, size_t source_length,
+    Utf8Iterator* iter) {
+  iter->_start = source;
+  iter->_end = source + source_length;
+  iter->_width = 0;
+  iter->_pos.line = 1;
+  iter->_pos.column = 1;
+  iter->_pos.offset = 0;
+  // Previously left uninitialized, although the struct documentation says
+  // the mark initially points at the beginning of the input.
+  iter->_mark = source;
+  iter->_mark_pos = iter->_pos;
+  // Must be set before read_char(), which reports errors through the parser.
+  iter->_parser = parser;
+  if (source_length) {
+    read_char(iter);
+  } else {
+    iter->_current = -1;
+  }
+}
+
+// Advances the iterator by one code point.  Once the end of the input is
+// reached the current code point becomes -1 and its width 0, so further calls
+// no longer move the byte pointer or the byte offset.
+void utf8iterator_next(Utf8Iterator* iter) {
+  iter->_start += iter->_width;
+  // We update positions based on the *last* character read, so that the first
+  // character following a newline is at column 1 in the next line.
+  update_position(iter);
+  if (iter->_start < iter->_end) {
+    read_char(iter);
+  } else {  // EOF
+    iter->_current = -1;
+    // Without this the stale width of the last character would push _start
+    // (and the offset) further past _end on every additional call.
+    iter->_width = 0;
+  }
+}
+
+int utf8iterator_current(const Utf8Iterator* iter) {
+  return iter->_current;  // Decoded code point under the cursor; -1 at end of input.
+}
+
+void utf8iterator_get_position(
+    const Utf8Iterator* iter, GumboSourcePosition* output) {
+  *output = iter->_pos;  // Copies the whole position struct into *output.
+}
+
+const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
+  return iter->_start;  // Points into the caller's source buffer; nothing is copied.
+}
+
+// If the bytes at the current position equal the first 'length' bytes of
+// 'prefix' (compared with strncmp, or strncasecmp when case_sensitive is
+// false), advances the iterator 'length' times and returns true.  Otherwise
+// returns false and leaves the iterator untouched.  Also returns false when
+// fewer than 'length' bytes remain in the input.
+bool utf8iterator_maybe_consume_match(
+    Utf8Iterator* iter, const char* prefix, size_t length,
+    bool case_sensitive) {
+  // Compare against the number of bytes left instead of forming
+  // _start + length, which may point beyond the end of the buffer (undefined
+  // pointer arithmetic) for long prefixes near the end of the input.
+  const ptrdiff_t remaining = iter->_end - iter->_start;
+  if (remaining < 0 || length > (size_t) remaining) {
+    return false;
+  }
+
+  const bool matched = case_sensitive ?
+      !strncmp(iter->_start, prefix, length) :
+      !strncasecmp(iter->_start, prefix, length);
+  if (!matched) {
+    return false;
+  }
+
+  // One utf8iterator_next() call per prefix byte.
+  for (size_t i = 0; i < length; ++i) {
+    utf8iterator_next(iter);
+  }
+  return true;
+}
+
+void utf8iterator_mark(Utf8Iterator* iter) {  // Saves the current location as the mark.
+  iter->_mark = iter->_start;    // Byte pointer of the current code point.
+  iter->_mark_pos = iter->_pos;  // Source position that goes with it.
+}
+
+// Returns the current input stream position to the mark and decodes the code point found there.
+void utf8iterator_reset(Utf8Iterator* iter) {
+  iter->_start = iter->_mark;
+  iter->_pos = iter->_mark_pos;
+  read_char(iter);  // Refreshes _current and _width for the restored position.
+}
+
+// Sets the position and original text fields of an error to the value at the
+// mark.  No other field of *error is written.
+void utf8iterator_fill_error_at_mark(
+    Utf8Iterator* iter, GumboError* error) {
+  error->position = iter->_mark_pos;
+  error->original_text = iter->_mark;
+}

+ 127 - 0
gumbo/utf8.h

@@ -0,0 +1,127 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+//
+// This contains an implementation of a UTF8 iterator and decoder suitable for
+// an HTML5 parser.  This does a bit more than straight UTF-8 decoding.  The
+// HTML5 spec specifies that:
+// 1. Decoding errors are parse errors.
+// 2. Certain other codepoints (eg. control characters) are parse errors.
+// 3. Carriage returns and CR/LF groups are converted to line feeds.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
+//
+// Also, we want to keep track of source positions for error handling.  As a
+// result, we fold all that functionality into this decoder, and can't use an
+// off-the-shelf library.
+//
+// This header is internal-only, which is why we prefix functions with only
+// utf8_ or utf8iterator_ instead of gumbo_utf8_.
+
+#ifndef GUMBO_UTF8_H_
+#define GUMBO_UTF8_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "gumbo.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct GumboInternalError;
+struct GumboInternalParser;
+
+// Unicode replacement char.
+extern const int kUtf8ReplacementChar;
+
+typedef struct GumboInternalUtf8Iterator {
+  // Points at the start of the code point most recently read into 'current'.
+  const char* _start;
+
+  // Points at the mark.  The mark is initially set to the beginning of the
+  // input.
+  const char* _mark;
+
+  // Points past the end of the iter, like a past-the-end iterator in the STL.
+  const char* _end;
+
+  // The code point under the cursor, or -1 once the end of the input is reached.
+  int _current;
+
+  // The width in bytes of the current code point.
+  int _width;
+
+  // The SourcePosition for the current location.
+  GumboSourcePosition _pos;
+
+  // The SourcePosition for the mark.
+  GumboSourcePosition _mark_pos;
+
+  // Pointer back to the GumboParser instance, for configuration options and
+  // error recording.
+  struct GumboInternalParser* _parser;
+} Utf8Iterator;
+
+// Returns true if this Unicode code point is in the list of characters
+// forbidden by the HTML5 spec, such as undefined control chars.  NUL (0) is
+// not reported by this function.
+bool utf8_is_invalid_code_point(int c);
+
+// Initializes a new Utf8Iterator from the given byte buffer.  The source does
+// not have to be NUL-terminated, but the length must be passed in explicitly.
+void utf8iterator_init(
+    struct GumboInternalParser* parser, const char* source,
+    size_t source_length, Utf8Iterator* iter);
+
+// Advances the current position by one code point.
+void utf8iterator_next(Utf8Iterator* iter);
+
+// Returns the current code point as an integer.
+int utf8iterator_current(const Utf8Iterator* iter);
+
+// Retrieves and fills the output parameter with the current source position.
+void utf8iterator_get_position(
+    const Utf8Iterator* iter, GumboSourcePosition* output);
+
+// Retrieves a character pointer to the start of the current character.
+const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
+
+// If the upcoming text in the buffer matches the specified prefix (which has
+// length 'length'), consume it and return true.  Otherwise, return false with
+// no other effects.  If the length of the string would overflow the buffer,
+// this returns false.  Note that prefix should not contain null bytes because
+// of the use of strncmp/strncasecmp internally.  All existing use-cases adhere
+// to this.
+bool utf8iterator_maybe_consume_match(
+    Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive);
+
+// "Marks" a particular location of interest in the input stream, so that it can
+// later be reset() to.  There's also the ability to record an error at the
+// point that was marked, as oftentimes that's more useful than the last
+// character before the error was detected.
+void utf8iterator_mark(Utf8Iterator* iter);
+
+// Returns the current input stream position to the mark.
+void utf8iterator_reset(Utf8Iterator* iter);
+
+// Sets the position and original text fields of an error to the value at the
+// mark.
+void utf8iterator_fill_error_at_mark(
+    Utf8Iterator* iter, struct GumboInternalError* error);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // GUMBO_UTF8_H_

+ 58 - 0
gumbo/util.c

@@ -0,0 +1,58 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: [email protected] (Jonathan Tang)
+
+#include "util.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "gumbo.h"
+#include "parser.h"
+
+// TODO(jdtang): This should be elsewhere, but there's no .c file for
+// SourcePositions and yet the constant needs some linkage, so this is as good
+// as any.
+const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 };  // Every field zero.
+
+void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
+  return parser->_options->allocator(parser->_options->userdata, num_bytes);  // Result is passed back unchecked.
+}
+
+// Hands ptr to the deallocator configured in the parser's options, together
+// with the options' userdata pointer.
+void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {
+  // No 'return' here: ISO C does not allow a return statement with an
+  // expression in a function whose return type is void.
+  parser->_options->deallocator(parser->_options->userdata, ptr);
+}
+
+// Duplicates a NUL-terminated string into a buffer obtained from the parser's
+// allocator and returns that buffer.
+char* gumbo_copy_stringz(GumboParser* parser, const char* str) {
+  // Size of the copy, terminator included.
+  const size_t num_bytes = strlen(str) + 1;
+  char* copy = gumbo_parser_allocate(parser, num_bytes);
+  memcpy(copy, str, num_bytes);
+  return copy;
+}
+
+// printf-style trace helper for the parser.  It only produces output when the
+// library is compiled with GUMBO_DEBUG defined (pass --copts=-DGUMBO_DEBUG);
+// otherwise it is a no-op.
+void gumbo_debug(const char* format, ...) {
+#ifdef GUMBO_DEBUG
+  va_list ap;
+  va_start(ap, format);
+  vfprintf(stdout, format, ap);
+  va_end(ap);
+  // Flush right away so the trace is visible as soon as it is written.
+  fflush(stdout);
+#else
+  (void) format;
+#endif
+}

+ 58 - 0
gumbo/util.h

@@ -0,0 +1,58 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jdtang@google.com (Jonathan Tang)
+//
+// This contains some utility functions that didn't fit into any of the other
+// headers.
+
+#ifndef GUMBO_UTIL_H_
+#define GUMBO_UTIL_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declaration since it's passed into some of the functions in this
+// header.
+struct GumboInternalParser;
+
+// Utility function for allocating & copying a null-terminated string into a
+// freshly-allocated buffer obtained through gumbo_parser_allocate.  This is
+// necessary for proper memory management; we have the convention that all
+// const char* in parse tree structures are freshly-allocated, so if we didn't
+// copy, we'd try to delete a literal string when the parse tree is destroyed.
+// str must be a valid null-terminated string (its length is taken with
+// strlen); the returned copy includes the terminating null.
+char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str);
+
+// Allocate a chunk of num_bytes bytes of memory, using the allocator specified
+// in the Parser's config options.  The allocator is handed the options'
+// userdata pointer, and its return value is returned unchanged.
+void* gumbo_parser_allocate(
+    struct GumboInternalParser* parser, size_t num_bytes);
+
+// Deallocate a chunk of memory, using the deallocator specified in the Parser's
+// config options.  The deallocator is handed the options' userdata pointer
+// together with ptr.
+void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr);
+
+// Debug wrapper for printf, to make it easier to turn off debugging info when
+// required.  Takes a printf-style format string and arguments; output is only
+// produced when the library is compiled with GUMBO_DEBUG defined, otherwise
+// the call does nothing.
+void gumbo_debug(const char* format, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_UTIL_H_

+ 123 - 0
gumbo/vector.c

@@ -0,0 +1,123 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jdtang@google.com (Jonathan Tang)
+
+#include "vector.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include "util.h"
+
+struct GumboInternalParser;
+
+// A GumboVector constant whose members are all NULL/zero, i.e. a vector that
+// holds nothing and owns no storage.
+const GumboVector kGumboEmptyVector = { NULL, 0, 0 };
+
+// Sets *vector up as an empty vector with room for initial_capacity elements.
+// With a capacity of zero nothing is allocated and data is set to NULL.
+void gumbo_vector_init(
+    struct GumboInternalParser* parser, size_t initial_capacity, GumboVector* vector) {
+  vector->length = 0;
+  vector->capacity = initial_capacity;
+  vector->data = (initial_capacity > 0)
+      ? gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity)
+      : NULL;
+}
+
+// Releases the element array owned by *vector.  The elements themselves are
+// not freed, and a vector with zero capacity has no array to release.
+void gumbo_vector_destroy(struct GumboInternalParser* parser, GumboVector* vector) {
+  if (vector->capacity == 0) {
+    return;
+  }
+  gumbo_parser_deallocate(parser, vector->data);
+}
+
+// Makes sure *vector has room for at least one more element.  A full vector
+// gets its capacity doubled: a new array is allocated, the old array's
+// contents are copied over and the old array is released.  A vector with zero
+// capacity gets a fresh array of two slots.
+static void enlarge_vector_if_full(
+    struct GumboInternalParser* parser, GumboVector* vector) {
+  if (vector->length < vector->capacity) {
+    return;  // Still has a free slot.
+  }
+  if (!vector->capacity) {
+    // 0-capacity vector; no previous array to copy or deallocate.
+    vector->capacity = 2;
+    vector->data = gumbo_parser_allocate(
+        parser, sizeof(void*) * vector->capacity);
+    return;
+  }
+  // Remember the size of the old array before the capacity changes.
+  const size_t bytes_in_use = sizeof(void*) * vector->capacity;
+  vector->capacity *= 2;
+  void** grown = gumbo_parser_allocate(
+      parser, sizeof(void*) * vector->capacity);
+  memcpy(grown, vector->data, bytes_in_use);
+  gumbo_parser_deallocate(parser, vector->data);
+  vector->data = grown;
+}
+
+// Appends element to the end of *vector, enlarging the storage first if it is
+// full.
+void gumbo_vector_add(
+    struct GumboInternalParser* parser, void* element, GumboVector* vector) {
+  enlarge_vector_if_full(parser, vector);
+  assert(vector->data);
+  assert(vector->length < vector->capacity);
+  vector->data[vector->length] = element;
+  vector->length += 1;
+}
+
+// Removes the last element of *vector and returns it, or returns NULL when the
+// vector is empty.  The parser argument is not used.
+void* gumbo_vector_pop(
+    struct GumboInternalParser* parser, GumboVector* vector) {
+  if (vector->length != 0) {
+    vector->length -= 1;
+    return vector->data[vector->length];
+  }
+  return NULL;
+}
+
+// Returns the index of the first slot in *vector that holds the pointer
+// element (slots are compared by pointer value), or -1 if no slot holds it.
+int gumbo_vector_index_of(GumboVector* vector, void* element) {
+  // The counter is unsigned so that the loop condition is not a mixed
+  // signed/unsigned comparison and the counter cannot overflow as a signed
+  // int.  NOTE(review): assumes GumboVector.length is an unsigned int, as
+  // declared in gumbo.h -- confirm.
+  for (unsigned int i = 0; i < vector->length; ++i) {
+    if (vector->data[i] == element) {
+      return (int) i;
+    }
+  }
+  return -1;
+}
+
+// Inserts element so that it ends up at position index, moving every element
+// from index onwards one slot towards the end.  index may equal the current
+// length, in which case the element lands after the last one.
+void gumbo_vector_insert_at(
+    struct GumboInternalParser* parser, void* element, int index,
+    GumboVector* vector) {
+  assert(index >= 0);
+  assert(index <= vector->length);
+  enlarge_vector_if_full(parser, vector);
+  // Taken after the enlarge call, which may have replaced the data array.
+  void** slot = &vector->data[index];
+  ++vector->length;
+  // Open a gap at index; the count is the number of elements that were at
+  // index or later before the length was bumped.
+  memmove(slot + 1, slot, sizeof(void*) * (vector->length - index - 1));
+  *slot = element;
+}
+
+// Removes the first occurrence of node from *vector; does nothing when node is
+// not in the vector.
+void gumbo_vector_remove(
+    struct GumboInternalParser* parser, void* node, GumboVector* vector) {
+  const int position = gumbo_vector_index_of(vector, node);
+  if (position != -1) {
+    gumbo_vector_remove_at(parser, position, vector);
+  }
+}
+
+// Removes the element at index, closing the gap by moving the later elements
+// one slot towards the front, and returns the removed element.
+void* gumbo_vector_remove_at(
+    struct GumboInternalParser* parser, int index, GumboVector* vector) {
+  assert(index >= 0);
+  assert(index < vector->length);
+  void** slot = &vector->data[index];
+  void* removed = *slot;
+  memmove(slot, slot + 1, sizeof(void*) * (vector->length - index - 1));
+  --vector->length;
+  return removed;
+}

+ 69 - 0
gumbo/vector.h

@@ -0,0 +1,69 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jdtang@google.com (Jonathan Tang)
+
+#ifndef GUMBO_VECTOR_H_
+#define GUMBO_VECTOR_H_
+
+#include "gumbo.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declaration since it's passed into some of the functions in this
+// header.
+struct GumboInternalParser;
+
+// Initializes a new GumboVector with the specified initial capacity.  The
+// vector starts out with length 0; when initial_capacity is 0 no storage is
+// allocated and the data pointer is set to NULL.
+void gumbo_vector_init(
+    struct GumboInternalParser* parser, size_t initial_capacity,
+    GumboVector* vector);
+
+// Frees the element array used by a GumboVector.  Does not free the contained
+// pointers, nor the GumboVector struct itself.
+void gumbo_vector_destroy(
+    struct GumboInternalParser* parser, GumboVector* vector);
+
+// Adds a new element to the end of a GumboVector, enlarging its storage first
+// if it is full.
+void gumbo_vector_add(
+    struct GumboInternalParser* parser, void* element, GumboVector* vector);
+
+// Removes and returns the last element of the GumboVector (the one most
+// recently added with gumbo_vector_add).  Ownership is transferred to caller.
+// Capacity is unchanged.  If the vector is empty, NULL is returned.  The
+// parser argument is not used by the implementation.
+void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
+
+// Inserts an element at a specific index.  This is potentially O(N) time, but
+// is necessary for some of the spec's behavior.  index must satisfy
+// 0 <= index <= length (checked with assert); the elements at index and
+// beyond are moved up by one slot.
+void gumbo_vector_insert_at(
+    struct GumboInternalParser* parser, void* element, int index,
+    GumboVector* vector);
+
+// Removes an element from the vector, or does nothing if the element is not in
+// the vector.  Elements are compared by pointer value and only the first
+// match is removed.
+void gumbo_vector_remove(
+    struct GumboInternalParser* parser, void* element, GumboVector* vector);
+
+// Removes and returns an element at a specific index.  Note that this is
+// potentially O(N) time and should be used sparingly.  index must satisfy
+// 0 <= index < length (checked with assert); the elements after index are
+// moved down by one slot.
+void* gumbo_vector_remove_at(
+    struct GumboInternalParser* parser, int index, GumboVector* vector);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // GUMBO_VECTOR_H_