parser.c 172 KB


  1. // Copyright 2010 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: [email protected] (Jonathan Tang)
  16. #include <assert.h>
  17. #include <ctype.h>
  18. #include <stdarg.h>
  19. #include <stdlib.h>
  20. #include <string.h>
  21. #include <strings.h>
  22. #include "attribute.h"
  23. #include "error.h"
  24. #include "gumbo.h"
  25. #include "insertion_mode.h"
  26. #include "parser.h"
  27. #include "tokenizer.h"
  28. #include "tokenizer_states.h"
  29. #include "utf8.h"
  30. #include "util.h"
  31. #include "vector.h"
  32. #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
  33. #define GUMBO_STRING(literal) \
  34. { literal, sizeof(literal) - 1 }
  35. #define TERMINATOR \
  36. { "", 0 }
  37. typedef char gumbo_tagset[GUMBO_TAG_LAST];
  38. #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
  39. #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
  40. #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
  41. #define TAGSET_INCLUDES(tagset, namespace, tag) \
  42. (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace))
  43. // selected forward declarations as it is getting hard to find
  44. // an appropriate order
  45. static bool node_html_tag_is(const GumboNode*, GumboTag);
  46. static GumboInsertionMode get_current_template_insertion_mode(
  47. const GumboParser*);
  48. static bool handle_in_template(GumboParser*, GumboToken*);
  49. static void destroy_node(GumboParser*, GumboNode*);
  50. static void* malloc_wrapper(void* unused, size_t size) { return malloc(size); }
  51. static void free_wrapper(void* unused, void* ptr) { free(ptr); }
  52. const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL,
  53. 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML};
  54. static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
  55. static const GumboStringPiece kPublicIdHtml4_0 =
  56. GUMBO_STRING("-//W3C//DTD HTML 4.0//EN");
  57. static const GumboStringPiece kPublicIdHtml4_01 =
  58. GUMBO_STRING("-//W3C//DTD HTML 4.01//EN");
  59. static const GumboStringPiece kPublicIdXhtml1_0 =
  60. GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
  61. static const GumboStringPiece kPublicIdXhtml1_1 =
  62. GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN");
  63. static const GumboStringPiece kSystemIdRecHtml4_0 =
  64. GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
  65. static const GumboStringPiece kSystemIdHtml4 =
  66. GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd");
  67. static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
  68. GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
  69. static const GumboStringPiece kSystemIdXhtml1_1 =
  70. GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
  71. static const GumboStringPiece kSystemIdLegacyCompat =
  72. GUMBO_STRING("about:legacy-compat");
  73. // The doctype arrays have an explicit terminator because we want to pass them
  74. // to a helper function, and passing them as a pointer discards sizeof
  75. // information. The SVG arrays are used only by one-off functions, and so loops
  76. // over them use sizeof directly instead of a terminator.
  77. static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
  78. GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
  79. GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
  80. GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
  81. GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
  82. GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
  83. GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
  84. GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
  85. GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
  86. GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
  87. GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
  88. GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
  89. GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
  90. GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
  91. GUMBO_STRING("-//IETF//DTD HTML 3//"),
  92. GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
  93. GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
  94. GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
  95. GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
  96. GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
  97. GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
  98. GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
  99. GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
  100. GUMBO_STRING("-//IETF//DTD HTML Strict//"),
  101. GUMBO_STRING("-//IETF//DTD HTML//"),
  102. GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
  103. GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
  104. GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
  105. GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
  106. GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
  107. GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
  108. GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
  109. GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
  110. GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
  111. GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
  112. GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
  113. GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
  114. GUMBO_STRING(
  115. "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
  116. "extensions to HTML 4.0//"),
  117. GUMBO_STRING(
  118. "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
  119. "extensions to HTML 4.0//"),
  120. GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
  121. GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
  122. GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
  123. GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
  124. GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
  125. GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
  126. GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
  127. GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
  128. GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
  129. GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
  130. GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
  131. GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
  132. GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
  133. GUMBO_STRING("-//W3C//DTD W3 HTML//"),
  134. GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
  135. GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
  136. GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR};
  137. static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
  138. GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
  139. GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"),
  140. TERMINATOR};
  141. static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
  142. GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
  143. TERMINATOR};
  144. static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
  145. GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
  146. GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR};
  147. static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] =
  148. {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
  149. GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR};
  150. // Indexed by GumboNamespaceEnum; keep in sync with that.
  151. static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml",
  152. "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"};
  153. typedef struct _ReplacementEntry {
  154. const GumboStringPiece from;
  155. const GumboStringPiece to;
  156. } ReplacementEntry;
  157. #define REPLACEMENT_ENTRY(from, to) \
  158. { GUMBO_STRING(from), GUMBO_STRING(to) }
  159. // Static data for SVG attribute replacements.
  160. // https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
  161. static const ReplacementEntry kSvgAttributeReplacements[] = {
  162. REPLACEMENT_ENTRY("attributename", "attributeName"),
  163. REPLACEMENT_ENTRY("attributetype", "attributeType"),
  164. REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
  165. REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
  166. REPLACEMENT_ENTRY("calcmode", "calcMode"),
  167. REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
  168. // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
  169. // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
  170. REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
  171. REPLACEMENT_ENTRY("edgemode", "edgeMode"),
  172. // REPLACEMENT_ENTRY("externalresourcesrequired",
  173. // "externalResourcesRequired"),
  174. // REPLACEMENT_ENTRY("filterres", "filterRes"),
  175. REPLACEMENT_ENTRY("filterunits", "filterUnits"),
  176. REPLACEMENT_ENTRY("glyphref", "glyphRef"),
  177. REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
  178. REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
  179. REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
  180. REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
  181. REPLACEMENT_ENTRY("keypoints", "keyPoints"),
  182. REPLACEMENT_ENTRY("keysplines", "keySplines"),
  183. REPLACEMENT_ENTRY("keytimes", "keyTimes"),
  184. REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
  185. REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
  186. REPLACEMENT_ENTRY("markerheight", "markerHeight"),
  187. REPLACEMENT_ENTRY("markerunits", "markerUnits"),
  188. REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
  189. REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
  190. REPLACEMENT_ENTRY("maskunits", "maskUnits"),
  191. REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
  192. REPLACEMENT_ENTRY("pathlength", "pathLength"),
  193. REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
  194. REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
  195. REPLACEMENT_ENTRY("patternunits", "patternUnits"),
  196. REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
  197. REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
  198. REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
  199. REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
  200. REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
  201. REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
  202. REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"),
  203. REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
  204. REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
  205. REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
  206. REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
  207. REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
  208. REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
  209. REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
  210. REPLACEMENT_ENTRY("startoffset", "startOffset"),
  211. REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
  212. REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
  213. REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
  214. REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
  215. REPLACEMENT_ENTRY("tablevalues", "tableValues"),
  216. REPLACEMENT_ENTRY("targetx", "targetX"),
  217. REPLACEMENT_ENTRY("targety", "targetY"),
  218. REPLACEMENT_ENTRY("textlength", "textLength"),
  219. REPLACEMENT_ENTRY("viewbox", "viewBox"),
  220. REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
  221. REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
  222. REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
  223. REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
  224. };
  225. static const ReplacementEntry kSvgTagReplacements[] = {
  226. REPLACEMENT_ENTRY("altglyph", "altGlyph"),
  227. REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
  228. REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
  229. REPLACEMENT_ENTRY("animatecolor", "animateColor"),
  230. REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
  231. REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
  232. REPLACEMENT_ENTRY("clippath", "clipPath"),
  233. REPLACEMENT_ENTRY("feblend", "feBlend"),
  234. REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
  235. REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
  236. REPLACEMENT_ENTRY("fecomposite", "feComposite"),
  237. REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
  238. REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
  239. REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
  240. REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
  241. REPLACEMENT_ENTRY("feflood", "feFlood"),
  242. REPLACEMENT_ENTRY("fefunca", "feFuncA"),
  243. REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
  244. REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
  245. REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
  246. REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
  247. REPLACEMENT_ENTRY("feimage", "feImage"),
  248. REPLACEMENT_ENTRY("femerge", "feMerge"),
  249. REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
  250. REPLACEMENT_ENTRY("femorphology", "feMorphology"),
  251. REPLACEMENT_ENTRY("feoffset", "feOffset"),
  252. REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
  253. REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
  254. REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
  255. REPLACEMENT_ENTRY("fetile", "feTile"),
  256. REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
  257. REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
  258. REPLACEMENT_ENTRY("glyphref", "glyphRef"),
  259. REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
  260. REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
  261. REPLACEMENT_ENTRY("textpath", "textPath"),
  262. };
  263. typedef struct _NamespacedAttributeReplacement {
  264. const char* from;
  265. const char* local_name;
  266. const GumboAttributeNamespaceEnum attr_namespace;
  267. } NamespacedAttributeReplacement;
  268. static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
  269. {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
  270. {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
  271. {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
  272. {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
  273. {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
  274. {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
  275. {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
  276. {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
  277. {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
  278. {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
  279. {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
  280. {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
  281. };
  282. // The "scope marker" for the list of active formatting elements. We use a
  283. // pointer to this as a generic marker element, since the particular element
  284. // scope doesn't matter.
  285. static const GumboNode kActiveFormattingScopeMarker;
  286. // The tag_is and tag_in function use true & false to denote start & end tags,
  287. // but for readability, we define constants for them here.
  288. static const bool kStartTag = true;
  289. static const bool kEndTag = false;
  290. // Because GumboStringPieces are immutable, we can't insert a character directly
  291. // into a text node. Instead, we accumulate all pending characters here and
  292. // flush them out to a text node whenever a new element is inserted.
  293. //
  294. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
  295. typedef struct _TextNodeBufferState {
  296. // The accumulated text to be inserted into the current text node.
  297. GumboStringBuffer _buffer;
  298. // A pointer to the original text represented by this text node. Note that
  299. // because of foster parenting and other strange DOM manipulations, this may
  300. // include other non-text HTML tags in it; it is defined as the span of
  301. // original text from the first character in this text node to the last
  302. // character in this text node.
  303. const char* _start_original_text;
  304. // The source position of the start of this text node.
  305. GumboSourcePosition _start_position;
  306. // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
  307. GumboNodeType _type;
  308. } TextNodeBufferState;
  309. typedef struct GumboInternalParserState {
  310. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
  311. GumboInsertionMode _insertion_mode;
  312. // Used for run_generic_parsing_algorithm, which needs to switch back to the
  313. // original insertion mode at its conclusion.
  314. GumboInsertionMode _original_insertion_mode;
  315. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
  316. GumboVector /*GumboNode*/ _open_elements;
  317. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
  318. GumboVector /*GumboNode*/ _active_formatting_elements;
  319. // The stack of template insertion modes.
  320. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
  321. GumboVector /*InsertionMode*/ _template_insertion_modes;
  322. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
  323. GumboNode* _head_element;
  324. GumboNode* _form_element;
  325. // The element used as fragment context when parsing in fragment mode
  326. GumboNode* _fragment_ctx;
  327. // The flag for when the spec says "Reprocess the current token in..."
  328. bool _reprocess_current_token;
  329. // The flag for "acknowledge the token's self-closing flag".
  330. bool _self_closing_flag_acknowledged;
  331. // The "frameset-ok" flag from the spec.
  332. bool _frameset_ok;
  333. // The flag for "If the next token is a LINE FEED, ignore that token...".
  334. bool _ignore_next_linefeed;
  335. // The flag for "whenever a node would be inserted into the current node, it
  336. // must instead be foster parented". This is used for misnested table
  337. // content, which needs to be handled according to "in body" rules yet foster
  338. // parented outside of the table.
  339. // It would perhaps be more explicit to have this as a parameter to
  340. // handle_in_body and insert_element, but given how special-purpose this is
  341. // and the number of call-sites that would need to take the extra parameter,
  342. // it's easier just to have a state flag.
  343. bool _foster_parent_insertions;
  344. // The accumulated text node buffer state.
  345. TextNodeBufferState _text_node;
  346. // The current token.
  347. GumboToken* _current_token;
  348. // The way that the spec is written, the </body> and </html> tags are *always*
  349. // implicit, because encountering one of those tokens merely switches the
  350. // insertion mode out of "in body". So we have individual state flags for
  351. // those end tags that are then inspected by pop_current_node when the <body>
  352. // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
  353. // flag appropriately.
  354. bool _closed_body_tag;
  355. bool _closed_html_tag;
  356. } GumboParserState;
  357. static bool token_has_attribute(const GumboToken* token, const char* name) {
  358. assert(token->type == GUMBO_TOKEN_START_TAG);
  359. return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
  360. }
  361. // Checks if the value of the specified attribute is a case-insensitive match
  362. // for the specified string.
  363. static bool attribute_matches(
  364. const GumboVector* attributes, const char* name, const char* value) {
  365. const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
  366. return attr ? strcasecmp(value, attr->value) == 0 : false;
  367. }
  368. // Checks if the value of the specified attribute is a case-sensitive match
  369. // for the specified string.
  370. static bool attribute_matches_case_sensitive(
  371. const GumboVector* attributes, const char* name, const char* value) {
  372. const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
  373. return attr ? strcmp(value, attr->value) == 0 : false;
  374. }
  375. // Checks if the specified attribute vectors are identical.
  376. static bool all_attributes_match(
  377. const GumboVector* attr1, const GumboVector* attr2) {
  378. unsigned int num_unmatched_attr2_elements = attr2->length;
  379. for (unsigned int i = 0; i < attr1->length; ++i) {
  380. const GumboAttribute* attr = attr1->data[i];
  381. if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
  382. --num_unmatched_attr2_elements;
  383. } else {
  384. return false;
  385. }
  386. }
  387. return num_unmatched_attr2_elements == 0;
  388. }
  389. static void set_frameset_not_ok(GumboParser* parser) {
  390. gumbo_debug("Setting frameset_ok to false.\n");
  391. parser->_parser_state->_frameset_ok = false;
  392. }
  393. static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
  394. GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
  395. node->parent = NULL;
  396. node->index_within_parent = -1;
  397. node->type = type;
  398. node->parse_flags = GUMBO_INSERTION_NORMAL;
  399. return node;
  400. }
  401. static GumboNode* new_document_node(GumboParser* parser) {
  402. GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
  403. document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
  404. gumbo_vector_init(parser, 1, &document_node->v.document.children);
  405. // Must be initialized explicitly, as there's no guarantee that we'll see a
  406. // doc type token.
  407. GumboDocument* document = &document_node->v.document;
  408. document->has_doctype = false;
  409. document->name = NULL;
  410. document->public_identifier = NULL;
  411. document->system_identifier = NULL;
  412. return document_node;
  413. }
  414. static void output_init(GumboParser* parser) {
  415. GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
  416. output->root = NULL;
  417. output->document = new_document_node(parser);
  418. parser->_output = output;
  419. gumbo_init_errors(parser);
  420. }
  421. static void parser_state_init(GumboParser* parser) {
  422. GumboParserState* parser_state =
  423. gumbo_parser_allocate(parser, sizeof(GumboParserState));
  424. parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
  425. parser_state->_reprocess_current_token = false;
  426. parser_state->_frameset_ok = true;
  427. parser_state->_ignore_next_linefeed = false;
  428. parser_state->_foster_parent_insertions = false;
  429. parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
  430. gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
  431. gumbo_vector_init(parser, 10, &parser_state->_open_elements);
  432. gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
  433. gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
  434. parser_state->_head_element = NULL;
  435. parser_state->_form_element = NULL;
  436. parser_state->_fragment_ctx = NULL;
  437. parser_state->_current_token = NULL;
  438. parser_state->_closed_body_tag = false;
  439. parser_state->_closed_html_tag = false;
  440. parser->_parser_state = parser_state;
  441. }
  442. static void parser_state_destroy(GumboParser* parser) {
  443. GumboParserState* state = parser->_parser_state;
  444. if (state->_fragment_ctx) {
  445. destroy_node(parser, state->_fragment_ctx);
  446. }
  447. gumbo_vector_destroy(parser, &state->_active_formatting_elements);
  448. gumbo_vector_destroy(parser, &state->_open_elements);
  449. gumbo_vector_destroy(parser, &state->_template_insertion_modes);
  450. gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
  451. gumbo_parser_deallocate(parser, state);
  452. }
  453. static GumboNode* get_document_node(GumboParser* parser) {
  454. return parser->_output->document;
  455. }
  456. static bool is_fragment_parser(const GumboParser* parser) {
  457. return !!parser->_parser_state->_fragment_ctx;
  458. }
  459. // Returns the node at the bottom of the stack of open elements, or NULL if no
  460. // elements have been added yet.
  461. static GumboNode* get_current_node(GumboParser* parser) {
  462. GumboVector* open_elements = &parser->_parser_state->_open_elements;
  463. if (open_elements->length == 0) {
  464. assert(!parser->_output->root);
  465. return NULL;
  466. }
  467. assert(open_elements->length > 0);
  468. assert(open_elements->data != NULL);
  469. return open_elements->data[open_elements->length - 1];
  470. }
  471. static GumboNode* get_adjusted_current_node(GumboParser* parser) {
  472. GumboParserState* state = parser->_parser_state;
  473. if (state->_open_elements.length == 1 && state->_fragment_ctx) {
  474. return state->_fragment_ctx;
  475. }
  476. return get_current_node(parser);
  477. }
  478. // Returns true if the given needle is in the given array of literal
  479. // GumboStringPieces. If exact_match is true, this requires that they match
  480. // exactly; otherwise, this performs a prefix match to check if any of the
  481. // elements in haystack start with needle. This always performs a
  482. // case-insensitive match.
  483. static bool is_in_static_list(
  484. const char* needle, const GumboStringPiece* haystack, bool exact_match) {
  485. for (unsigned int i = 0; haystack[i].length > 0; ++i) {
  486. if ((exact_match && !strcmp(needle, haystack[i].data)) ||
  487. (!exact_match && !strcasecmp(needle, haystack[i].data))) {
  488. return true;
  489. }
  490. }
  491. return false;
  492. }
  493. static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
  494. parser->_parser_state->_insertion_mode = mode;
  495. }
  496. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
  497. // This is a helper function that returns the appropriate insertion mode instead
  498. // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
  499. // indicate that there is no appropriate insertion mode, and the loop should
  500. // continue.
  501. static GumboInsertionMode get_appropriate_insertion_mode(
  502. const GumboParser* parser, int index) {
  503. const GumboVector* open_elements = &parser->_parser_state->_open_elements;
  504. const GumboNode* node = open_elements->data[index];
  505. const bool is_last = index == 0;
  506. if (is_last && is_fragment_parser(parser)) {
  507. node = parser->_parser_state->_fragment_ctx;
  508. }
  509. assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
  510. if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML)
  511. return is_last ?
  512. GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
  513. switch (node->v.element.tag) {
  514. case GUMBO_TAG_SELECT: {
  515. if (is_last) {
  516. return GUMBO_INSERTION_MODE_IN_SELECT;
  517. }
  518. for (int i = index; i > 0; --i) {
  519. const GumboNode* ancestor = open_elements->data[i];
  520. if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
  521. return GUMBO_INSERTION_MODE_IN_SELECT;
  522. }
  523. if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
  524. return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
  525. }
  526. }
  527. return GUMBO_INSERTION_MODE_IN_SELECT;
  528. }
  529. case GUMBO_TAG_TD:
  530. case GUMBO_TAG_TH:
  531. if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
  532. break;
  533. case GUMBO_TAG_TR:
  534. return GUMBO_INSERTION_MODE_IN_ROW;
  535. case GUMBO_TAG_TBODY:
  536. case GUMBO_TAG_THEAD:
  537. case GUMBO_TAG_TFOOT:
  538. return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
  539. case GUMBO_TAG_CAPTION:
  540. return GUMBO_INSERTION_MODE_IN_CAPTION;
  541. case GUMBO_TAG_COLGROUP:
  542. return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
  543. case GUMBO_TAG_TABLE:
  544. return GUMBO_INSERTION_MODE_IN_TABLE;
  545. case GUMBO_TAG_TEMPLATE:
  546. return get_current_template_insertion_mode(parser);
  547. case GUMBO_TAG_HEAD:
  548. if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
  549. break;
  550. case GUMBO_TAG_BODY:
  551. return GUMBO_INSERTION_MODE_IN_BODY;
  552. case GUMBO_TAG_FRAMESET:
  553. return GUMBO_INSERTION_MODE_IN_FRAMESET;
  554. case GUMBO_TAG_HTML:
  555. return parser->_parser_state->_head_element
  556. ? GUMBO_INSERTION_MODE_AFTER_HEAD
  557. : GUMBO_INSERTION_MODE_BEFORE_HEAD;
  558. default:
  559. break;
  560. }
  561. return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
  562. }
  563. // This performs the actual "reset the insertion mode" loop.
  564. static void reset_insertion_mode_appropriately(GumboParser* parser) {
  565. const GumboVector* open_elements = &parser->_parser_state->_open_elements;
  566. for (int i = open_elements->length; --i >= 0;) {
  567. GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
  568. if (mode != GUMBO_INSERTION_MODE_INITIAL) {
  569. set_insertion_mode(parser, mode);
  570. return;
  571. }
  572. }
  573. // Should never get here, because is_last will be set on the last iteration
  574. // and will force GUMBO_INSERTION_MODE_IN_BODY.
  575. assert(0);
  576. }
  577. static GumboError* parser_add_parse_error(
  578. GumboParser* parser, const GumboToken* token) {
  579. gumbo_debug("Adding parse error.\n");
  580. GumboError* error = gumbo_add_error(parser);
  581. if (!error) {
  582. return NULL;
  583. }
  584. error->type = GUMBO_ERR_PARSER;
  585. error->position = token->position;
  586. error->original_text = token->original_text.data;
  587. GumboParserError* extra_data = &error->v.parser;
  588. extra_data->input_type = token->type;
  589. extra_data->input_tag = GUMBO_TAG_UNKNOWN;
  590. if (token->type == GUMBO_TOKEN_START_TAG) {
  591. extra_data->input_tag = token->v.start_tag.tag;
  592. } else if (token->type == GUMBO_TOKEN_END_TAG) {
  593. extra_data->input_tag = token->v.end_tag;
  594. }
  595. GumboParserState* state = parser->_parser_state;
  596. extra_data->parser_state = state->_insertion_mode;
  597. gumbo_vector_init(
  598. parser, state->_open_elements.length, &extra_data->tag_stack);
  599. for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
  600. const GumboNode* node = state->_open_elements.data[i];
  601. assert(
  602. node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
  603. gumbo_vector_add(
  604. parser, (void*) node->v.element.tag, &extra_data->tag_stack);
  605. }
  606. return error;
  607. }
  608. // Returns true if the specified token is either a start or end tag (specified
  609. // by is_start) with one of the tag types in the varargs list. Terminate the
  610. // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
  611. // the spec references tags that are not in the spec.
  612. static bool tag_in(
  613. const GumboToken* token, bool is_start, const gumbo_tagset tags) {
  614. GumboTag token_tag;
  615. if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
  616. token_tag = token->v.start_tag.tag;
  617. } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
  618. token_tag = token->v.end_tag;
  619. } else {
  620. return false;
  621. }
  622. return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0);
  623. }
  624. // Like tag_in, but for the single-tag case.
  625. static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
  626. if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
  627. return token->v.start_tag.tag == tag;
  628. } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
  629. return token->v.end_tag == tag;
  630. } else {
  631. return false;
  632. }
  633. }
  634. // Like tag_in, but checks for the tag of a node, rather than a token.
  635. static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
  636. assert(node != NULL);
  637. if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
  638. return false;
  639. }
  640. return TAGSET_INCLUDES(
  641. tags, node->v.element.tag_namespace, node->v.element.tag);
  642. }
  643. // Like node_tag_in, but for the single-tag case.
  644. static bool node_qualified_tag_is(
  645. const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
  646. assert(node);
  647. return (node->type == GUMBO_NODE_ELEMENT ||
  648. node->type == GUMBO_NODE_TEMPLATE) &&
  649. node->v.element.tag == tag && node->v.element.tag_namespace == ns;
  650. }
  651. // Like node_tag_in, but for the single-tag case in the HTML namespace
  652. static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
  653. return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
  654. }
  655. static void push_template_insertion_mode(
  656. GumboParser* parser, GumboInsertionMode mode) {
  657. gumbo_vector_add(
  658. parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
  659. }
  660. static void pop_template_insertion_mode(GumboParser* parser) {
  661. gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
  662. }
  663. // Returns the current template insertion mode. If the stack of template
  664. // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
  665. static GumboInsertionMode get_current_template_insertion_mode(
  666. const GumboParser* parser) {
  667. GumboVector* template_insertion_modes =
  668. &parser->_parser_state->_template_insertion_modes;
  669. if (template_insertion_modes->length == 0) {
  670. return GUMBO_INSERTION_MODE_INITIAL;
  671. }
  672. return (GumboInsertionMode)
  673. template_insertion_modes->data[(template_insertion_modes->length - 1)];
  674. }
  675. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
  676. static bool is_mathml_integration_point(const GumboNode* node) {
  677. return node_tag_in_set(
  678. node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
  679. TAG_MATHML(MS), TAG_MATHML(MTEXT)});
  680. }
  681. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
  682. static bool is_html_integration_point(const GumboNode* node) {
  683. return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT),
  684. TAG_SVG(DESC), TAG_SVG(TITLE)}) ||
  685. (node_qualified_tag_is(
  686. node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
  687. (attribute_matches(
  688. &node->v.element.attributes, "encoding", "text/html") ||
  689. attribute_matches(&node->v.element.attributes, "encoding",
  690. "application/xhtml+xml")));
  691. }
  // This represents a place to insert a node, consisting of a target parent and
  // a child index within that parent. If the node should be inserted at the end
  // of the parent's children, index will be -1.
  typedef struct {
    // Parent node that receives the inserted node.
    GumboNode* target;
    // Child position within target, or -1 to append after the last child.
    int index;
  } InsertionLocation;
  699. InsertionLocation get_appropriate_insertion_location(
  700. GumboParser* parser, GumboNode* override_target) {
  701. InsertionLocation retval = {override_target, -1};
  702. if (retval.target == NULL) {
  703. // No override target; default to the current node, but special-case the
  704. // root node since get_current_node() assumes the stack of open elements is
  705. // non-empty.
  706. retval.target = parser->_output->root != NULL ? get_current_node(parser)
  707. : get_document_node(parser);
  708. }
  709. if (!parser->_parser_state->_foster_parent_insertions ||
  710. !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
  711. TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
  712. return retval;
  713. }
  714. // Foster-parenting case.
  715. int last_template_index = -1;
  716. int last_table_index = -1;
  717. GumboVector* open_elements = &parser->_parser_state->_open_elements;
  718. for (unsigned int i = 0; i < open_elements->length; ++i) {
  719. if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
  720. last_template_index = i;
  721. }
  722. if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
  723. last_table_index = i;
  724. }
  725. }
  726. if (last_template_index != -1 &&
  727. (last_table_index == -1 || last_template_index > last_table_index)) {
  728. retval.target = open_elements->data[last_template_index];
  729. return retval;
  730. }
  731. if (last_table_index == -1) {
  732. retval.target = open_elements->data[0];
  733. return retval;
  734. }
  735. GumboNode* last_table = open_elements->data[last_table_index];
  736. if (last_table->parent != NULL) {
  737. retval.target = last_table->parent;
  738. retval.index = last_table->index_within_parent;
  739. return retval;
  740. }
  741. retval.target = open_elements->data[last_table_index - 1];
  742. return retval;
  743. }
  744. // Appends a node to the end of its parent, setting the "parent" and
  745. // "index_within_parent" fields appropriately.
  746. static void append_node(
  747. GumboParser* parser, GumboNode* parent, GumboNode* node) {
  748. assert(node->parent == NULL);
  749. assert(node->index_within_parent == -1);
  750. GumboVector* children;
  751. if (parent->type == GUMBO_NODE_ELEMENT ||
  752. parent->type == GUMBO_NODE_TEMPLATE) {
  753. children = &parent->v.element.children;
  754. } else {
  755. assert(parent->type == GUMBO_NODE_DOCUMENT);
  756. children = &parent->v.document.children;
  757. }
  758. node->parent = parent;
  759. node->index_within_parent = children->length;
  760. gumbo_vector_add(parser, (void*) node, children);
  761. assert(node->index_within_parent < children->length);
  762. }
  763. // Inserts a node at the specified InsertionLocation, updating the
  764. // "parent" and "index_within_parent" fields of it and all its siblings.
  765. // If the index of the location is -1, this calls append_node.
  766. static void insert_node(
  767. GumboParser* parser, GumboNode* node, InsertionLocation location) {
  768. assert(node->parent == NULL);
  769. assert(node->index_within_parent == -1);
  770. GumboNode* parent = location.target;
  771. int index = location.index;
  772. if (index != -1) {
  773. GumboVector* children = NULL;
  774. if (parent->type == GUMBO_NODE_ELEMENT ||
  775. parent->type == GUMBO_NODE_TEMPLATE) {
  776. children = &parent->v.element.children;
  777. } else if (parent->type == GUMBO_NODE_DOCUMENT) {
  778. children = &parent->v.document.children;
  779. assert(children->length == 0);
  780. } else {
  781. assert(0);
  782. }
  783. assert(index >= 0);
  784. assert((unsigned int) index < children->length);
  785. node->parent = parent;
  786. node->index_within_parent = index;
  787. gumbo_vector_insert_at(parser, (void*) node, index, children);
  788. assert(node->index_within_parent < children->length);
  789. for (unsigned int i = index + 1; i < children->length; ++i) {
  790. GumboNode* sibling = children->data[i];
  791. sibling->index_within_parent = i;
  792. assert(sibling->index_within_parent < children->length);
  793. }
  794. } else {
  795. append_node(parser, parent, node);
  796. }
  797. }
  798. static void maybe_flush_text_node_buffer(GumboParser* parser) {
  799. GumboParserState* state = parser->_parser_state;
  800. TextNodeBufferState* buffer_state = &state->_text_node;
  801. if (buffer_state->_buffer.length == 0) {
  802. return;
  803. }
  804. assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
  805. buffer_state->_type == GUMBO_NODE_TEXT ||
  806. buffer_state->_type == GUMBO_NODE_CDATA);
  807. GumboNode* text_node = create_node(parser, buffer_state->_type);
  808. GumboText* text_node_data = &text_node->v.text;
  809. text_node_data->text =
  810. gumbo_string_buffer_to_string(parser, &buffer_state->_buffer);
  811. text_node_data->original_text.data = buffer_state->_start_original_text;
  812. text_node_data->original_text.length =
  813. state->_current_token->original_text.data -
  814. buffer_state->_start_original_text;
  815. text_node_data->start_pos = buffer_state->_start_position;
  816. gumbo_debug("Flushing text node buffer of %.*s.\n",
  817. (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
  818. InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
  819. if (location.target->type == GUMBO_NODE_DOCUMENT) {
  820. // The DOM does not allow Document nodes to have Text children, so per the
  821. // spec, they are dropped on the floor.
  822. destroy_node(parser, text_node);
  823. } else {
  824. insert_node(parser, text_node, location);
  825. }
  826. gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
  827. buffer_state->_type = GUMBO_NODE_WHITESPACE;
  828. assert(buffer_state->_buffer.length == 0);
  829. }
  830. static void record_end_of_element(
  831. GumboToken* current_token, GumboElement* element) {
  832. element->end_pos = current_token->position;
  833. element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG
  834. ? current_token->original_text
  835. : kGumboEmptyString;
  836. }
  837. static GumboNode* pop_current_node(GumboParser* parser) {
  838. GumboParserState* state = parser->_parser_state;
  839. maybe_flush_text_node_buffer(parser);
  840. if (state->_open_elements.length > 0) {
  841. assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
  842. gumbo_debug("Popping %s node.\n",
  843. gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
  844. }
  845. GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
  846. if (!current_node) {
  847. assert(state->_open_elements.length == 0);
  848. return NULL;
  849. }
  850. assert(current_node->type == GUMBO_NODE_ELEMENT ||
  851. current_node->type == GUMBO_NODE_TEMPLATE);
  852. bool is_closed_body_or_html_tag =
  853. (node_html_tag_is(current_node, GUMBO_TAG_BODY) &&
  854. state->_closed_body_tag) ||
  855. (node_html_tag_is(current_node, GUMBO_TAG_HTML) &&
  856. state->_closed_html_tag);
  857. if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
  858. !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
  859. !is_closed_body_or_html_tag) {
  860. current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
  861. }
  862. if (!is_closed_body_or_html_tag) {
  863. record_end_of_element(state->_current_token, &current_node->v.element);
  864. }
  865. return current_node;
  866. }
  867. static void append_comment_node(
  868. GumboParser* parser, GumboNode* node, const GumboToken* token) {
  869. maybe_flush_text_node_buffer(parser);
  870. GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
  871. comment->type = GUMBO_NODE_COMMENT;
  872. comment->parse_flags = GUMBO_INSERTION_NORMAL;
  873. comment->v.text.text = token->v.text;
  874. comment->v.text.original_text = token->original_text;
  875. comment->v.text.start_pos = token->position;
  876. append_node(parser, node, comment);
  877. }
  878. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
  879. static void clear_stack_to_table_row_context(GumboParser* parser) {
  880. while (!node_tag_in_set(get_current_node(parser),
  881. (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
  882. pop_current_node(parser);
  883. }
  884. }
  885. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
  886. static void clear_stack_to_table_context(GumboParser* parser) {
  887. while (!node_tag_in_set(get_current_node(parser),
  888. (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) {
  889. pop_current_node(parser);
  890. }
  891. }
  892. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
  893. void clear_stack_to_table_body_context(GumboParser* parser) {
  894. while (!node_tag_in_set(get_current_node(parser),
  895. (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
  896. TAG(TEMPLATE)})) {
  897. pop_current_node(parser);
  898. }
  899. }
  900. // Creates a parser-inserted element in the HTML namespace and returns it.
  901. static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
  902. GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
  903. GumboElement* element = &node->v.element;
  904. gumbo_vector_init(parser, 1, &element->children);
  905. gumbo_vector_init(parser, 0, &element->attributes);
  906. element->tag = tag;
  907. element->tag_namespace = GUMBO_NAMESPACE_HTML;
  908. element->original_tag = kGumboEmptyString;
  909. element->original_end_tag = kGumboEmptyString;
  910. element->start_pos = (parser->_parser_state->_current_token)
  911. ? parser->_parser_state->_current_token->position
  912. : kGumboEmptySourcePosition;
  913. element->end_pos = kGumboEmptySourcePosition;
  914. return node;
  915. }
  916. // Constructs an element from the given start tag token.
  917. static GumboNode* create_element_from_token(
  918. GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
  919. assert(token->type == GUMBO_TOKEN_START_TAG);
  920. GumboTokenStartTag* start_tag = &token->v.start_tag;
  921. GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML &&
  922. start_tag->tag == GUMBO_TAG_TEMPLATE)
  923. ? GUMBO_NODE_TEMPLATE
  924. : GUMBO_NODE_ELEMENT;
  925. GumboNode* node = create_node(parser, type);
  926. GumboElement* element = &node->v.element;
  927. gumbo_vector_init(parser, 1, &element->children);
  928. element->attributes = start_tag->attributes;
  929. element->tag = start_tag->tag;
  930. element->tag_namespace = tag_namespace;
  931. assert(token->original_text.length >= 2);
  932. assert(token->original_text.data[0] == '<');
  933. assert(token->original_text.data[token->original_text.length - 1] == '>');
  934. element->original_tag = token->original_text;
  935. element->start_pos = token->position;
  936. element->original_end_tag = kGumboEmptyString;
  937. element->end_pos = kGumboEmptySourcePosition;
  938. // The element takes ownership of the attributes from the token, so any
  939. // allocated-memory fields should be nulled out.
  940. start_tag->attributes = kGumboEmptyVector;
  941. return node;
  942. }
  943. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
  944. static void insert_element(GumboParser* parser, GumboNode* node,
  945. bool is_reconstructing_formatting_elements) {
  946. GumboParserState* state = parser->_parser_state;
  947. // NOTE(jdtang): The text node buffer must always be flushed before inserting
  948. // a node, otherwise we're handling nodes in a different order than the spec
  949. // mandated. However, one clause of the spec (character tokens in the body)
  950. // requires that we reconstruct the active formatting elements *before* adding
  951. // the character, and reconstructing the active formatting elements may itself
  952. // result in the insertion of new elements (which should be pushed onto the
  953. // stack of open elements before the buffer is flushed). We solve this (for
  954. // the time being, the spec has been rewritten for <template> and the new
  955. // version may be simpler here) with a boolean flag to this method.
  956. if (!is_reconstructing_formatting_elements) {
  957. maybe_flush_text_node_buffer(parser);
  958. }
  959. InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
  960. insert_node(parser, node, location);
  961. gumbo_vector_add(parser, (void*) node, &state->_open_elements);
  962. }
  963. // Convenience method that combines create_element_from_token and
  964. // insert_element, inserting the generated element directly into the current
  965. // node. Returns the node inserted.
  966. static GumboNode* insert_element_from_token(
  967. GumboParser* parser, GumboToken* token) {
  968. GumboNode* element =
  969. create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
  970. insert_element(parser, element, false);
  971. gumbo_debug("Inserting <%s> element (@%x) from token.\n",
  972. gumbo_normalized_tagname(element->v.element.tag), element);
  973. return element;
  974. }
  975. // Convenience method that combines create_element and insert_element, inserting
  976. // a parser-generated element of a specific tag type. Returns the node
  977. // inserted.
  978. static GumboNode* insert_element_of_tag_type(
  979. GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
  980. GumboNode* element = create_element(parser, tag);
  981. element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
  982. insert_element(parser, element, false);
  983. gumbo_debug("Inserting %s element (@%x) from tag type.\n",
  984. gumbo_normalized_tagname(tag), element);
  985. return element;
  986. }
  987. // Convenience method for creating foreign namespaced element. Returns the node
  988. // inserted.
  989. static GumboNode* insert_foreign_element(
  990. GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
  991. assert(token->type == GUMBO_TOKEN_START_TAG);
  992. GumboNode* element = create_element_from_token(parser, token, tag_namespace);
  993. insert_element(parser, element, false);
  994. if (token_has_attribute(token, "xmlns") &&
  995. !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns",
  996. kLegalXmlns[tag_namespace])) {
  997. // TODO(jdtang): Since there're multiple possible error codes here, we
  998. // eventually need reason codes to differentiate them.
  999. parser_add_parse_error(parser, token);
  1000. }
  1001. if (token_has_attribute(token, "xmlns:xlink") &&
  1002. !attribute_matches_case_sensitive(&token->v.start_tag.attributes,
  1003. "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
  1004. parser_add_parse_error(parser, token);
  1005. }
  1006. return element;
  1007. }
  1008. static void insert_text_token(GumboParser* parser, GumboToken* token) {
  1009. assert(token->type == GUMBO_TOKEN_WHITESPACE ||
  1010. token->type == GUMBO_TOKEN_CHARACTER ||
  1011. token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA);
  1012. TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
  1013. if (buffer_state->_buffer.length == 0) {
  1014. // Initialize position fields.
  1015. buffer_state->_start_original_text = token->original_text.data;
  1016. buffer_state->_start_position = token->position;
  1017. }
  1018. gumbo_string_buffer_append_codepoint(
  1019. parser, token->v.character, &buffer_state->_buffer);
  1020. if (token->type == GUMBO_TOKEN_CHARACTER) {
  1021. buffer_state->_type = GUMBO_NODE_TEXT;
  1022. } else if (token->type == GUMBO_TOKEN_CDATA) {
  1023. buffer_state->_type = GUMBO_NODE_CDATA;
  1024. }
  1025. gumbo_debug("Inserting text token '%c'.\n", token->v.character);
  1026. }
  1027. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
  1028. static void run_generic_parsing_algorithm(
  1029. GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
  1030. insert_element_from_token(parser, token);
  1031. gumbo_tokenizer_set_state(parser, lexer_state);
  1032. parser->_parser_state->_original_insertion_mode =
  1033. parser->_parser_state->_insertion_mode;
  1034. parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
  1035. }
  1036. static void acknowledge_self_closing_tag(GumboParser* parser) {
  1037. parser->_parser_state->_self_closing_flag_acknowledged = true;
  1038. }
  1039. // Returns true if there's an anchor tag in the list of active formatting
  1040. // elements, and fills in its index if so.
  1041. static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
  1042. GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  1043. for (int i = elements->length; --i >= 0;) {
  1044. GumboNode* node = elements->data[i];
  1045. if (node == &kActiveFormattingScopeMarker) {
  1046. return false;
  1047. }
  1048. if (node_html_tag_is(node, GUMBO_TAG_A)) {
  1049. *anchor_index = i;
  1050. return true;
  1051. }
  1052. }
  1053. return false;
  1054. }
  1055. // Counts the number of open formatting elements in the list of active
  1056. // formatting elements (after the last active scope marker) that have a specific
  1057. // tag. If this is > 0, then earliest_matching_index will be filled in with the
  1058. // index of the first such element.
  1059. static int count_formatting_elements_of_tag(GumboParser* parser,
  1060. const GumboNode* desired_node, int* earliest_matching_index) {
  1061. const GumboElement* desired_element = &desired_node->v.element;
  1062. GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  1063. int num_identical_elements = 0;
  1064. for (int i = elements->length; --i >= 0;) {
  1065. GumboNode* node = elements->data[i];
  1066. if (node == &kActiveFormattingScopeMarker) {
  1067. break;
  1068. }
  1069. assert(node->type == GUMBO_NODE_ELEMENT);
  1070. if (node_qualified_tag_is(
  1071. node, desired_element->tag_namespace, desired_element->tag) &&
  1072. all_attributes_match(
  1073. &node->v.element.attributes, &desired_element->attributes)) {
  1074. num_identical_elements++;
  1075. *earliest_matching_index = i;
  1076. }
  1077. }
  1078. return num_identical_elements;
  1079. }
  1080. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
  1081. static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
  1082. assert(node == &kActiveFormattingScopeMarker ||
  1083. node->type == GUMBO_NODE_ELEMENT);
  1084. GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  1085. if (node == &kActiveFormattingScopeMarker) {
  1086. gumbo_debug("Adding a scope marker.\n");
  1087. } else {
  1088. gumbo_debug("Adding a formatting element.\n");
  1089. }
  1090. // Hunt for identical elements.
  1091. int earliest_identical_element = elements->length;
  1092. int num_identical_elements = count_formatting_elements_of_tag(
  1093. parser, node, &earliest_identical_element);
  1094. // Noah's Ark clause: if there're at least 3, remove the earliest.
  1095. if (num_identical_elements >= 3) {
  1096. gumbo_debug("Noah's ark clause: removing element at %d.\n",
  1097. earliest_identical_element);
  1098. gumbo_vector_remove_at(parser, earliest_identical_element, elements);
  1099. }
  1100. gumbo_vector_add(parser, (void*) node, elements);
  1101. }
  1102. static bool is_open_element(GumboParser* parser, const GumboNode* node) {
  1103. GumboVector* open_elements = &parser->_parser_state->_open_elements;
  1104. for (unsigned int i = 0; i < open_elements->length; ++i) {
  1105. if (open_elements->data[i] == node) {
  1106. return true;
  1107. }
  1108. }
  1109. return false;
  1110. }
  1111. // Clones attributes, tags, etc. of a node, but does not copy the content. The
  1112. // clone shares no structure with the original node: all owned strings and
  1113. // values are fresh copies.
  1114. GumboNode* clone_node(
  1115. GumboParser* parser, GumboNode* node, GumboParseFlags reason) {
  1116. assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
  1117. GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
  1118. *new_node = *node;
  1119. new_node->parent = NULL;
  1120. new_node->index_within_parent = -1;
  1121. // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
  1122. // have a separate end tag.
  1123. new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
  1124. new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
  1125. GumboElement* element = &new_node->v.element;
  1126. gumbo_vector_init(parser, 1, &element->children);
  1127. const GumboVector* old_attributes = &node->v.element.attributes;
  1128. gumbo_vector_init(parser, old_attributes->length, &element->attributes);
  1129. for (unsigned int i = 0; i < old_attributes->length; ++i) {
  1130. const GumboAttribute* old_attr = old_attributes->data[i];
  1131. GumboAttribute* attr =
  1132. gumbo_parser_allocate(parser, sizeof(GumboAttribute));
  1133. *attr = *old_attr;
  1134. attr->name = gumbo_copy_stringz(parser, old_attr->name);
  1135. attr->value = gumbo_copy_stringz(parser, old_attr->value);
  1136. gumbo_vector_add(parser, attr, &element->attributes);
  1137. }
  1138. return new_node;
  1139. }
  1140. // "Reconstruct active formatting elements" part of the spec.
  1141. // This implementation is based on the html5lib translation from the mess of
  1142. // GOTOs in the spec to reasonably structured programming.
  1143. // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
  1144. static void reconstruct_active_formatting_elements(GumboParser* parser) {
  1145. GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  1146. // Step 1
  1147. if (elements->length == 0) {
  1148. return;
  1149. }
  1150. // Step 2 & 3
  1151. unsigned int i = elements->length - 1;
  1152. GumboNode* element = elements->data[i];
  1153. if (element == &kActiveFormattingScopeMarker ||
  1154. is_open_element(parser, element)) {
  1155. return;
  1156. }
  1157. // Step 6
  1158. do {
  1159. if (i == 0) {
  1160. // Step 4
  1161. i = -1; // Incremented to 0 below.
  1162. break;
  1163. }
  1164. // Step 5
  1165. element = elements->data[--i];
  1166. } while (element != &kActiveFormattingScopeMarker &&
  1167. !is_open_element(parser, element));
  1168. ++i;
  1169. gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
  1170. gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
  1171. for (; i < elements->length; ++i) {
  1172. // Step 7 & 8.
  1173. assert(elements->length > 0);
  1174. assert(i < elements->length);
  1175. element = elements->data[i];
  1176. assert(element != &kActiveFormattingScopeMarker);
  1177. GumboNode* clone = clone_node(
  1178. parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
  1179. // Step 9.
  1180. InsertionLocation location =
  1181. get_appropriate_insertion_location(parser, NULL);
  1182. insert_node(parser, clone, location);
  1183. gumbo_vector_add(
  1184. parser, (void*) clone, &parser->_parser_state->_open_elements);
  1185. // Step 10.
  1186. elements->data[i] = clone;
  1187. gumbo_debug("Reconstructed %s element at %d.\n",
  1188. gumbo_normalized_tagname(clone->v.element.tag), i);
  1189. }
  1190. }
  1191. static void clear_active_formatting_elements(GumboParser* parser) {
  1192. GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  1193. int num_elements_cleared = 0;
  1194. const GumboNode* node;
  1195. do {
  1196. node = gumbo_vector_pop(parser, elements);
  1197. ++num_elements_cleared;
  1198. } while (node && node != &kActiveFormattingScopeMarker);
  1199. gumbo_debug("Cleared %d elements from active formatting list.\n",
  1200. num_elements_cleared);
  1201. }
  1202. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
  1203. static GumboQuirksModeEnum compute_quirks_mode(
  1204. const GumboTokenDocType* doctype) {
  1205. if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) ||
  1206. is_in_static_list(
  1207. doctype->public_identifier, kQuirksModePublicIdPrefixes, false) ||
  1208. is_in_static_list(
  1209. doctype->public_identifier, kQuirksModePublicIdExactMatches, true) ||
  1210. is_in_static_list(
  1211. doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) ||
  1212. (is_in_static_list(doctype->public_identifier,
  1213. kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
  1214. !doctype->has_system_identifier)) {
  1215. return GUMBO_DOCTYPE_QUIRKS;
  1216. } else if (is_in_static_list(doctype->public_identifier,
  1217. kLimitedQuirksPublicIdPrefixes, false) ||
  1218. (is_in_static_list(doctype->public_identifier,
  1219. kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
  1220. doctype->has_system_identifier)) {
  1221. return GUMBO_DOCTYPE_LIMITED_QUIRKS;
  1222. }
  1223. return GUMBO_DOCTYPE_NO_QUIRKS;
  1224. }
  1225. // The following functions are all defined by the "has an element in __ scope"
  1226. // sections of the HTML5 spec:
  1227. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
  1228. // The basic idea behind them is that they check for an element of the given
  1229. // qualified name, contained within a scope formed by a set of other qualified
  1230. // names. For example, "has an element in list scope" looks for an element of
  1231. // the given qualified name within the nearest enclosing <ol> or <ul>, along
  1232. // with a bunch of generic element types that serve to "firewall" their content
  1233. // from the rest of the document. Note that because of the way the spec is
  1234. // written,
  1235. // all elements are expected to be in the HTML namespace
  1236. static bool has_an_element_in_specific_scope(GumboParser* parser,
  1237. int expected_size, const GumboTag* expected, bool negate,
  1238. const gumbo_tagset tags) {
  1239. GumboVector* open_elements = &parser->_parser_state->_open_elements;
  1240. for (int i = open_elements->length; --i >= 0;) {
  1241. const GumboNode* node = open_elements->data[i];
  1242. if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
  1243. continue;
  1244. GumboTag node_tag = node->v.element.tag;
  1245. GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
  1246. for (int j = 0; j < expected_size; ++j) {
  1247. if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
  1248. return true;
  1249. }
  1250. bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
  1251. if (negate != found) return false;
  1252. }
  1253. return false;
  1254. }
  1255. // Checks for the presence of an open element of the specified tag type.
  1256. static bool has_open_element(GumboParser* parser, GumboTag tag) {
  1257. return has_an_element_in_specific_scope(
  1258. parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)});
  1259. }
  1260. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
  1261. static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
  1262. return has_an_element_in_specific_scope(parser, 1, &tag, false,
  1263. (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
  1264. TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
  1265. TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
  1266. TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
  1267. TAG_SVG(TITLE)});
  1268. }
  1269. // Like "has an element in scope", but for the specific case of looking for a
  1270. // unique target node, not for any node with a given tag name. This duplicates
  1271. // much of the algorithm from has_an_element_in_specific_scope because the
  1272. // predicate is different when checking for an exact node, and it's easier &
  1273. // faster just to duplicate the code for this one case than to try and
  1274. // parameterize it.
  1275. static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
  1276. GumboVector* open_elements = &parser->_parser_state->_open_elements;
  1277. for (int i = open_elements->length; --i >= 0;) {
  1278. const GumboNode* current = open_elements->data[i];
  1279. if (current == node) {
  1280. return true;
  1281. }
  1282. if (current->type != GUMBO_NODE_ELEMENT &&
  1283. current->type != GUMBO_NODE_TEMPLATE) {
  1284. continue;
  1285. }
  1286. if (node_tag_in_set(current,
  1287. (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE),
  1288. TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
  1289. TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
  1290. TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
  1291. TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) {
  1292. return false;
  1293. }
  1294. }
  1295. assert(false);
  1296. return false;
  1297. }
  1298. // Like has_an_element_in_scope, but restricts the expected qualified name to a
  1299. // range of possible qualified names instead of just a single one.
  1300. static bool has_an_element_in_scope_with_tagname(
  1301. GumboParser* parser, int expected_len, const GumboTag expected[]) {
  1302. return has_an_element_in_specific_scope(parser, expected_len, expected, false,
  1303. (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
  1304. TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
  1305. TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
  1306. TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
  1307. TAG_SVG(TITLE)});
  1308. }
  1309. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
  1310. static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
  1311. return has_an_element_in_specific_scope(parser, 1, &tag, false,
  1312. (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
  1313. TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
  1314. TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
  1315. TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
  1316. TAG_SVG(TITLE), TAG(OL), TAG(UL)});
  1317. }
  1318. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
  1319. static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
  1320. return has_an_element_in_specific_scope(parser, 1, &tag, false,
  1321. (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
  1322. TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
  1323. TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
  1324. TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
  1325. TAG_SVG(TITLE), TAG(BUTTON)});
  1326. }
  1327. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
  1328. static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
  1329. return has_an_element_in_specific_scope(parser, 1, &tag, false,
  1330. (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)});
  1331. }
  1332. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
  1333. static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
  1334. return has_an_element_in_specific_scope(
  1335. parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)});
  1336. }
  1337. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
  1338. // "exception" is the "element to exclude from the process" listed in the spec.
  1339. // Pass GUMBO_TAG_LAST to not exclude any of them.
  1340. static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
  1341. for (; node_tag_in_set(get_current_node(parser),
  1342. (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
  1343. TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) &&
  1344. !node_html_tag_is(get_current_node(parser), exception);
  1345. pop_current_node(parser))
  1346. ;
  1347. }
  1348. // This is the "generate all implied end tags thoroughly" clause of the spec.
  1349. // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
  1350. static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
  1351. for (
  1352. ; node_tag_in_set(get_current_node(parser),
  1353. (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI),
  1354. TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC),
  1355. TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)});
  1356. pop_current_node(parser))
  1357. ;
  1358. }
  1359. // This factors out the clauses relating to "act as if an end tag token with tag
  1360. // name "table" had been seen. Returns true if there's a table element in table
  1361. // scope which was successfully closed, false if not and the token should be
  1362. // ignored. Does not add parse errors; callers should handle that.
  1363. static bool close_table(GumboParser* parser) {
  1364. if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
  1365. return false;
  1366. }
  1367. GumboNode* node = pop_current_node(parser);
  1368. while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
  1369. node = pop_current_node(parser);
  1370. }
  1371. reset_insertion_mode_appropriately(parser);
  1372. return true;
  1373. }
  1374. // This factors out the clauses relating to "act as if an end tag token with tag
  1375. // name `cell_tag` had been seen".
  1376. static bool close_table_cell(
  1377. GumboParser* parser, const GumboToken* token, GumboTag cell_tag) {
  1378. bool result = true;
  1379. generate_implied_end_tags(parser, GUMBO_TAG_LAST);
  1380. const GumboNode* node = get_current_node(parser);
  1381. if (!node_html_tag_is(node, cell_tag)) {
  1382. parser_add_parse_error(parser, token);
  1383. result = false;
  1384. }
  1385. do {
  1386. node = pop_current_node(parser);
  1387. } while (!node_html_tag_is(node, cell_tag));
  1388. clear_active_formatting_elements(parser);
  1389. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
  1390. return result;
  1391. }
  1392. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
  1393. // This holds the logic to determine whether we should close a <td> or a <th>.
  1394. static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
  1395. if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
  1396. assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
  1397. return close_table_cell(parser, token, GUMBO_TAG_TD);
  1398. } else {
  1399. assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
  1400. return close_table_cell(parser, token, GUMBO_TAG_TH);
  1401. }
  1402. }
  1403. // This factors out the "act as if an end tag of tag name 'select' had been
  1404. // seen" clause of the spec, since it's referenced in several places. It pops
  1405. // all nodes from the stack until the current <select> has been closed, then
  1406. // resets the insertion mode appropriately.
  1407. static void close_current_select(GumboParser* parser) {
  1408. GumboNode* node = pop_current_node(parser);
  1409. while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
  1410. node = pop_current_node(parser);
  1411. }
  1412. reset_insertion_mode_appropriately(parser);
  1413. }
  1414. // The list of nodes in the "special" category:
  1415. // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
  1416. static bool is_special_node(const GumboNode* node) {
  1417. assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
  1418. return node_tag_in_set(node,
  1419. (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
  1420. TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
  1421. TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
  1422. TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR),
  1423. TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
  1424. TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
  1425. TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
  1426. TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
  1427. TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING),
  1428. TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
  1429. TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
  1430. TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
  1431. TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
  1432. TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
  1433. TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
  1434. TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
  1435. TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
  1436. TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)});
  1437. }
  1438. // Implicitly closes currently open elements until it reaches an element with
  1439. // the
  1440. // specified qualified name. If the elements closed are in the set handled by
  1441. // generate_implied_end_tags, this is normal operation and this function returns
  1442. // true. Otherwise, a parse error is recorded and this function returns false.
  1443. static bool implicitly_close_tags(GumboParser* parser, GumboToken* token,
  1444. GumboNamespaceEnum target_ns, GumboTag target) {
  1445. bool result = true;
  1446. generate_implied_end_tags(parser, target);
  1447. if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
  1448. parser_add_parse_error(parser, token);
  1449. while (
  1450. !node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
  1451. pop_current_node(parser);
  1452. }
  1453. result = false;
  1454. }
  1455. assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
  1456. pop_current_node(parser);
  1457. return result;
  1458. }
  1459. // If the stack of open elements has a <p> tag in button scope, this acts as if
  1460. // a </p> tag was encountered, implicitly closing tags. Returns false if a
  1461. // parse error occurs. This is a convenience function because this particular
  1462. // clause appears several times in the spec.
  1463. static bool maybe_implicitly_close_p_tag(
  1464. GumboParser* parser, GumboToken* token) {
  1465. if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
  1466. return implicitly_close_tags(
  1467. parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
  1468. }
  1469. return true;
  1470. }
  1471. // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
  1472. // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
  1473. static void maybe_implicitly_close_list_tag(
  1474. GumboParser* parser, GumboToken* token, bool is_li) {
  1475. GumboParserState* state = parser->_parser_state;
  1476. state->_frameset_ok = false;
  1477. for (int i = state->_open_elements.length; --i >= 0;) {
  1478. const GumboNode* node = state->_open_elements.data[i];
  1479. bool is_list_tag =
  1480. is_li ? node_html_tag_is(node, GUMBO_TAG_LI)
  1481. : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)});
  1482. if (is_list_tag) {
  1483. implicitly_close_tags(
  1484. parser, token, node->v.element.tag_namespace, node->v.element.tag);
  1485. return;
  1486. }
  1487. if (is_special_node(node) &&
  1488. !node_tag_in_set(
  1489. node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) {
  1490. return;
  1491. }
  1492. }
  1493. }
  1494. static void merge_attributes(
  1495. GumboParser* parser, GumboToken* token, GumboNode* node) {
  1496. assert(token->type == GUMBO_TOKEN_START_TAG);
  1497. assert(node->type == GUMBO_NODE_ELEMENT);
  1498. const GumboVector* token_attr = &token->v.start_tag.attributes;
  1499. GumboVector* node_attr = &node->v.element.attributes;
  1500. for (unsigned int i = 0; i < token_attr->length; ++i) {
  1501. GumboAttribute* attr = token_attr->data[i];
  1502. if (!gumbo_get_attribute(node_attr, attr->name)) {
  1503. // Ownership of the attribute is transferred by this gumbo_vector_add,
  1504. // so it has to be nulled out of the original token so it doesn't get
  1505. // double-deleted.
  1506. gumbo_vector_add(parser, attr, node_attr);
  1507. token_attr->data[i] = NULL;
  1508. }
  1509. }
  1510. // When attributes are merged, it means the token has been ignored and merged
  1511. // with another token, so we need to free its memory. The attributes that are
  1512. // transferred need to be nulled-out in the vector above so that they aren't
  1513. // double-deleted.
  1514. gumbo_token_destroy(parser, token);
  1515. #ifndef NDEBUG
  1516. // Mark this sentinel so the assertion in the main loop knows it's been
  1517. // destroyed.
  1518. token->v.start_tag.attributes = kGumboEmptyVector;
  1519. #endif
  1520. }
  1521. const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
  1522. for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry);
  1523. ++i) {
  1524. const ReplacementEntry* entry = &kSvgTagReplacements[i];
  1525. if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
  1526. return entry->to.data;
  1527. }
  1528. }
  1529. return NULL;
  1530. }
  1531. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
  1532. // This destructively modifies any matching attributes on the token and sets the
  1533. // namespace appropriately.
  1534. static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
  1535. assert(token->type == GUMBO_TOKEN_START_TAG);
  1536. const GumboVector* attributes = &token->v.start_tag.attributes;
  1537. for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) /
  1538. sizeof(NamespacedAttributeReplacement);
  1539. ++i) {
  1540. const NamespacedAttributeReplacement* entry =
  1541. &kForeignAttributeReplacements[i];
  1542. GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
  1543. if (!attr) {
  1544. continue;
  1545. }
  1546. gumbo_parser_deallocate(parser, (void*) attr->name);
  1547. attr->attr_namespace = entry->attr_namespace;
  1548. attr->name = gumbo_copy_stringz(parser, entry->local_name);
  1549. }
  1550. }
  1551. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
  1552. // This destructively modifies any matching attributes on the token.
  1553. static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
  1554. assert(token->type == GUMBO_TOKEN_START_TAG);
  1555. const GumboVector* attributes = &token->v.start_tag.attributes;
  1556. for (size_t i = 0;
  1557. i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
  1558. const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
  1559. GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
  1560. if (!attr) {
  1561. continue;
  1562. }
  1563. gumbo_parser_deallocate(parser, (void*) attr->name);
  1564. attr->name = gumbo_copy_stringz(parser, entry->to.data);
  1565. }
  1566. }
  1567. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
  1568. // Note that this may destructively modify the token with the new attribute
  1569. // value.
  1570. static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
  1571. assert(token->type == GUMBO_TOKEN_START_TAG);
  1572. GumboAttribute* attr =
  1573. gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl");
  1574. if (!attr) {
  1575. return;
  1576. }
  1577. gumbo_parser_deallocate(parser, (void*) attr->name);
  1578. attr->name = gumbo_copy_stringz(parser, "definitionURL");
  1579. }
  1580. static bool doctype_matches(const GumboTokenDocType* doctype,
  1581. const GumboStringPiece* public_id, const GumboStringPiece* system_id,
  1582. bool allow_missing_system_id) {
  1583. return !strcmp(doctype->public_identifier, public_id->data) &&
  1584. (allow_missing_system_id || doctype->has_system_identifier) &&
  1585. !strcmp(doctype->system_identifier, system_id->data);
  1586. }
  1587. static bool maybe_add_doctype_error(
  1588. GumboParser* parser, const GumboToken* token) {
  1589. const GumboTokenDocType* doctype = &token->v.doc_type;
  1590. bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
  1591. if ((!html_doctype || doctype->has_public_identifier ||
  1592. (doctype->has_system_identifier &&
  1593. !strcmp(
  1594. doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
  1595. !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0,
  1596. &kSystemIdRecHtml4_0, true) ||
  1597. doctype_matches(doctype, &kPublicIdHtml4_01,
  1598. &kSystemIdHtml4, true) ||
  1599. doctype_matches(doctype, &kPublicIdXhtml1_0,
  1600. &kSystemIdXhtmlStrict1_1, false) ||
  1601. doctype_matches(doctype, &kPublicIdXhtml1_1,
  1602. &kSystemIdXhtml1_1, false)))) {
  1603. parser_add_parse_error(parser, token);
  1604. return false;
  1605. }
  1606. return true;
  1607. }
  1608. static void remove_from_parent(GumboParser* parser, GumboNode* node) {
  1609. if (!node->parent) {
  1610. // The node may not have a parent if, for example, it is a newly-cloned copy
  1611. // of an active formatting element. DOM manipulations continue with the
  1612. // orphaned fragment of the DOM tree until it's appended/foster-parented to
  1613. // the common ancestor at the end of the adoption agency algorithm.
  1614. return;
  1615. }
  1616. assert(node->parent->type == GUMBO_NODE_ELEMENT);
  1617. GumboVector* children = &node->parent->v.element.children;
  1618. int index = gumbo_vector_index_of(children, node);
  1619. assert(index != -1);
  1620. gumbo_vector_remove_at(parser, index, children);
  1621. node->parent = NULL;
  1622. node->index_within_parent = -1;
  1623. for (unsigned int i = index; i < children->length; ++i) {
  1624. GumboNode* child = children->data[i];
  1625. child->index_within_parent = i;
  1626. }
  1627. }
  1628. // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
  1629. // Also described in the "in body" handling for end formatting tags.
  1630. static bool adoption_agency_algorithm(
  1631. GumboParser* parser, GumboToken* token, GumboTag subject) {
  1632. GumboParserState* state = parser->_parser_state;
  1633. gumbo_debug("Entering adoption agency algorithm.\n");
  1634. // Step 1.
  1635. GumboNode* current_node = get_current_node(parser);
  1636. if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
  1637. current_node->v.element.tag == subject &&
  1638. gumbo_vector_index_of(
  1639. &state->_active_formatting_elements, current_node) == -1) {
  1640. pop_current_node(parser);
  1641. return false;
  1642. }
  1643. // Steps 2-4 & 20:
  1644. for (unsigned int i = 0; i < 8; ++i) {
  1645. // Step 5.
  1646. GumboNode* formatting_node = NULL;
  1647. int formatting_node_in_open_elements = -1;
  1648. for (int j = state->_active_formatting_elements.length; --j >= 0;) {
  1649. GumboNode* current_node = state->_active_formatting_elements.data[j];
  1650. if (current_node == &kActiveFormattingScopeMarker) {
  1651. gumbo_debug("Broke on scope marker; aborting.\n");
  1652. // Last scope marker; abort the algorithm.
  1653. return false;
  1654. }
  1655. if (node_html_tag_is(current_node, subject)) {
  1656. // Found it.
  1657. formatting_node = current_node;
  1658. formatting_node_in_open_elements =
  1659. gumbo_vector_index_of(&state->_open_elements, formatting_node);
  1660. gumbo_debug("Formatting element of tag %s at %d.\n",
  1661. gumbo_normalized_tagname(subject),
  1662. formatting_node_in_open_elements);
  1663. break;
  1664. }
  1665. }
  1666. if (!formatting_node) {
  1667. // No matching tag; not a parse error outright, but fall through to the
  1668. // "any other end tag" clause (which may potentially add a parse error,
  1669. // but not always).
  1670. gumbo_debug("No active formatting elements; aborting.\n");
  1671. return false;
  1672. }
  1673. // Step 6
  1674. if (formatting_node_in_open_elements == -1) {
  1675. gumbo_debug("Formatting node not on stack of open elements.\n");
  1676. parser_add_parse_error(parser, token);
  1677. gumbo_vector_remove(
  1678. parser, formatting_node, &state->_active_formatting_elements);
  1679. return false;
  1680. }
  1681. // Step 7
  1682. if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
  1683. parser_add_parse_error(parser, token);
  1684. gumbo_debug("Element not in scope.\n");
  1685. return false;
  1686. }
  1687. // Step 8
  1688. if (formatting_node != get_current_node(parser)) {
  1689. parser_add_parse_error(parser, token); // But continue onwards.
  1690. }
  1691. assert(formatting_node);
  1692. assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
  1693. assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
  1694. // Step 9 & 10
  1695. GumboNode* furthest_block = NULL;
  1696. for (unsigned int j = formatting_node_in_open_elements;
  1697. j < state->_open_elements.length; ++j) {
  1698. assert(j > 0);
  1699. GumboNode* current = state->_open_elements.data[j];
  1700. if (is_special_node(current)) {
  1701. // Step 9.
  1702. furthest_block = current;
  1703. break;
  1704. }
  1705. }
  1706. if (!furthest_block) {
  1707. // Step 10.
  1708. while (get_current_node(parser) != formatting_node) {
  1709. pop_current_node(parser);
  1710. }
  1711. // And the formatting element itself.
  1712. pop_current_node(parser);
  1713. gumbo_vector_remove(
  1714. parser, formatting_node, &state->_active_formatting_elements);
  1715. return false;
  1716. }
  1717. assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
  1718. assert(furthest_block);
  1719. // Step 11.
  1720. // Elements may be moved and reparented by this algorithm, so
  1721. // common_ancestor is not necessarily the same as formatting_node->parent.
  1722. GumboNode* common_ancestor =
  1723. state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements,
  1724. formatting_node) -
  1725. 1];
  1726. gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
  1727. gumbo_normalized_tagname(common_ancestor->v.element.tag),
  1728. gumbo_normalized_tagname(furthest_block->v.element.tag));
  1729. // Step 12.
  1730. int bookmark = gumbo_vector_index_of(
  1731. &state->_active_formatting_elements, formatting_node) +
  1732. 1;
  1733. gumbo_debug("Bookmark at %d.\n", bookmark);
  1734. // Step 13.
  1735. GumboNode* node = furthest_block;
  1736. GumboNode* last_node = furthest_block;
  1737. // Must be stored explicitly, in case node is removed from the stack of open
  1738. // elements, to handle step 9.4.
  1739. int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
  1740. assert(saved_node_index > 0);
  1741. // Step 13.1.
  1742. for (int j = 0;;) {
  1743. // Step 13.2.
  1744. ++j;
  1745. // Step 13.3.
  1746. int node_index = gumbo_vector_index_of(&state->_open_elements, node);
  1747. gumbo_debug(
  1748. "Current index: %d, last index: %d.\n", node_index, saved_node_index);
  1749. if (node_index == -1) {
  1750. node_index = saved_node_index;
  1751. }
  1752. saved_node_index = --node_index;
  1753. assert(node_index > 0);
  1754. assert((unsigned int) node_index < state->_open_elements.capacity);
  1755. node = state->_open_elements.data[node_index];
  1756. assert(node->parent);
  1757. if (node == formatting_node) {
  1758. // Step 13.4.
  1759. break;
  1760. }
  1761. int formatting_index =
  1762. gumbo_vector_index_of(&state->_active_formatting_elements, node);
  1763. if (j > 3 && formatting_index != -1) {
  1764. // Step 13.5.
  1765. gumbo_debug("Removing formatting element at %d.\n", formatting_index);
  1766. gumbo_vector_remove_at(
  1767. parser, formatting_index, &state->_active_formatting_elements);
  1768. // Removing the element shifts all indices over by one, so we may need
  1769. // to move the bookmark.
  1770. if (formatting_index < bookmark) {
  1771. --bookmark;
  1772. gumbo_debug("Moving bookmark to %d.\n", bookmark);
  1773. }
  1774. continue;
  1775. }
  1776. if (formatting_index == -1) {
  1777. // Step 13.6.
  1778. gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
  1779. continue;
  1780. }
  1781. // Step 13.7.
  1782. // "common ancestor as the intended parent" doesn't actually mean insert
  1783. // it into the common ancestor; that happens below.
  1784. node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
  1785. assert(formatting_index >= 0);
  1786. state->_active_formatting_elements.data[formatting_index] = node;
  1787. assert(node_index >= 0);
  1788. state->_open_elements.data[node_index] = node;
  1789. // Step 13.8.
  1790. if (last_node == furthest_block) {
  1791. bookmark = formatting_index + 1;
  1792. gumbo_debug("Bookmark moved to %d.\n", bookmark);
  1793. assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
  1794. }
  1795. // Step 13.9.
  1796. last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
  1797. remove_from_parent(parser, last_node);
  1798. append_node(parser, node, last_node);
  1799. // Step 13.10.
  1800. last_node = node;
  1801. } // Step 13.11.
  1802. // Step 14.
  1803. gumbo_debug("Removing %s node from parent ",
  1804. gumbo_normalized_tagname(last_node->v.element.tag));
  1805. remove_from_parent(parser, last_node);
  1806. last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
  1807. InsertionLocation location =
  1808. get_appropriate_insertion_location(parser, common_ancestor);
  1809. gumbo_debug("and inserting it into %s.\n",
  1810. gumbo_normalized_tagname(location.target->v.element.tag));
  1811. insert_node(parser, last_node, location);
  1812. // Step 15.
  1813. GumboNode* new_formatting_node = clone_node(
  1814. parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
  1815. formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
  1816. // Step 16. Instead of appending nodes one-by-one, we swap the children
  1817. // vector of furthest_block with the empty children of new_formatting_node,
  1818. // reducing memory traffic and allocations. We still have to reset their
  1819. // parent pointers, though.
  1820. GumboVector temp = new_formatting_node->v.element.children;
  1821. new_formatting_node->v.element.children =
  1822. furthest_block->v.element.children;
  1823. furthest_block->v.element.children = temp;
  1824. temp = new_formatting_node->v.element.children;
  1825. for (unsigned int i = 0; i < temp.length; ++i) {
  1826. GumboNode* child = temp.data[i];
  1827. child->parent = new_formatting_node;
  1828. }
  1829. // Step 17.
  1830. append_node(parser, furthest_block, new_formatting_node);
  1831. // Step 18.
  1832. // If the formatting node was before the bookmark, it may shift over all
  1833. // indices after it, so we need to explicitly find the index and possibly
  1834. // adjust the bookmark.
  1835. int formatting_node_index = gumbo_vector_index_of(
  1836. &state->_active_formatting_elements, formatting_node);
  1837. assert(formatting_node_index != -1);
  1838. if (formatting_node_index < bookmark) {
  1839. gumbo_debug(
  1840. "Formatting node at %d is before bookmark at %d; decrementing.\n",
  1841. formatting_node_index, bookmark);
  1842. --bookmark;
  1843. }
  1844. gumbo_vector_remove_at(
  1845. parser, formatting_node_index, &state->_active_formatting_elements);
  1846. assert(bookmark >= 0);
  1847. assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
  1848. gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
  1849. &state->_active_formatting_elements);
  1850. // Step 19.
  1851. gumbo_vector_remove(parser, formatting_node, &state->_open_elements);
  1852. int insert_at =
  1853. gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1;
  1854. assert(insert_at >= 0);
  1855. assert((unsigned int) insert_at <= state->_open_elements.length);
  1856. gumbo_vector_insert_at(
  1857. parser, new_formatting_node, insert_at, &state->_open_elements);
  1858. } // Step 20.
  1859. return true;
  1860. }
  1861. // This is here to clean up memory when the spec says "Ignore current token."
  1862. static void ignore_token(GumboParser* parser) {
  1863. GumboToken* token = parser->_parser_state->_current_token;
  1864. // Ownership of the token's internal buffers are normally transferred to the
  1865. // element, but if no element is emitted (as happens in non-verbatim-mode
  1866. // when a token is ignored), we need to free it here to prevent a memory
  1867. // leak.
  1868. gumbo_token_destroy(parser, token);
  1869. #ifndef NDEBUG
  1870. if (token->type == GUMBO_TOKEN_START_TAG) {
  1871. // Mark this sentinel so the assertion in the main loop knows it's been
  1872. // destroyed.
  1873. token->v.start_tag.attributes = kGumboEmptyVector;
  1874. }
  1875. #endif
  1876. }
  1877. // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
  1878. static void finish_parsing(GumboParser* parser) {
  1879. gumbo_debug("Finishing parsing");
  1880. maybe_flush_text_node_buffer(parser);
  1881. GumboParserState* state = parser->_parser_state;
  1882. for (GumboNode* node = pop_current_node(parser); node;
  1883. node = pop_current_node(parser)) {
  1884. if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
  1885. (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
  1886. continue;
  1887. }
  1888. node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
  1889. }
  1890. while (pop_current_node(parser))
  1891. ; // Pop them all.
  1892. }
  1893. static bool handle_initial(GumboParser* parser, GumboToken* token) {
  1894. GumboDocument* document = &get_document_node(parser)->v.document;
  1895. if (token->type == GUMBO_TOKEN_WHITESPACE) {
  1896. ignore_token(parser);
  1897. return true;
  1898. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  1899. append_comment_node(parser, get_document_node(parser), token);
  1900. return true;
  1901. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  1902. document->has_doctype = true;
  1903. document->name = token->v.doc_type.name;
  1904. document->public_identifier = token->v.doc_type.public_identifier;
  1905. document->system_identifier = token->v.doc_type.system_identifier;
  1906. document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
  1907. set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
  1908. return maybe_add_doctype_error(parser, token);
  1909. }
  1910. parser_add_parse_error(parser, token);
  1911. document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
  1912. set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
  1913. parser->_parser_state->_reprocess_current_token = true;
  1914. return true;
  1915. }
  1916. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
  1917. static bool handle_before_html(GumboParser* parser, GumboToken* token) {
  1918. if (token->type == GUMBO_TOKEN_DOCTYPE) {
  1919. parser_add_parse_error(parser, token);
  1920. ignore_token(parser);
  1921. return false;
  1922. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  1923. append_comment_node(parser, get_document_node(parser), token);
  1924. return true;
  1925. } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
  1926. ignore_token(parser);
  1927. return true;
  1928. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  1929. GumboNode* html_node = insert_element_from_token(parser, token);
  1930. parser->_output->root = html_node;
  1931. set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
  1932. return true;
  1933. } else if (token->type == GUMBO_TOKEN_END_TAG &&
  1934. !tag_in(token, false,
  1935. (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
  1936. parser_add_parse_error(parser, token);
  1937. ignore_token(parser);
  1938. return false;
  1939. } else {
  1940. GumboNode* html_node = insert_element_of_tag_type(
  1941. parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
  1942. assert(html_node);
  1943. parser->_output->root = html_node;
  1944. set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
  1945. parser->_parser_state->_reprocess_current_token = true;
  1946. return true;
  1947. }
  1948. }
  1949. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
  1950. static bool handle_before_head(GumboParser* parser, GumboToken* token) {
  1951. if (token->type == GUMBO_TOKEN_DOCTYPE) {
  1952. parser_add_parse_error(parser, token);
  1953. ignore_token(parser);
  1954. return false;
  1955. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  1956. append_comment_node(parser, get_current_node(parser), token);
  1957. return true;
  1958. } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
  1959. ignore_token(parser);
  1960. return true;
  1961. } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
  1962. GumboNode* node = insert_element_from_token(parser, token);
  1963. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
  1964. parser->_parser_state->_head_element = node;
  1965. return true;
  1966. } else if (token->type == GUMBO_TOKEN_END_TAG &&
  1967. !tag_in(token, false,
  1968. (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
  1969. parser_add_parse_error(parser, token);
  1970. ignore_token(parser);
  1971. return false;
  1972. } else {
  1973. GumboNode* node = insert_element_of_tag_type(
  1974. parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
  1975. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
  1976. parser->_parser_state->_head_element = node;
  1977. parser->_parser_state->_reprocess_current_token = true;
  1978. return true;
  1979. }
  1980. }
  1981. // Forward declarations because of mutual dependencies.
  1982. static bool handle_token(GumboParser* parser, GumboToken* token);
  1983. static bool handle_in_body(GumboParser* parser, GumboToken* token);
  1984. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
  1985. static bool handle_in_head(GumboParser* parser, GumboToken* token) {
  1986. if (token->type == GUMBO_TOKEN_WHITESPACE) {
  1987. insert_text_token(parser, token);
  1988. return true;
  1989. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  1990. parser_add_parse_error(parser, token);
  1991. ignore_token(parser);
  1992. return false;
  1993. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  1994. append_comment_node(parser, get_current_node(parser), token);
  1995. return true;
  1996. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  1997. return handle_in_body(parser, token);
  1998. } else if (tag_in(token, kStartTag,
  1999. (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
  2000. TAG(MENUITEM), TAG(LINK)})) {
  2001. insert_element_from_token(parser, token);
  2002. pop_current_node(parser);
  2003. acknowledge_self_closing_tag(parser);
  2004. return true;
  2005. } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
  2006. insert_element_from_token(parser, token);
  2007. pop_current_node(parser);
  2008. acknowledge_self_closing_tag(parser);
  2009. // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
  2010. // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
  2011. // should specifically look for that string in the document and re-encode it
  2012. // before passing to Gumbo.
  2013. return true;
  2014. } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
  2015. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
  2016. return true;
  2017. } else if (tag_in(
  2018. token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) {
  2019. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
  2020. return true;
  2021. } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
  2022. insert_element_from_token(parser, token);
  2023. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
  2024. return true;
  2025. } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
  2026. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
  2027. return true;
  2028. } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
  2029. GumboNode* head = pop_current_node(parser);
  2030. AVOID_UNUSED_VARIABLE_WARNING(head);
  2031. assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
  2032. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
  2033. return true;
  2034. } else if (tag_in(token, kEndTag,
  2035. (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) {
  2036. pop_current_node(parser);
  2037. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
  2038. parser->_parser_state->_reprocess_current_token = true;
  2039. return true;
  2040. } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
  2041. insert_element_from_token(parser, token);
  2042. add_formatting_element(parser, &kActiveFormattingScopeMarker);
  2043. parser->_parser_state->_frameset_ok = false;
  2044. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
  2045. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
  2046. return true;
  2047. } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
  2048. if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2049. parser_add_parse_error(parser, token);
  2050. ignore_token(parser);
  2051. return false;
  2052. }
  2053. generate_all_implied_end_tags_thoroughly(parser);
  2054. bool success = true;
  2055. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
  2056. parser_add_parse_error(parser, token);
  2057. success = false;
  2058. }
  2059. while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
  2060. ;
  2061. clear_active_formatting_elements(parser);
  2062. pop_template_insertion_mode(parser);
  2063. reset_insertion_mode_appropriately(parser);
  2064. return success;
  2065. } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
  2066. (token->type == GUMBO_TOKEN_END_TAG)) {
  2067. parser_add_parse_error(parser, token);
  2068. ignore_token(parser);
  2069. return false;
  2070. } else {
  2071. pop_current_node(parser);
  2072. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
  2073. parser->_parser_state->_reprocess_current_token = true;
  2074. return true;
  2075. }
  2076. return true;
  2077. }
  2078. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
  2079. static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
  2080. if (token->type == GUMBO_TOKEN_DOCTYPE) {
  2081. parser_add_parse_error(parser, token);
  2082. return false;
  2083. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  2084. return handle_in_body(parser, token);
  2085. } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
  2086. const GumboNode* node = pop_current_node(parser);
  2087. assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
  2088. AVOID_UNUSED_VARIABLE_WARNING(node);
  2089. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
  2090. return true;
  2091. } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
  2092. token->type == GUMBO_TOKEN_COMMENT ||
  2093. tag_in(token, kStartTag,
  2094. (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
  2095. TAG(META), TAG(NOFRAMES), TAG(STYLE)})) {
  2096. return handle_in_head(parser, token);
  2097. } else if (tag_in(
  2098. token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) ||
  2099. (token->type == GUMBO_TOKEN_END_TAG &&
  2100. !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
  2101. parser_add_parse_error(parser, token);
  2102. ignore_token(parser);
  2103. return false;
  2104. } else {
  2105. parser_add_parse_error(parser, token);
  2106. const GumboNode* node = pop_current_node(parser);
  2107. assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
  2108. AVOID_UNUSED_VARIABLE_WARNING(node);
  2109. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
  2110. parser->_parser_state->_reprocess_current_token = true;
  2111. return false;
  2112. }
  2113. }
  2114. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
  2115. static bool handle_after_head(GumboParser* parser, GumboToken* token) {
  2116. GumboParserState* state = parser->_parser_state;
  2117. if (token->type == GUMBO_TOKEN_WHITESPACE) {
  2118. insert_text_token(parser, token);
  2119. return true;
  2120. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  2121. parser_add_parse_error(parser, token);
  2122. ignore_token(parser);
  2123. return false;
  2124. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  2125. append_comment_node(parser, get_current_node(parser), token);
  2126. return true;
  2127. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  2128. return handle_in_body(parser, token);
  2129. } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
  2130. insert_element_from_token(parser, token);
  2131. state->_frameset_ok = false;
  2132. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  2133. return true;
  2134. } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
  2135. insert_element_from_token(parser, token);
  2136. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
  2137. return true;
  2138. } else if (tag_in(token, kStartTag,
  2139. (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
  2140. TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
  2141. TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) {
  2142. parser_add_parse_error(parser, token);
  2143. assert(state->_head_element != NULL);
  2144. // This must be flushed before we push the head element on, as there may be
  2145. // pending character tokens that should be attached to the root.
  2146. maybe_flush_text_node_buffer(parser);
  2147. gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
  2148. bool result = handle_in_head(parser, token);
  2149. gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
  2150. return result;
  2151. } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
  2152. return handle_in_head(parser, token);
  2153. } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
  2154. (token->type == GUMBO_TOKEN_END_TAG &&
  2155. !tag_in(token, kEndTag,
  2156. (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) {
  2157. parser_add_parse_error(parser, token);
  2158. ignore_token(parser);
  2159. return false;
  2160. } else {
  2161. insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
  2162. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  2163. state->_reprocess_current_token = true;
  2164. return true;
  2165. }
  2166. }
  2167. static void destroy_node(GumboParser* parser, GumboNode* node) {
  2168. switch (node->type) {
  2169. case GUMBO_NODE_DOCUMENT: {
  2170. GumboDocument* doc = &node->v.document;
  2171. for (unsigned int i = 0; i < doc->children.length; ++i) {
  2172. destroy_node(parser, doc->children.data[i]);
  2173. }
  2174. gumbo_parser_deallocate(parser, (void*) doc->children.data);
  2175. gumbo_parser_deallocate(parser, (void*) doc->name);
  2176. gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
  2177. gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
  2178. } break;
  2179. case GUMBO_NODE_TEMPLATE:
  2180. case GUMBO_NODE_ELEMENT:
  2181. for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) {
  2182. gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
  2183. }
  2184. gumbo_parser_deallocate(parser, node->v.element.attributes.data);
  2185. for (unsigned int i = 0; i < node->v.element.children.length; ++i) {
  2186. destroy_node(parser, node->v.element.children.data[i]);
  2187. }
  2188. gumbo_parser_deallocate(parser, node->v.element.children.data);
  2189. break;
  2190. case GUMBO_NODE_TEXT:
  2191. case GUMBO_NODE_CDATA:
  2192. case GUMBO_NODE_COMMENT:
  2193. case GUMBO_NODE_WHITESPACE:
  2194. gumbo_parser_deallocate(parser, (void*) node->v.text.text);
  2195. break;
  2196. }
  2197. gumbo_parser_deallocate(parser, node);
  2198. }
  2199. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
  2200. static bool handle_in_body(GumboParser* parser, GumboToken* token) {
  2201. GumboParserState* state = parser->_parser_state;
  2202. assert(state->_open_elements.length > 0);
  2203. if (token->type == GUMBO_TOKEN_NULL) {
  2204. parser_add_parse_error(parser, token);
  2205. ignore_token(parser);
  2206. return false;
  2207. } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
  2208. reconstruct_active_formatting_elements(parser);
  2209. insert_text_token(parser, token);
  2210. return true;
  2211. } else if (token->type == GUMBO_TOKEN_CHARACTER ||
  2212. token->type == GUMBO_TOKEN_CDATA) {
  2213. reconstruct_active_formatting_elements(parser);
  2214. insert_text_token(parser, token);
  2215. set_frameset_not_ok(parser);
  2216. return true;
  2217. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  2218. append_comment_node(parser, get_current_node(parser), token);
  2219. return true;
  2220. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  2221. parser_add_parse_error(parser, token);
  2222. ignore_token(parser);
  2223. return false;
  2224. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  2225. parser_add_parse_error(parser, token);
  2226. if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2227. ignore_token(parser);
  2228. return false;
  2229. }
  2230. assert(parser->_output->root != NULL);
  2231. assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
  2232. merge_attributes(parser, token, parser->_output->root);
  2233. return false;
  2234. } else if (tag_in(token, kStartTag,
  2235. (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
  2236. TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES),
  2237. TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
  2238. tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
  2239. return handle_in_head(parser, token);
  2240. } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
  2241. parser_add_parse_error(parser, token);
  2242. if (state->_open_elements.length < 2 ||
  2243. !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
  2244. has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2245. ignore_token(parser);
  2246. return false;
  2247. }
  2248. state->_frameset_ok = false;
  2249. merge_attributes(parser, token, state->_open_elements.data[1]);
  2250. return false;
  2251. } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
  2252. parser_add_parse_error(parser, token);
  2253. if (state->_open_elements.length < 2 ||
  2254. !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
  2255. !state->_frameset_ok) {
  2256. ignore_token(parser);
  2257. return false;
  2258. }
  2259. // Save the body node for later removal.
  2260. GumboNode* body_node = state->_open_elements.data[1];
  2261. // Pop all nodes except root HTML element.
  2262. GumboNode* node;
  2263. do {
  2264. node = pop_current_node(parser);
  2265. } while (node != state->_open_elements.data[1]);
  2266. // Removing & destroying the body node is going to kill any nodes that have
  2267. // been added to the list of active formatting elements, and so we should
  2268. // clear it to prevent a use-after-free if the list of active formatting
  2269. // elements is reconstructed afterwards. This may happen if whitespace
  2270. // follows the </frameset>.
  2271. clear_active_formatting_elements(parser);
  2272. // Remove the body node. We may want to factor this out into a generic
  2273. // helper, but right now this is the only code that needs to do this.
  2274. GumboVector* children = &parser->_output->root->v.element.children;
  2275. for (unsigned int i = 0; i < children->length; ++i) {
  2276. if (children->data[i] == body_node) {
  2277. gumbo_vector_remove_at(parser, i, children);
  2278. break;
  2279. }
  2280. }
  2281. destroy_node(parser, body_node);
  2282. // Insert the <frameset>, and switch the insertion mode.
  2283. insert_element_from_token(parser, token);
  2284. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
  2285. return true;
  2286. } else if (token->type == GUMBO_TOKEN_EOF) {
  2287. for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
  2288. if (!node_tag_in_set(state->_open_elements.data[i],
  2289. (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY),
  2290. TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY),
  2291. TAG(HTML)})) {
  2292. parser_add_parse_error(parser, token);
  2293. }
  2294. }
  2295. if (get_current_template_insertion_mode(parser) !=
  2296. GUMBO_INSERTION_MODE_INITIAL) {
  2297. return handle_in_template(parser, token);
  2298. }
  2299. return true;
  2300. } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) {
  2301. if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
  2302. parser_add_parse_error(parser, token);
  2303. ignore_token(parser);
  2304. return false;
  2305. }
  2306. bool success = true;
  2307. for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
  2308. if (!node_tag_in_set(state->_open_elements.data[i],
  2309. (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
  2310. TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC),
  2311. TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
  2312. TAG(BODY), TAG(HTML)})) {
  2313. parser_add_parse_error(parser, token);
  2314. success = false;
  2315. break;
  2316. }
  2317. }
  2318. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
  2319. if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
  2320. parser->_parser_state->_reprocess_current_token = true;
  2321. } else {
  2322. GumboNode* body = state->_open_elements.data[1];
  2323. assert(node_html_tag_is(body, GUMBO_TAG_BODY));
  2324. record_end_of_element(state->_current_token, &body->v.element);
  2325. }
  2326. return success;
  2327. } else if (tag_in(token, kStartTag,
  2328. (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
  2329. TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR),
  2330. TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
  2331. TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
  2332. TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P),
  2333. TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
  2334. bool result = maybe_implicitly_close_p_tag(parser, token);
  2335. insert_element_from_token(parser, token);
  2336. return result;
  2337. } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
  2338. TAG(H4), TAG(H5), TAG(H6)})) {
  2339. bool result = maybe_implicitly_close_p_tag(parser, token);
  2340. if (node_tag_in_set(
  2341. get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
  2342. TAG(H4), TAG(H5), TAG(H6)})) {
  2343. parser_add_parse_error(parser, token);
  2344. pop_current_node(parser);
  2345. result = false;
  2346. }
  2347. insert_element_from_token(parser, token);
  2348. return result;
  2349. } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) {
  2350. bool result = maybe_implicitly_close_p_tag(parser, token);
  2351. insert_element_from_token(parser, token);
  2352. state->_ignore_next_linefeed = true;
  2353. state->_frameset_ok = false;
  2354. return result;
  2355. } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
  2356. if (state->_form_element != NULL &&
  2357. !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2358. gumbo_debug("Ignoring nested form.\n");
  2359. parser_add_parse_error(parser, token);
  2360. ignore_token(parser);
  2361. return false;
  2362. }
  2363. bool result = maybe_implicitly_close_p_tag(parser, token);
  2364. GumboNode* form_element = insert_element_from_token(parser, token);
  2365. if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2366. state->_form_element = form_element;
  2367. }
  2368. return result;
  2369. } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
  2370. maybe_implicitly_close_list_tag(parser, token, true);
  2371. bool result = maybe_implicitly_close_p_tag(parser, token);
  2372. insert_element_from_token(parser, token);
  2373. return result;
  2374. } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
  2375. maybe_implicitly_close_list_tag(parser, token, false);
  2376. bool result = maybe_implicitly_close_p_tag(parser, token);
  2377. insert_element_from_token(parser, token);
  2378. return result;
  2379. } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
  2380. bool result = maybe_implicitly_close_p_tag(parser, token);
  2381. insert_element_from_token(parser, token);
  2382. gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
  2383. return result;
  2384. } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
  2385. if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
  2386. parser_add_parse_error(parser, token);
  2387. implicitly_close_tags(
  2388. parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
  2389. state->_reprocess_current_token = true;
  2390. return false;
  2391. }
  2392. reconstruct_active_formatting_elements(parser);
  2393. insert_element_from_token(parser, token);
  2394. state->_frameset_ok = false;
  2395. return true;
  2396. } else if (tag_in(token, kEndTag,
  2397. (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
  2398. TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
  2399. TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
  2400. TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
  2401. TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV),
  2402. TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
  2403. GumboTag tag = token->v.end_tag;
  2404. if (!has_an_element_in_scope(parser, tag)) {
  2405. parser_add_parse_error(parser, token);
  2406. ignore_token(parser);
  2407. return false;
  2408. }
  2409. implicitly_close_tags(
  2410. parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
  2411. return true;
  2412. } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
  2413. if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2414. if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
  2415. parser_add_parse_error(parser, token);
  2416. ignore_token(parser);
  2417. return false;
  2418. }
  2419. bool success = true;
  2420. generate_implied_end_tags(parser, GUMBO_TAG_LAST);
  2421. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
  2422. parser_add_parse_error(parser, token);
  2423. return false;
  2424. }
  2425. while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
  2426. ;
  2427. return success;
  2428. } else {
  2429. bool result = true;
  2430. const GumboNode* node = state->_form_element;
  2431. assert(!node || node->type == GUMBO_NODE_ELEMENT);
  2432. state->_form_element = NULL;
  2433. if (!node || !has_node_in_scope(parser, node)) {
  2434. gumbo_debug("Closing an unopened form.\n");
  2435. parser_add_parse_error(parser, token);
  2436. ignore_token(parser);
  2437. return false;
  2438. }
  2439. // This differs from implicitly_close_tags because we remove *only* the
  2440. // <form> element; other nodes are left in scope.
  2441. generate_implied_end_tags(parser, GUMBO_TAG_LAST);
  2442. if (get_current_node(parser) != node) {
  2443. parser_add_parse_error(parser, token);
  2444. result = false;
  2445. }
  2446. GumboVector* open_elements = &state->_open_elements;
  2447. int index = gumbo_vector_index_of(open_elements, node);
  2448. assert(index >= 0);
  2449. gumbo_vector_remove_at(parser, index, open_elements);
  2450. return result;
  2451. }
  2452. } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
  2453. if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
  2454. parser_add_parse_error(parser, token);
  2455. // reconstruct_active_formatting_elements(parser);
  2456. insert_element_of_tag_type(
  2457. parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
  2458. state->_reprocess_current_token = true;
  2459. return false;
  2460. }
  2461. return implicitly_close_tags(
  2462. parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
  2463. } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
  2464. if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
  2465. parser_add_parse_error(parser, token);
  2466. ignore_token(parser);
  2467. return false;
  2468. }
  2469. return implicitly_close_tags(
  2470. parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
  2471. } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
  2472. assert(token->type == GUMBO_TOKEN_END_TAG);
  2473. GumboTag token_tag = token->v.end_tag;
  2474. if (!has_an_element_in_scope(parser, token_tag)) {
  2475. parser_add_parse_error(parser, token);
  2476. ignore_token(parser);
  2477. return false;
  2478. }
  2479. return implicitly_close_tags(
  2480. parser, token, GUMBO_NAMESPACE_HTML, token_tag);
  2481. } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
  2482. TAG(H4), TAG(H5), TAG(H6)})) {
  2483. if (!has_an_element_in_scope_with_tagname(
  2484. parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
  2485. GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
  2486. // No heading open; ignore the token entirely.
  2487. parser_add_parse_error(parser, token);
  2488. ignore_token(parser);
  2489. return false;
  2490. } else {
  2491. generate_implied_end_tags(parser, GUMBO_TAG_LAST);
  2492. const GumboNode* current_node = get_current_node(parser);
  2493. bool success = node_html_tag_is(current_node, token->v.end_tag);
  2494. if (!success) {
  2495. // There're children of the heading currently open; close them below and
  2496. // record a parse error.
  2497. // TODO(jdtang): Add a way to distinguish this error case from the one
  2498. // above.
  2499. parser_add_parse_error(parser, token);
  2500. }
  2501. do {
  2502. current_node = pop_current_node(parser);
  2503. } while (!node_tag_in_set(
  2504. current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
  2505. TAG(H4), TAG(H5), TAG(H6)}));
  2506. return success;
  2507. }
  2508. } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
  2509. bool success = true;
  2510. int last_a;
  2511. int has_matching_a = find_last_anchor_index(parser, &last_a);
  2512. if (has_matching_a) {
  2513. assert(has_matching_a == 1);
  2514. parser_add_parse_error(parser, token);
  2515. adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
  2516. // The adoption agency algorithm usually removes all instances of <a>
  2517. // from the list of active formatting elements, but in case it doesn't,
  2518. // we're supposed to do this. (The conditions where it might not are
  2519. // listed in the spec.)
  2520. if (find_last_anchor_index(parser, &last_a)) {
  2521. void* last_element = gumbo_vector_remove_at(
  2522. parser, last_a, &state->_active_formatting_elements);
  2523. gumbo_vector_remove(parser, last_element, &state->_open_elements);
  2524. }
  2525. success = false;
  2526. }
  2527. reconstruct_active_formatting_elements(parser);
  2528. add_formatting_element(parser, insert_element_from_token(parser, token));
  2529. return success;
  2530. } else if (tag_in(token, kStartTag,
  2531. (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT),
  2532. TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG),
  2533. TAG(TT), TAG(U)})) {
  2534. reconstruct_active_formatting_elements(parser);
  2535. add_formatting_element(parser, insert_element_from_token(parser, token));
  2536. return true;
  2537. } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
  2538. bool result = true;
  2539. reconstruct_active_formatting_elements(parser);
  2540. if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
  2541. result = false;
  2542. parser_add_parse_error(parser, token);
  2543. adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
  2544. reconstruct_active_formatting_elements(parser);
  2545. }
  2546. insert_element_from_token(parser, token);
  2547. add_formatting_element(parser, get_current_node(parser));
  2548. return result;
  2549. } else if (tag_in(token, kEndTag,
  2550. (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM),
  2551. TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL),
  2552. TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) {
  2553. return adoption_agency_algorithm(parser, token, token->v.end_tag);
  2554. } else if (tag_in(token, kStartTag,
  2555. (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
  2556. reconstruct_active_formatting_elements(parser);
  2557. insert_element_from_token(parser, token);
  2558. add_formatting_element(parser, &kActiveFormattingScopeMarker);
  2559. set_frameset_not_ok(parser);
  2560. return true;
  2561. } else if (tag_in(token, kEndTag,
  2562. (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
  2563. GumboTag token_tag = token->v.end_tag;
  2564. if (!has_an_element_in_table_scope(parser, token_tag)) {
  2565. parser_add_parse_error(parser, token);
  2566. ignore_token(parser);
  2567. return false;
  2568. }
  2569. implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
  2570. clear_active_formatting_elements(parser);
  2571. return true;
  2572. } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
  2573. if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
  2574. GUMBO_DOCTYPE_QUIRKS) {
  2575. maybe_implicitly_close_p_tag(parser, token);
  2576. }
  2577. insert_element_from_token(parser, token);
  2578. set_frameset_not_ok(parser);
  2579. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  2580. return true;
  2581. } else if (tag_in(token, kStartTag,
  2582. (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG),
  2583. TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) {
  2584. bool success = true;
  2585. if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
  2586. success = false;
  2587. parser_add_parse_error(parser, token);
  2588. token->v.start_tag.tag = GUMBO_TAG_IMG;
  2589. }
  2590. reconstruct_active_formatting_elements(parser);
  2591. GumboNode* node = insert_element_from_token(parser, token);
  2592. if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
  2593. success = false;
  2594. parser_add_parse_error(parser, token);
  2595. node->v.element.tag = GUMBO_TAG_IMG;
  2596. node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
  2597. }
  2598. pop_current_node(parser);
  2599. acknowledge_self_closing_tag(parser);
  2600. set_frameset_not_ok(parser);
  2601. return success;
  2602. } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
  2603. if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
  2604. // Must be before the element is inserted, as that takes ownership of the
  2605. // token's attribute vector.
  2606. set_frameset_not_ok(parser);
  2607. }
  2608. reconstruct_active_formatting_elements(parser);
  2609. insert_element_from_token(parser, token);
  2610. pop_current_node(parser);
  2611. acknowledge_self_closing_tag(parser);
  2612. return true;
  2613. } else if (tag_in(token, kStartTag,
  2614. (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) {
  2615. insert_element_from_token(parser, token);
  2616. pop_current_node(parser);
  2617. acknowledge_self_closing_tag(parser);
  2618. return true;
  2619. } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
  2620. bool result = maybe_implicitly_close_p_tag(parser, token);
  2621. insert_element_from_token(parser, token);
  2622. pop_current_node(parser);
  2623. acknowledge_self_closing_tag(parser);
  2624. set_frameset_not_ok(parser);
  2625. return result;
  2626. } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
  2627. parser_add_parse_error(parser, token);
  2628. if (parser->_parser_state->_form_element != NULL &&
  2629. !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2630. ignore_token(parser);
  2631. return false;
  2632. }
  2633. acknowledge_self_closing_tag(parser);
  2634. maybe_implicitly_close_p_tag(parser, token);
  2635. set_frameset_not_ok(parser);
  2636. GumboVector* token_attrs = &token->v.start_tag.attributes;
  2637. GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
  2638. GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
  2639. GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name");
  2640. GumboNode* form = insert_element_of_tag_type(
  2641. parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
  2642. if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2643. parser->_parser_state->_form_element = form;
  2644. }
  2645. if (action_attr) {
  2646. gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
  2647. }
  2648. insert_element_of_tag_type(
  2649. parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
  2650. pop_current_node(parser); // <hr>
  2651. insert_element_of_tag_type(
  2652. parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX);
  2653. TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
  2654. text_state->_start_original_text = token->original_text.data;
  2655. text_state->_start_position = token->position;
  2656. text_state->_type = GUMBO_NODE_TEXT;
  2657. if (prompt_attr) {
  2658. int prompt_attr_length = strlen(prompt_attr->value);
  2659. gumbo_string_buffer_destroy(parser, &text_state->_buffer);
  2660. text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
  2661. text_state->_buffer.length = prompt_attr_length;
  2662. text_state->_buffer.capacity = prompt_attr_length + 1;
  2663. gumbo_destroy_attribute(parser, prompt_attr);
  2664. } else {
  2665. GumboStringPiece prompt_text =
  2666. GUMBO_STRING("This is a searchable index. Enter search keywords: ");
  2667. gumbo_string_buffer_append_string(
  2668. parser, &prompt_text, &text_state->_buffer);
  2669. }
  2670. GumboNode* input = insert_element_of_tag_type(
  2671. parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
  2672. for (unsigned int i = 0; i < token_attrs->length; ++i) {
  2673. GumboAttribute* attr = token_attrs->data[i];
  2674. if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
  2675. gumbo_vector_add(parser, attr, &input->v.element.attributes);
  2676. }
  2677. token_attrs->data[i] = NULL;
  2678. }
  2679. // All attributes have been successfully transferred and nulled out at this
  2680. // point, so the call to ignore_token will free the memory for it without
  2681. // touching the attributes.
  2682. ignore_token(parser);
  2683. // The name attribute, if present, should be destroyed since it's ignored
  2684. // when copying over. The action attribute should be kept since it's moved
  2685. // to the form.
  2686. if (name_attr) {
  2687. gumbo_destroy_attribute(parser, name_attr);
  2688. }
  2689. GumboAttribute* name =
  2690. gumbo_parser_allocate(parser, sizeof(GumboAttribute));
  2691. GumboStringPiece name_str = GUMBO_STRING("name");
  2692. GumboStringPiece isindex_str = GUMBO_STRING("isindex");
  2693. name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
  2694. name->name = gumbo_copy_stringz(parser, "name");
  2695. name->value = gumbo_copy_stringz(parser, "isindex");
  2696. name->original_name = name_str;
  2697. name->original_value = isindex_str;
  2698. name->name_start = kGumboEmptySourcePosition;
  2699. name->name_end = kGumboEmptySourcePosition;
  2700. name->value_start = kGumboEmptySourcePosition;
  2701. name->value_end = kGumboEmptySourcePosition;
  2702. gumbo_vector_add(parser, name, &input->v.element.attributes);
  2703. pop_current_node(parser); // <input>
  2704. pop_current_node(parser); // <label>
  2705. insert_element_of_tag_type(
  2706. parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
  2707. pop_current_node(parser); // <hr>
  2708. pop_current_node(parser); // <form>
  2709. if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2710. parser->_parser_state->_form_element = NULL;
  2711. }
  2712. return false;
  2713. } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
  2714. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
  2715. parser->_parser_state->_ignore_next_linefeed = true;
  2716. set_frameset_not_ok(parser);
  2717. return true;
  2718. } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
  2719. bool result = maybe_implicitly_close_p_tag(parser, token);
  2720. reconstruct_active_formatting_elements(parser);
  2721. set_frameset_not_ok(parser);
  2722. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
  2723. return result;
  2724. } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
  2725. set_frameset_not_ok(parser);
  2726. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
  2727. return true;
  2728. } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
  2729. run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
  2730. return true;
  2731. } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
  2732. reconstruct_active_formatting_elements(parser);
  2733. insert_element_from_token(parser, token);
  2734. set_frameset_not_ok(parser);
  2735. GumboInsertionMode state = parser->_parser_state->_insertion_mode;
  2736. if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
  2737. state == GUMBO_INSERTION_MODE_IN_CAPTION ||
  2738. state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
  2739. state == GUMBO_INSERTION_MODE_IN_ROW ||
  2740. state == GUMBO_INSERTION_MODE_IN_CELL) {
  2741. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
  2742. } else {
  2743. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
  2744. }
  2745. return true;
  2746. } else if (tag_in(token, kStartTag,
  2747. (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) {
  2748. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
  2749. pop_current_node(parser);
  2750. }
  2751. reconstruct_active_formatting_elements(parser);
  2752. insert_element_from_token(parser, token);
  2753. return true;
  2754. } else if (tag_in(token, kStartTag,
  2755. (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) {
  2756. bool success = true;
  2757. GumboTag exception =
  2758. tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)})
  2759. ? GUMBO_TAG_RTC
  2760. : GUMBO_TAG_LAST;
  2761. if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
  2762. generate_implied_end_tags(parser, exception);
  2763. }
  2764. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
  2765. !(exception == GUMBO_TAG_LAST ||
  2766. node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
  2767. parser_add_parse_error(parser, token);
  2768. success = false;
  2769. }
  2770. insert_element_from_token(parser, token);
  2771. return success;
  2772. } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
  2773. parser_add_parse_error(parser, token);
  2774. reconstruct_active_formatting_elements(parser);
  2775. insert_element_of_tag_type(
  2776. parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
  2777. pop_current_node(parser);
  2778. return false;
  2779. } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
  2780. reconstruct_active_formatting_elements(parser);
  2781. adjust_mathml_attributes(parser, token);
  2782. adjust_foreign_attributes(parser, token);
  2783. insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
  2784. if (token->v.start_tag.is_self_closing) {
  2785. pop_current_node(parser);
  2786. acknowledge_self_closing_tag(parser);
  2787. }
  2788. return true;
  2789. } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
  2790. reconstruct_active_formatting_elements(parser);
  2791. adjust_svg_attributes(parser, token);
  2792. adjust_foreign_attributes(parser, token);
  2793. insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
  2794. if (token->v.start_tag.is_self_closing) {
  2795. pop_current_node(parser);
  2796. acknowledge_self_closing_tag(parser);
  2797. }
  2798. return true;
  2799. } else if (tag_in(token, kStartTag,
  2800. (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
  2801. TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT),
  2802. TAG(TH), TAG(THEAD), TAG(TR)})) {
  2803. parser_add_parse_error(parser, token);
  2804. ignore_token(parser);
  2805. return false;
  2806. } else if (token->type == GUMBO_TOKEN_START_TAG) {
  2807. reconstruct_active_formatting_elements(parser);
  2808. insert_element_from_token(parser, token);
  2809. return true;
  2810. } else {
  2811. assert(token->type == GUMBO_TOKEN_END_TAG);
  2812. GumboTag end_tag = token->v.end_tag;
  2813. assert(state->_open_elements.length > 0);
  2814. assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
  2815. // Walk up the stack of open elements until we find one that either:
  2816. // a) Matches the tag name we saw
  2817. // b) Is in the "special" category.
  2818. // If we see a), implicitly close everything up to and including it. If we
  2819. // see b), then record a parse error, don't close anything (except the
  2820. // implied end tags) and ignore the end tag token.
  2821. for (int i = state->_open_elements.length; --i >= 0;) {
  2822. const GumboNode* node = state->_open_elements.data[i];
  2823. if (node_html_tag_is(node, end_tag)) {
  2824. generate_implied_end_tags(parser, end_tag);
  2825. // TODO(jdtang): Do I need to add a parse error here? The condition in
  2826. // the spec seems like it's the inverse of the loop condition above, and
  2827. // so would never fire.
  2828. while (node != pop_current_node(parser))
  2829. ; // Pop everything.
  2830. return true;
  2831. } else if (is_special_node(node)) {
  2832. parser_add_parse_error(parser, token);
  2833. ignore_token(parser);
  2834. return false;
  2835. }
  2836. }
  2837. // <html> is in the special category, so we should never get here.
  2838. assert(0);
  2839. return false;
  2840. }
  2841. }
  2842. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
  2843. static bool handle_text(GumboParser* parser, GumboToken* token) {
  2844. if (token->type == GUMBO_TOKEN_CHARACTER ||
  2845. token->type == GUMBO_TOKEN_WHITESPACE) {
  2846. insert_text_token(parser, token);
  2847. } else {
  2848. // We provide only bare-bones script handling that doesn't involve any of
  2849. // the parser-pause/already-started/script-nesting flags or re-entrant
  2850. // invocations of the tokenizer. Because the intended usage of this library
  2851. // is mostly for templating, refactoring, and static-analysis libraries, we
  2852. // provide the script body as a text-node child of the <script> element.
  2853. // This behavior doesn't support document.write of partial HTML elements,
  2854. // but should be adequate for almost all other scripting support.
  2855. if (token->type == GUMBO_TOKEN_EOF) {
  2856. parser_add_parse_error(parser, token);
  2857. parser->_parser_state->_reprocess_current_token = true;
  2858. }
  2859. pop_current_node(parser);
  2860. set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
  2861. }
  2862. return true;
  2863. }
  2864. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
  2865. static bool handle_in_table(GumboParser* parser, GumboToken* token) {
  2866. GumboParserState* state = parser->_parser_state;
  2867. if (token->type == GUMBO_TOKEN_CHARACTER ||
  2868. token->type == GUMBO_TOKEN_WHITESPACE) {
  2869. // The "pending table character tokens" list described in the spec is
  2870. // nothing more than the TextNodeBufferState. We accumulate text tokens as
  2871. // normal, except that when we go to flush them in the handle_in_table_text,
  2872. // we set _foster_parent_insertions if there're non-whitespace characters in
  2873. // the buffer.
  2874. assert(state->_text_node._buffer.length == 0);
  2875. state->_original_insertion_mode = state->_insertion_mode;
  2876. state->_reprocess_current_token = true;
  2877. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
  2878. return true;
  2879. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  2880. parser_add_parse_error(parser, token);
  2881. ignore_token(parser);
  2882. return false;
  2883. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  2884. append_comment_node(parser, get_current_node(parser), token);
  2885. return true;
  2886. } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
  2887. clear_stack_to_table_context(parser);
  2888. add_formatting_element(parser, &kActiveFormattingScopeMarker);
  2889. insert_element_from_token(parser, token);
  2890. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
  2891. return true;
  2892. } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
  2893. clear_stack_to_table_context(parser);
  2894. insert_element_from_token(parser, token);
  2895. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
  2896. return true;
  2897. } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
  2898. clear_stack_to_table_context(parser);
  2899. insert_element_of_tag_type(
  2900. parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
  2901. parser->_parser_state->_reprocess_current_token = true;
  2902. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
  2903. return true;
  2904. } else if (tag_in(token, kStartTag,
  2905. (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD),
  2906. TAG(TH), TAG(TR)})) {
  2907. clear_stack_to_table_context(parser);
  2908. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
  2909. if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) {
  2910. insert_element_of_tag_type(
  2911. parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
  2912. state->_reprocess_current_token = true;
  2913. } else {
  2914. insert_element_from_token(parser, token);
  2915. }
  2916. return true;
  2917. } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
  2918. parser_add_parse_error(parser, token);
  2919. if (close_table(parser)) {
  2920. parser->_parser_state->_reprocess_current_token = true;
  2921. } else {
  2922. ignore_token(parser);
  2923. }
  2924. return false;
  2925. } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
  2926. if (!close_table(parser)) {
  2927. parser_add_parse_error(parser, token);
  2928. return false;
  2929. }
  2930. return true;
  2931. } else if (tag_in(token, kEndTag,
  2932. (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
  2933. TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT),
  2934. TAG(TH), TAG(THEAD), TAG(TR)})) {
  2935. parser_add_parse_error(parser, token);
  2936. ignore_token(parser);
  2937. return false;
  2938. } else if (tag_in(token, kStartTag,
  2939. (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) ||
  2940. (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
  2941. return handle_in_head(parser, token);
  2942. } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
  2943. attribute_matches(
  2944. &token->v.start_tag.attributes, "type", "hidden")) {
  2945. parser_add_parse_error(parser, token);
  2946. insert_element_from_token(parser, token);
  2947. pop_current_node(parser);
  2948. return false;
  2949. } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
  2950. parser_add_parse_error(parser, token);
  2951. if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  2952. ignore_token(parser);
  2953. return false;
  2954. }
  2955. state->_form_element = insert_element_from_token(parser, token);
  2956. pop_current_node(parser);
  2957. return false;
  2958. } else if (token->type == GUMBO_TOKEN_EOF) {
  2959. return handle_in_body(parser, token);
  2960. } else {
  2961. parser_add_parse_error(parser, token);
  2962. state->_foster_parent_insertions = true;
  2963. bool result = handle_in_body(parser, token);
  2964. state->_foster_parent_insertions = false;
  2965. return result;
  2966. }
  2967. }
  2968. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
  2969. static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
  2970. if (token->type == GUMBO_TOKEN_NULL) {
  2971. parser_add_parse_error(parser, token);
  2972. ignore_token(parser);
  2973. return false;
  2974. } else if (token->type == GUMBO_TOKEN_CHARACTER ||
  2975. token->type == GUMBO_TOKEN_WHITESPACE) {
  2976. insert_text_token(parser, token);
  2977. return true;
  2978. } else {
  2979. GumboParserState* state = parser->_parser_state;
  2980. GumboStringBuffer* buffer = &state->_text_node._buffer;
  2981. // Can't use strspn for this because GumboStringBuffers are not
  2982. // null-terminated.
  2983. // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
  2984. // of any one byte that is not whitespace means we flip the flag, so this
  2985. // loop is still valid.
  2986. for (unsigned int i = 0; i < buffer->length; ++i) {
  2987. if (!isspace((unsigned char) buffer->data[i]) ||
  2988. buffer->data[i] == '\v') {
  2989. state->_foster_parent_insertions = true;
  2990. reconstruct_active_formatting_elements(parser);
  2991. break;
  2992. }
  2993. }
  2994. maybe_flush_text_node_buffer(parser);
  2995. state->_foster_parent_insertions = false;
  2996. state->_reprocess_current_token = true;
  2997. state->_insertion_mode = state->_original_insertion_mode;
  2998. return true;
  2999. }
  3000. }
  3001. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
  3002. static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
  3003. if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
  3004. if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
  3005. parser_add_parse_error(parser, token);
  3006. ignore_token(parser);
  3007. return false;
  3008. } else {
  3009. generate_implied_end_tags(parser, GUMBO_TAG_LAST);
  3010. bool result = true;
  3011. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
  3012. parser_add_parse_error(parser, token);
  3013. }
  3014. while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
  3015. ;
  3016. clear_active_formatting_elements(parser);
  3017. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3018. return result;
  3019. }
  3020. } else if (tag_in(token, kStartTag,
  3021. (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
  3022. TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
  3023. TAG(TR)}) ||
  3024. (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
  3025. if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
  3026. parser_add_parse_error(parser, token);
  3027. ignore_token(parser);
  3028. return false;
  3029. }
  3030. while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
  3031. ;
  3032. clear_active_formatting_elements(parser);
  3033. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3034. parser->_parser_state->_reprocess_current_token = true;
  3035. return true;
  3036. } else if (tag_in(token, kEndTag,
  3037. (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML),
  3038. TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
  3039. TAG(TR)})) {
  3040. parser_add_parse_error(parser, token);
  3041. ignore_token(parser);
  3042. return false;
  3043. } else {
  3044. return handle_in_body(parser, token);
  3045. }
  3046. }
  3047. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
  3048. static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
  3049. if (token->type == GUMBO_TOKEN_WHITESPACE) {
  3050. insert_text_token(parser, token);
  3051. return true;
  3052. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  3053. parser_add_parse_error(parser, token);
  3054. ignore_token(parser);
  3055. return false;
  3056. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  3057. append_comment_node(parser, get_current_node(parser), token);
  3058. return true;
  3059. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3060. return handle_in_body(parser, token);
  3061. } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
  3062. insert_element_from_token(parser, token);
  3063. pop_current_node(parser);
  3064. acknowledge_self_closing_tag(parser);
  3065. return true;
  3066. } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
  3067. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
  3068. parser_add_parse_error(parser, token);
  3069. ignore_token(parser);
  3070. return false;
  3071. }
  3072. pop_current_node(parser);
  3073. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3074. return false;
  3075. } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
  3076. parser_add_parse_error(parser, token);
  3077. ignore_token(parser);
  3078. return false;
  3079. } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
  3080. tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
  3081. return handle_in_head(parser, token);
  3082. } else if (token->type == GUMBO_TOKEN_EOF) {
  3083. return handle_in_body(parser, token);
  3084. } else {
  3085. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
  3086. parser_add_parse_error(parser, token);
  3087. ignore_token(parser);
  3088. return false;
  3089. }
  3090. pop_current_node(parser);
  3091. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3092. parser->_parser_state->_reprocess_current_token = true;
  3093. return true;
  3094. }
  3095. }
  3096. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
  3097. static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
  3098. if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
  3099. clear_stack_to_table_body_context(parser);
  3100. insert_element_from_token(parser, token);
  3101. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
  3102. return true;
  3103. } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
  3104. parser_add_parse_error(parser, token);
  3105. clear_stack_to_table_body_context(parser);
  3106. insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
  3107. parser->_parser_state->_reprocess_current_token = true;
  3108. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
  3109. return false;
  3110. } else if (tag_in(token, kEndTag,
  3111. (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
  3112. if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
  3113. parser_add_parse_error(parser, token);
  3114. ignore_token(parser);
  3115. return false;
  3116. }
  3117. clear_stack_to_table_body_context(parser);
  3118. pop_current_node(parser);
  3119. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3120. return true;
  3121. } else if (tag_in(token, kStartTag,
  3122. (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
  3123. TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) ||
  3124. tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
  3125. if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
  3126. has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
  3127. has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
  3128. parser_add_parse_error(parser, token);
  3129. ignore_token(parser);
  3130. return false;
  3131. }
  3132. clear_stack_to_table_body_context(parser);
  3133. pop_current_node(parser);
  3134. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3135. parser->_parser_state->_reprocess_current_token = true;
  3136. return true;
  3137. } else if (tag_in(token, kEndTag,
  3138. (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR),
  3139. TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
  3140. parser_add_parse_error(parser, token);
  3141. ignore_token(parser);
  3142. return false;
  3143. } else {
  3144. return handle_in_table(parser, token);
  3145. }
  3146. }
  3147. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
  3148. static bool handle_in_row(GumboParser* parser, GumboToken* token) {
  3149. if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) {
  3150. clear_stack_to_table_row_context(parser);
  3151. insert_element_from_token(parser, token);
  3152. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
  3153. add_formatting_element(parser, &kActiveFormattingScopeMarker);
  3154. return true;
  3155. } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
  3156. if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
  3157. parser_add_parse_error(parser, token);
  3158. ignore_token(parser);
  3159. return false;
  3160. } else {
  3161. clear_stack_to_table_row_context(parser);
  3162. pop_current_node(parser);
  3163. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
  3164. return true;
  3165. }
  3166. } else if (tag_in(token, kStartTag,
  3167. (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
  3168. TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) ||
  3169. tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
  3170. if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
  3171. parser_add_parse_error(parser, token);
  3172. ignore_token(parser);
  3173. return false;
  3174. } else {
  3175. clear_stack_to_table_row_context(parser);
  3176. pop_current_node(parser);
  3177. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
  3178. parser->_parser_state->_reprocess_current_token = true;
  3179. return true;
  3180. }
  3181. } else if (tag_in(token, kEndTag,
  3182. (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
  3183. if (!has_an_element_in_table_scope(parser, token->v.end_tag) ||
  3184. (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) {
  3185. parser_add_parse_error(parser, token);
  3186. ignore_token(parser);
  3187. return false;
  3188. } else {
  3189. clear_stack_to_table_row_context(parser);
  3190. pop_current_node(parser);
  3191. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
  3192. parser->_parser_state->_reprocess_current_token = true;
  3193. return true;
  3194. }
  3195. } else if (tag_in(token, kEndTag,
  3196. (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
  3197. TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
  3198. parser_add_parse_error(parser, token);
  3199. ignore_token(parser);
  3200. return false;
  3201. } else {
  3202. return handle_in_table(parser, token);
  3203. }
  3204. }
  3205. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
  3206. static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
  3207. if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
  3208. GumboTag token_tag = token->v.end_tag;
  3209. if (!has_an_element_in_table_scope(parser, token_tag)) {
  3210. parser_add_parse_error(parser, token);
  3211. ignore_token(parser);
  3212. return false;
  3213. }
  3214. return close_table_cell(parser, token, token_tag);
  3215. } else if (tag_in(token, kStartTag,
  3216. (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
  3217. TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
  3218. TAG(TR)})) {
  3219. gumbo_debug("Handling <td> in cell.\n");
  3220. if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
  3221. !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
  3222. gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
  3223. parser_add_parse_error(parser, token);
  3224. ignore_token(parser);
  3225. return false;
  3226. }
  3227. parser->_parser_state->_reprocess_current_token = true;
  3228. return close_current_cell(parser, token);
  3229. } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION),
  3230. TAG(COL), TAG(COLGROUP), TAG(HTML)})) {
  3231. parser_add_parse_error(parser, token);
  3232. ignore_token(parser);
  3233. return false;
  3234. } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
  3235. TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
  3236. if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
  3237. parser_add_parse_error(parser, token);
  3238. ignore_token(parser);
  3239. return false;
  3240. }
  3241. parser->_parser_state->_reprocess_current_token = true;
  3242. return close_current_cell(parser, token);
  3243. } else {
  3244. return handle_in_body(parser, token);
  3245. }
  3246. }
  3247. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
  3248. static bool handle_in_select(GumboParser* parser, GumboToken* token) {
  3249. if (token->type == GUMBO_TOKEN_NULL) {
  3250. parser_add_parse_error(parser, token);
  3251. ignore_token(parser);
  3252. return false;
  3253. } else if (token->type == GUMBO_TOKEN_CHARACTER ||
  3254. token->type == GUMBO_TOKEN_WHITESPACE) {
  3255. insert_text_token(parser, token);
  3256. return true;
  3257. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  3258. parser_add_parse_error(parser, token);
  3259. ignore_token(parser);
  3260. return false;
  3261. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  3262. append_comment_node(parser, get_current_node(parser), token);
  3263. return true;
  3264. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3265. return handle_in_body(parser, token);
  3266. } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
  3267. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
  3268. pop_current_node(parser);
  3269. }
  3270. insert_element_from_token(parser, token);
  3271. return true;
  3272. } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
  3273. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
  3274. pop_current_node(parser);
  3275. }
  3276. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
  3277. pop_current_node(parser);
  3278. }
  3279. insert_element_from_token(parser, token);
  3280. return true;
  3281. } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
  3282. GumboVector* open_elements = &parser->_parser_state->_open_elements;
  3283. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
  3284. node_html_tag_is(open_elements->data[open_elements->length - 2],
  3285. GUMBO_TAG_OPTGROUP)) {
  3286. pop_current_node(parser);
  3287. }
  3288. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
  3289. pop_current_node(parser);
  3290. return true;
  3291. } else {
  3292. parser_add_parse_error(parser, token);
  3293. ignore_token(parser);
  3294. return false;
  3295. }
  3296. } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
  3297. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
  3298. pop_current_node(parser);
  3299. return true;
  3300. } else {
  3301. parser_add_parse_error(parser, token);
  3302. ignore_token(parser);
  3303. return false;
  3304. }
  3305. } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
  3306. if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
  3307. parser_add_parse_error(parser, token);
  3308. ignore_token(parser);
  3309. return false;
  3310. }
  3311. close_current_select(parser);
  3312. return true;
  3313. } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
  3314. parser_add_parse_error(parser, token);
  3315. ignore_token(parser);
  3316. if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
  3317. close_current_select(parser);
  3318. }
  3319. return false;
  3320. } else if (tag_in(token, kStartTag,
  3321. (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) {
  3322. parser_add_parse_error(parser, token);
  3323. if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
  3324. ignore_token(parser);
  3325. } else {
  3326. close_current_select(parser);
  3327. parser->_parser_state->_reprocess_current_token = true;
  3328. }
  3329. return false;
  3330. } else if (tag_in(token, kStartTag,
  3331. (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) ||
  3332. tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
  3333. return handle_in_head(parser, token);
  3334. } else if (token->type == GUMBO_TOKEN_EOF) {
  3335. return handle_in_body(parser, token);
  3336. } else {
  3337. parser_add_parse_error(parser, token);
  3338. ignore_token(parser);
  3339. return false;
  3340. }
  3341. }
  3342. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
  3343. static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
  3344. if (tag_in(token, kStartTag,
  3345. (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
  3346. TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
  3347. parser_add_parse_error(parser, token);
  3348. close_current_select(parser);
  3349. parser->_parser_state->_reprocess_current_token = true;
  3350. return false;
  3351. } else if (tag_in(token, kEndTag,
  3352. (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY),
  3353. TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
  3354. parser_add_parse_error(parser, token);
  3355. if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
  3356. ignore_token(parser);
  3357. return false;
  3358. } else {
  3359. close_current_select(parser);
  3360. // close_current_select already does the
  3361. // reset_insertion_mode_appropriately
  3362. // reset_insertion_mode_appropriately(parser);
  3363. parser->_parser_state->_reprocess_current_token = true;
  3364. return false;
  3365. }
  3366. } else {
  3367. return handle_in_select(parser, token);
  3368. }
  3369. }
  3370. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
  3371. static bool handle_in_template(GumboParser* parser, GumboToken* token) {
  3372. GumboParserState* state = parser->_parser_state;
  3373. if (token->type == GUMBO_TOKEN_WHITESPACE ||
  3374. token->type == GUMBO_TOKEN_CHARACTER ||
  3375. token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL ||
  3376. token->type == GUMBO_TOKEN_DOCTYPE) {
  3377. return handle_in_body(parser, token);
  3378. } else if (tag_in(token, kStartTag,
  3379. (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
  3380. TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
  3381. TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
  3382. tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
  3383. return handle_in_head(parser, token);
  3384. } else if (tag_in(
  3385. token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP),
  3386. TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
  3387. pop_template_insertion_mode(parser);
  3388. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3389. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
  3390. state->_reprocess_current_token = true;
  3391. return true;
  3392. } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
  3393. pop_template_insertion_mode(parser);
  3394. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
  3395. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
  3396. state->_reprocess_current_token = true;
  3397. return true;
  3398. } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
  3399. pop_template_insertion_mode(parser);
  3400. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
  3401. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
  3402. state->_reprocess_current_token = true;
  3403. return true;
  3404. } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
  3405. pop_template_insertion_mode(parser);
  3406. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
  3407. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
  3408. state->_reprocess_current_token = true;
  3409. return true;
  3410. } else if (token->type == GUMBO_TOKEN_START_TAG) {
  3411. pop_template_insertion_mode(parser);
  3412. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  3413. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  3414. state->_reprocess_current_token = true;
  3415. return true;
  3416. } else if (token->type == GUMBO_TOKEN_END_TAG) {
  3417. parser_add_parse_error(parser, token);
  3418. ignore_token(parser);
  3419. return false;
  3420. } else if (token->type == GUMBO_TOKEN_EOF) {
  3421. if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
  3422. // Stop parsing.
  3423. return true;
  3424. }
  3425. parser_add_parse_error(parser, token);
  3426. while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
  3427. ;
  3428. clear_active_formatting_elements(parser);
  3429. pop_template_insertion_mode(parser);
  3430. reset_insertion_mode_appropriately(parser);
  3431. state->_reprocess_current_token = true;
  3432. return false;
  3433. } else {
  3434. assert(0);
  3435. return false;
  3436. }
  3437. }
  3438. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
  3439. static bool handle_after_body(GumboParser* parser, GumboToken* token) {
  3440. if (token->type == GUMBO_TOKEN_WHITESPACE ||
  3441. tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3442. return handle_in_body(parser, token);
  3443. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  3444. GumboNode* html_node = parser->_output->root;
  3445. assert(html_node != NULL);
  3446. append_comment_node(parser, html_node, token);
  3447. return true;
  3448. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  3449. parser_add_parse_error(parser, token);
  3450. ignore_token(parser);
  3451. return false;
  3452. } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
  3453. /* fragment case: ignore the closing HTML token */
  3454. if (is_fragment_parser(parser)) {
  3455. parser_add_parse_error(parser, token);
  3456. ignore_token(parser);
  3457. return false;
  3458. }
  3459. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
  3460. GumboNode* html = parser->_parser_state->_open_elements.data[0];
  3461. assert(node_html_tag_is(html, GUMBO_TAG_HTML));
  3462. record_end_of_element(
  3463. parser->_parser_state->_current_token, &html->v.element);
  3464. return true;
  3465. } else if (token->type == GUMBO_TOKEN_EOF) {
  3466. return true;
  3467. } else {
  3468. parser_add_parse_error(parser, token);
  3469. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  3470. parser->_parser_state->_reprocess_current_token = true;
  3471. return false;
  3472. }
  3473. }
  3474. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
  3475. static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
  3476. if (token->type == GUMBO_TOKEN_WHITESPACE) {
  3477. insert_text_token(parser, token);
  3478. return true;
  3479. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  3480. append_comment_node(parser, get_current_node(parser), token);
  3481. return true;
  3482. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  3483. parser_add_parse_error(parser, token);
  3484. ignore_token(parser);
  3485. return false;
  3486. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3487. return handle_in_body(parser, token);
  3488. } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
  3489. insert_element_from_token(parser, token);
  3490. return true;
  3491. } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
  3492. if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
  3493. parser_add_parse_error(parser, token);
  3494. ignore_token(parser);
  3495. return false;
  3496. }
  3497. pop_current_node(parser);
  3498. if (!is_fragment_parser(parser) &&
  3499. !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
  3500. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
  3501. }
  3502. return true;
  3503. } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
  3504. insert_element_from_token(parser, token);
  3505. pop_current_node(parser);
  3506. acknowledge_self_closing_tag(parser);
  3507. return true;
  3508. } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
  3509. return handle_in_head(parser, token);
  3510. } else if (token->type == GUMBO_TOKEN_EOF) {
  3511. if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
  3512. parser_add_parse_error(parser, token);
  3513. return false;
  3514. }
  3515. return true;
  3516. } else {
  3517. parser_add_parse_error(parser, token);
  3518. ignore_token(parser);
  3519. return false;
  3520. }
  3521. }
  3522. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
  3523. static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
  3524. if (token->type == GUMBO_TOKEN_WHITESPACE) {
  3525. insert_text_token(parser, token);
  3526. return true;
  3527. } else if (token->type == GUMBO_TOKEN_COMMENT) {
  3528. append_comment_node(parser, get_current_node(parser), token);
  3529. return true;
  3530. } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
  3531. parser_add_parse_error(parser, token);
  3532. ignore_token(parser);
  3533. return false;
  3534. } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3535. return handle_in_body(parser, token);
  3536. } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
  3537. GumboNode* html = parser->_parser_state->_open_elements.data[0];
  3538. assert(node_html_tag_is(html, GUMBO_TAG_HTML));
  3539. record_end_of_element(
  3540. parser->_parser_state->_current_token, &html->v.element);
  3541. set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
  3542. return true;
  3543. } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
  3544. return handle_in_head(parser, token);
  3545. } else if (token->type == GUMBO_TOKEN_EOF) {
  3546. return true;
  3547. } else {
  3548. parser_add_parse_error(parser, token);
  3549. ignore_token(parser);
  3550. return false;
  3551. }
  3552. }
  3553. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
  3554. static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
  3555. if (token->type == GUMBO_TOKEN_COMMENT) {
  3556. append_comment_node(parser, get_document_node(parser), token);
  3557. return true;
  3558. } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
  3559. token->type == GUMBO_TOKEN_WHITESPACE ||
  3560. tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3561. return handle_in_body(parser, token);
  3562. } else if (token->type == GUMBO_TOKEN_EOF) {
  3563. return true;
  3564. } else {
  3565. parser_add_parse_error(parser, token);
  3566. set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
  3567. parser->_parser_state->_reprocess_current_token = true;
  3568. return false;
  3569. }
  3570. }
  3571. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
  3572. static bool handle_after_after_frameset(
  3573. GumboParser* parser, GumboToken* token) {
  3574. if (token->type == GUMBO_TOKEN_COMMENT) {
  3575. append_comment_node(parser, get_document_node(parser), token);
  3576. return true;
  3577. } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
  3578. token->type == GUMBO_TOKEN_WHITESPACE ||
  3579. tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
  3580. return handle_in_body(parser, token);
  3581. } else if (token->type == GUMBO_TOKEN_EOF) {
  3582. return true;
  3583. } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
  3584. return handle_in_head(parser, token);
  3585. } else {
  3586. parser_add_parse_error(parser, token);
  3587. ignore_token(parser);
  3588. return false;
  3589. }
  3590. }
  3591. // Function pointers for each insertion mode. Keep in sync with
  3592. // insertion_mode.h.
  3593. typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
  3594. static const TokenHandler kTokenHandlers[] = {handle_initial,
  3595. handle_before_html, handle_before_head, handle_in_head,
  3596. handle_in_head_noscript, handle_after_head, handle_in_body, handle_text,
  3597. handle_in_table, handle_in_table_text, handle_in_caption,
  3598. handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell,
  3599. handle_in_select, handle_in_select_in_table, handle_in_template,
  3600. handle_after_body, handle_in_frameset, handle_after_frameset,
  3601. handle_after_after_body, handle_after_after_frameset};
  3602. static bool handle_html_content(GumboParser* parser, GumboToken* token) {
  3603. return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](
  3604. parser, token);
  3605. }
  3606. // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
  3607. static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
  3608. gumbo_debug("Handling foreign content");
  3609. switch (token->type) {
  3610. case GUMBO_TOKEN_NULL:
  3611. parser_add_parse_error(parser, token);
  3612. token->v.character = kUtf8ReplacementChar;
  3613. insert_text_token(parser, token);
  3614. return false;
  3615. case GUMBO_TOKEN_WHITESPACE:
  3616. insert_text_token(parser, token);
  3617. return true;
  3618. case GUMBO_TOKEN_CDATA:
  3619. case GUMBO_TOKEN_CHARACTER:
  3620. insert_text_token(parser, token);
  3621. set_frameset_not_ok(parser);
  3622. return true;
  3623. case GUMBO_TOKEN_COMMENT:
  3624. append_comment_node(parser, get_current_node(parser), token);
  3625. return true;
  3626. case GUMBO_TOKEN_DOCTYPE:
  3627. parser_add_parse_error(parser, token);
  3628. ignore_token(parser);
  3629. return false;
  3630. default:
  3631. // Fall through to the if-statements below.
  3632. break;
  3633. }
  3634. // Order matters for these clauses.
  3635. if (tag_in(token, kStartTag,
  3636. (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
  3637. TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT),
  3638. TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5),
  3639. TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI),
  3640. TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P),
  3641. TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG),
  3642. TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U),
  3643. TAG(UL), TAG(VAR)}) ||
  3644. (tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
  3645. (token_has_attribute(token, "color") ||
  3646. token_has_attribute(token, "face") ||
  3647. token_has_attribute(token, "size")))) {
  3648. /* Parse error */
  3649. parser_add_parse_error(parser, token);
  3650. /*
  3651. * Fragment case: If the parser was originally created for the HTML
  3652. * fragment parsing algorithm, then act as described in the "any other
  3653. * start tag" entry below.
  3654. */
  3655. if (!is_fragment_parser(parser)) {
  3656. do {
  3657. pop_current_node(parser);
  3658. } while (!(is_mathml_integration_point(get_current_node(parser)) ||
  3659. is_html_integration_point(get_current_node(parser)) ||
  3660. get_current_node(parser)->v.element.tag_namespace ==
  3661. GUMBO_NAMESPACE_HTML));
  3662. parser->_parser_state->_reprocess_current_token = true;
  3663. return false;
  3664. }
  3665. assert(token->type == GUMBO_TOKEN_START_TAG);
  3666. }
  3667. if (token->type == GUMBO_TOKEN_START_TAG) {
  3668. const GumboNamespaceEnum current_namespace =
  3669. get_adjusted_current_node(parser)->v.element.tag_namespace;
  3670. if (current_namespace == GUMBO_NAMESPACE_MATHML) {
  3671. adjust_mathml_attributes(parser, token);
  3672. }
  3673. if (current_namespace == GUMBO_NAMESPACE_SVG) {
  3674. // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
  3675. // function.
  3676. adjust_svg_attributes(parser, token);
  3677. }
  3678. adjust_foreign_attributes(parser, token);
  3679. insert_foreign_element(parser, token, current_namespace);
  3680. if (token->v.start_tag.is_self_closing) {
  3681. pop_current_node(parser);
  3682. acknowledge_self_closing_tag(parser);
  3683. }
  3684. return true;
  3685. // </script> tags are handled like any other end tag, putting the script's
  3686. // text into a text node child and closing the current node.
  3687. } else {
  3688. assert(token->type == GUMBO_TOKEN_END_TAG);
  3689. GumboNode* node = get_current_node(parser);
  3690. assert(node != NULL);
  3691. GumboStringPiece token_tagname = token->original_text;
  3692. GumboStringPiece node_tagname = node->v.element.original_tag;
  3693. gumbo_tag_from_original_text(&token_tagname);
  3694. gumbo_tag_from_original_text(&node_tagname);
  3695. bool is_success = true;
  3696. if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
  3697. parser_add_parse_error(parser, token);
  3698. is_success = false;
  3699. }
  3700. int i = parser->_parser_state->_open_elements.length;
  3701. for (--i; i > 0;) {
  3702. // Here we move up the stack until we find an HTML element (in which
  3703. // case we do nothing) or we find the element that we're about to
  3704. // close (in which case we pop everything we've seen until that
  3705. // point.)
  3706. gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
  3707. node_tagname.data, i);
  3708. if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
  3709. gumbo_debug("Matches.\n");
  3710. while (pop_current_node(parser) != node) {
  3711. // Pop all the nodes below the current one. Node is guaranteed to
  3712. // be an element on the stack of open elements (set below), so
  3713. // this loop is guaranteed to terminate.
  3714. }
  3715. return is_success;
  3716. }
  3717. --i;
  3718. node = parser->_parser_state->_open_elements.data[i];
  3719. if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
  3720. // Must break before gumbo_tag_from_original_text to avoid passing
  3721. // parser-inserted nodes through.
  3722. break;
  3723. }
  3724. node_tagname = node->v.element.original_tag;
  3725. gumbo_tag_from_original_text(&node_tagname);
  3726. }
  3727. assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
  3728. // We can't call handle_token directly because the current node is still in
  3729. // the SVG namespace, so it would re-enter this and result in infinite
  3730. // recursion.
  3731. return handle_html_content(parser, token) && is_success;
  3732. }
  3733. }
  3734. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
  3735. static bool handle_token(GumboParser* parser, GumboToken* token) {
  3736. if (parser->_parser_state->_ignore_next_linefeed &&
  3737. token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
  3738. parser->_parser_state->_ignore_next_linefeed = false;
  3739. ignore_token(parser);
  3740. return true;
  3741. }
  3742. // This needs to be reset both here and in the conditional above to catch both
  3743. // the case where the next token is not whitespace (so we don't ignore
  3744. // whitespace in the middle of <pre> tags) and where there are multiple
  3745. // whitespace tokens (so we don't ignore the second one).
  3746. parser->_parser_state->_ignore_next_linefeed = false;
  3747. if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
  3748. parser->_parser_state->_closed_body_tag = true;
  3749. }
  3750. if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
  3751. parser->_parser_state->_closed_html_tag = true;
  3752. }
  3753. const GumboNode* current_node = get_adjusted_current_node(parser);
  3754. assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT ||
  3755. current_node->type == GUMBO_NODE_TEMPLATE);
  3756. if (current_node) {
  3757. gumbo_debug("Current node: <%s>.\n",
  3758. gumbo_normalized_tagname(current_node->v.element.tag));
  3759. }
  3760. if (!current_node ||
  3761. current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
  3762. (is_mathml_integration_point(current_node) &&
  3763. (token->type == GUMBO_TOKEN_CHARACTER ||
  3764. token->type == GUMBO_TOKEN_WHITESPACE ||
  3765. token->type == GUMBO_TOKEN_NULL ||
  3766. (token->type == GUMBO_TOKEN_START_TAG &&
  3767. !tag_in(token, kStartTag,
  3768. (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
  3769. (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
  3770. node_qualified_tag_is(
  3771. current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
  3772. tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
  3773. (is_html_integration_point(current_node) &&
  3774. (token->type == GUMBO_TOKEN_START_TAG ||
  3775. token->type == GUMBO_TOKEN_CHARACTER ||
  3776. token->type == GUMBO_TOKEN_NULL ||
  3777. token->type == GUMBO_TOKEN_WHITESPACE)) ||
  3778. token->type == GUMBO_TOKEN_EOF) {
  3779. return handle_html_content(parser, token);
  3780. } else {
  3781. return handle_in_foreign_content(parser, token);
  3782. }
  3783. }
  3784. static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx,
  3785. GumboNamespaceEnum fragment_namespace) {
  3786. GumboNode* root;
  3787. assert(fragment_ctx != GUMBO_TAG_LAST);
  3788. // 3
  3789. parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx);
  3790. parser->_parser_state->_fragment_ctx->v.element.tag_namespace =
  3791. fragment_namespace;
  3792. // 4
  3793. if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
  3794. // Non-HTML namespaces always start in the DATA state.
  3795. switch (fragment_ctx) {
  3796. case GUMBO_TAG_TITLE:
  3797. case GUMBO_TAG_TEXTAREA:
  3798. gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
  3799. break;
  3800. case GUMBO_TAG_STYLE:
  3801. case GUMBO_TAG_XMP:
  3802. case GUMBO_TAG_IFRAME:
  3803. case GUMBO_TAG_NOEMBED:
  3804. case GUMBO_TAG_NOFRAMES:
  3805. gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
  3806. break;
  3807. case GUMBO_TAG_SCRIPT:
  3808. gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
  3809. break;
  3810. case GUMBO_TAG_NOSCRIPT:
  3811. /* scripting is disabled in Gumbo, so leave the tokenizer
  3812. * in the default data state */
  3813. break;
  3814. case GUMBO_TAG_PLAINTEXT:
  3815. gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
  3816. break;
  3817. default:
  3818. /* default data state */
  3819. break;
  3820. }
  3821. }
  3822. // 5. 6. 7.
  3823. root = insert_element_of_tag_type(
  3824. parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
  3825. parser->_output->root = root;
  3826. // 8.
  3827. if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
  3828. push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
  3829. }
  3830. // 10.
  3831. reset_insertion_mode_appropriately(parser);
  3832. }
  3833. GumboOutput* gumbo_parse(const char* buffer) {
  3834. return gumbo_parse_with_options(
  3835. &kGumboDefaultOptions, buffer, strlen(buffer));
  3836. }
  3837. GumboOutput* gumbo_parse_with_options(
  3838. const GumboOptions* options, const char* buffer, size_t length) {
  3839. GumboParser parser;
  3840. parser._options = options;
  3841. output_init(&parser);
  3842. gumbo_tokenizer_state_init(&parser, buffer, length);
  3843. parser_state_init(&parser);
  3844. if (options->fragment_context != GUMBO_TAG_LAST) {
  3845. fragment_parser_init(
  3846. &parser, options->fragment_context, options->fragment_namespace);
  3847. }
  3848. GumboParserState* state = parser._parser_state;
  3849. gumbo_debug("Parsing %.*s.\n", length, buffer);
  3850. // Sanity check so that infinite loops die with an assertion failure instead
  3851. // of hanging the process before we ever get an error.
  3852. int loop_count = 0;
  3853. GumboToken token;
  3854. bool has_error = false;
  3855. do {
  3856. if (state->_reprocess_current_token) {
  3857. state->_reprocess_current_token = false;
  3858. } else {
  3859. GumboNode* current_node = get_current_node(&parser);
  3860. gumbo_tokenizer_set_is_current_node_foreign(&parser,
  3861. current_node &&
  3862. current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
  3863. has_error = !gumbo_lex(&parser, &token) || has_error;
  3864. }
  3865. const char* token_type = "text";
  3866. switch (token.type) {
  3867. case GUMBO_TOKEN_DOCTYPE:
  3868. token_type = "doctype";
  3869. break;
  3870. case GUMBO_TOKEN_START_TAG:
  3871. token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
  3872. break;
  3873. case GUMBO_TOKEN_END_TAG:
  3874. token_type = gumbo_normalized_tagname(token.v.end_tag);
  3875. break;
  3876. case GUMBO_TOKEN_COMMENT:
  3877. token_type = "comment";
  3878. break;
  3879. default:
  3880. break;
  3881. }
  3882. gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type,
  3883. token.position.line, token.position.column, state->_insertion_mode);
  3884. state->_current_token = &token;
  3885. state->_self_closing_flag_acknowledged =
  3886. !(token.type == GUMBO_TOKEN_START_TAG &&
  3887. token.v.start_tag.is_self_closing);
  3888. has_error = !handle_token(&parser, &token) || has_error;
  3889. // Check for memory leaks when ownership is transferred from start tag
  3890. // tokens to nodes.
  3891. assert(state->_reprocess_current_token ||
  3892. token.type != GUMBO_TOKEN_START_TAG ||
  3893. token.v.start_tag.attributes.data == NULL);
  3894. if (!state->_self_closing_flag_acknowledged) {
  3895. GumboError* error = parser_add_parse_error(&parser, &token);
  3896. if (error) {
  3897. error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
  3898. }
  3899. }
  3900. ++loop_count;
  3901. assert(loop_count < 1000000000);
  3902. } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
  3903. !(options->stop_on_first_error && has_error));
  3904. finish_parsing(&parser);
  3905. // For API uniformity reasons, if the doctype still has nulls, convert them to
  3906. // empty strings.
  3907. GumboDocument* doc_type = &parser._output->document->v.document;
  3908. if (doc_type->name == NULL) {
  3909. doc_type->name = gumbo_copy_stringz(&parser, "");
  3910. }
  3911. if (doc_type->public_identifier == NULL) {
  3912. doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
  3913. }
  3914. if (doc_type->system_identifier == NULL) {
  3915. doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
  3916. }
  3917. parser_state_destroy(&parser);
  3918. gumbo_tokenizer_state_destroy(&parser);
  3919. return parser._output;
  3920. }
  3921. void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
  3922. // Need a dummy GumboParser because the allocator comes along with the
  3923. // options object.
  3924. GumboParser parser;
  3925. parser._options = options;
  3926. destroy_node(&parser, node);
  3927. }
  3928. void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
  3929. // Need a dummy GumboParser because the allocator comes along with the
  3930. // options object.
  3931. GumboParser parser;
  3932. parser._options = options;
  3933. destroy_node(&parser, output->document);
  3934. for (unsigned int i = 0; i < output->errors.length; ++i) {
  3935. gumbo_error_destroy(&parser, output->errors.data[i]);
  3936. }
  3937. gumbo_vector_destroy(&parser, &output->errors);
  3938. gumbo_parser_deallocate(&parser, output);
  3939. }