gumbo.h 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671
  1. // Copyright 2010 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: [email protected] (Jonathan Tang)
  16. //
  17. // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
  18. // GUMBO_ as a prefix for enum constants (static constants get the Google-style
  19. // kGumbo prefix).
  20. /**
  21. * @file
  22. * @mainpage Gumbo HTML Parser
  23. *
  24. * This provides a conformant, no-dependencies implementation of the HTML5
  25. * parsing algorithm. It supports only UTF8; if you need to parse a different
  26. * encoding, run a preprocessing step to convert to UTF8. It returns a parse
  27. * tree made of the structs in this file.
  28. *
  29. * Example:
  30. * @code
  31. * GumboOutput* output = gumbo_parse(input);
  32. * do_something_with_doctype(output->document);
  33. * do_something_with_html_tree(output->root);
  34. * gumbo_destroy_output(&kGumboDefaultOptions, output);
  35. * @endcode
  36. * HTML5 Spec:
  37. *
  38. * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
  39. */
  40. #ifndef GUMBO_GUMBO_H_
  41. #define GUMBO_GUMBO_H_
  42. #ifdef _MSC_VER
  43. #define _CRT_SECURE_NO_WARNINGS
  44. #define fileno _fileno
  45. #endif
  46. #include <stdbool.h>
  47. #include <stddef.h>
  48. #ifdef __cplusplus
  49. extern "C" {
  50. #endif
  51. /**
  52. * A struct representing a character position within the original text buffer.
  53. * Line and column numbers are 1-based and offsets are 0-based, which matches
  54. * how most editors and command-line tools work. Also, columns measure
  55. * positions in terms of characters while offsets measure by bytes; this is
  56. * because the offset field is often used to pull out a particular region of
  57. * text (which in most languages that bind to C implies pointer arithmetic on a
  58. * buffer of bytes), while the column field is often used to reference a
  59. * particular column on a printable display, which nowadays is usually UTF-8.
  60. */
  61. typedef struct {
  62. unsigned int line; /**< 1-based line number. */
  63. unsigned int column; /**< 1-based column, measured in characters. */
  64. unsigned int offset; /**< 0-based offset into the original text buffer, measured in bytes. */
  65. } GumboSourcePosition;
  66. /**
  67. * A SourcePosition used for elements that have no source position, i.e.
  68. * parser-inserted elements.
  69. */
  70. extern const GumboSourcePosition kGumboEmptySourcePosition;
  71. /**
  72. * A struct representing a string or part of a string. Strings within the
  73. * parser are represented by a char* and a length; the char* points into
  74. * an existing data buffer owned by some other code (often the original input).
  75. * GumboStringPieces are assumed (by convention) to be immutable, because they
  76. * may share data. Use GumboStringBuffer if you need to construct a string.
  77. * Clients should assume that it is not NUL-terminated, and should always use
  78. * explicit lengths when manipulating them.
  79. */
  80. typedef struct {
  81. /** A pointer to the beginning of the string. NULL iff length == 0. */
  82. const char* data;
  83. /** The length of the string fragment, in bytes. May be zero. */
  84. size_t length;
  85. } GumboStringPiece;
  86. /** A constant to represent a 0-length null string. */
  87. extern const GumboStringPiece kGumboEmptyString;
  88. /**
  89. * Compares two GumboStringPieces, and returns true if they're equal or false
  90. * otherwise.
  91. */
  92. bool gumbo_string_equals(
  93. const GumboStringPiece* str1, const GumboStringPiece* str2);
  94. /**
  95. * Compares two GumboStringPieces ignoring case, and returns true if they're
  96. * equal or false otherwise.
  97. */
  98. bool gumbo_string_equals_ignore_case(
  99. const GumboStringPiece* str1, const GumboStringPiece* str2);
  100. /**
  101. * A simple vector implementation. This stores a pointer to a data array and a
  102. * length. All elements are stored as void*; client code must cast to the
  103. * appropriate type. Overflows upon addition result in reallocation of the data
  104. * array, with the size doubling to maintain O(1) amortized cost. There is no
  105. * removal function, as this isn't needed for any of the operations within this
  106. * library. Iteration can be done through inspecting the structure directly in
  107. * a for-loop.
  108. */
  109. typedef struct {
  110. /** Data elements. This points to a dynamically-allocated array of capacity
  111. * elements, each a void* to the element itself.
  112. */
  113. void** data;
  114. /** Number of elements currently in the vector. */
  115. unsigned int length;
  116. /** Current array capacity. */
  117. unsigned int capacity;
  118. } GumboVector;
  119. /** An empty (0-length, 0-capacity) GumboVector. */
  120. extern const GumboVector kGumboEmptyVector;
  121. /**
  122. * Returns the first index at which an element appears in this vector (testing
  123. * by pointer equality), or -1 if it never does.
  124. */
  125. int gumbo_vector_index_of(GumboVector* vector, const void* element);
  126. /**
  127. * An enum for all the tags defined in the HTML5 standard. These correspond to
  128. * the tag names themselves. Enum constants exist only for tags which appear in
  129. * the spec itself (or for tags with special handling in the SVG and MathML
  130. * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
  131. * name can be obtained through original_tag.
  132. *
  133. * This is mostly for API convenience, so that clients of this library don't
  134. * need to perform a strcasecmp to find the normalized tag name. It also has
  135. * efficiency benefits, by letting the parser work with enums instead of
  136. * strings.
  137. */
  138. typedef enum {
  139. // Load all the tags from an external source, generated from tag.in.
  140. #include "tag_enum.h"
  141. // Used for all tags that don't have special handling in HTML. Add new tags
  142. // to the end of tag.in so as to preserve backwards-compatibility.
  143. GUMBO_TAG_UNKNOWN,
  144. // A marker value to indicate the end of the enum, for iterating over it.
  145. // Also used as the terminator for varargs functions that take tags.
  146. GUMBO_TAG_LAST,
  147. } GumboTag;
  148. /**
  149. * Returns the normalized (usually all-lowercased, except for foreign content)
  150. * tag name for a GumboTag enum. Return value is static data owned by the
  151. * library.
  152. */
  153. const char* gumbo_normalized_tagname(GumboTag tag);
  154. /**
  155. * Extracts the tag name from the original_text field of an element or token by
  156. * stripping off </> characters and attributes and adjusting the passed-in
  157. * GumboStringPiece appropriately. The tag name is in the original case and
  158. * shares a buffer with the original text, to simplify memory management.
  159. * Behavior is undefined if a string-piece that doesn't represent an HTML tag
  160. * (<tagname> or </tagname>) is passed in. If the string piece is completely
  161. * empty (NULL data pointer), then this function will exit successfully as a
  162. * no-op.
  163. */
  164. void gumbo_tag_from_original_text(GumboStringPiece* text);
  165. /**
  166. * Fixes the case of SVG elements that are not all lowercase.
  167. * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
  168. * This is not done at parse time because there's no place to store a mutated
  169. * tag name: GumboElement.tag is an enum (which will be GUMBO_TAG_UNKNOWN for
  170. * most SVG tags without special handling), while GumboElement.original_tag is
  171. * a pointer into the original buffer. Instead, we provide this helper function
  172. * that clients can use to rename SVG tags as appropriate.
  173. * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
  174. * no normalization is called for. The return value is static data and owned by
  175. * the library.
  176. */
  177. const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
  178. /**
  179. * Converts a tag name string (which may be in upper or mixed case) to a tag enum.
  180. * gumbo_tag_enum expects a NUL-terminated `tagname`; gumbo_tagn_enum takes an explicit `length`.
  181. */
  182. GumboTag gumbo_tag_enum(const char* tagname);
  183. GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
  184. /**
  185. * Attribute namespaces.
  186. * HTML includes special handling for XLink, XML, and XMLNS namespaces on
  187. * attributes. Everything else goes in the generic "NONE" namespace.
  188. */
  189. typedef enum {
  190. GUMBO_ATTR_NAMESPACE_NONE,
  191. GUMBO_ATTR_NAMESPACE_XLINK,
  192. GUMBO_ATTR_NAMESPACE_XML,
  193. GUMBO_ATTR_NAMESPACE_XMLNS,
  194. } GumboAttributeNamespaceEnum;
  195. /**
  196. * A struct representing a single attribute on an HTML tag. This is a
  197. * name-value pair, but also includes information about source locations and
  198. * original source text.
  199. */
  200. typedef struct {
  201. /**
  202. * The namespace for the attribute. This will usually be
  203. * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
  204. * values, per:
  205. * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
  206. */
  207. GumboAttributeNamespaceEnum attr_namespace;
  208. /**
  209. * The name of the attribute. This is in a freshly-allocated buffer to deal
  210. * with case-normalization, and is null-terminated.
  211. */
  212. const char* name;
  213. /**
  214. * The original text of the attribute name, as a pointer into the original
  215. * source buffer.
  216. */
  217. GumboStringPiece original_name;
  218. /**
  219. * The value of the attribute. This is in a freshly-allocated buffer to deal
  220. * with unescaping, and is null-terminated. It does not include any quotes
  221. * that surround the attribute. If the attribute has no value (for example,
  222. * 'checked' on a checkbox), this will be an empty string.
  223. */
  224. const char* value;
  225. /**
  226. * The original text of the value of the attribute. This points into the
  227. * original source buffer. It includes any quotes that surround the
  228. * attribute, and you can look at original_value.data[0] and
  229. * original_value.data[original_value.length - 1] to determine what the quote
  230. * characters were. If the attribute has no value, this will be a 0-length
  231. * string.
  232. */
  233. GumboStringPiece original_value;
  234. /** The starting position of the attribute name. */
  235. GumboSourcePosition name_start;
  236. /**
  237. * The ending position of the attribute name. This is not always derivable
  238. * from the starting position of the value because of the possibility of
  239. * whitespace around the = sign.
  240. */
  241. GumboSourcePosition name_end;
  242. /** The starting position of the attribute value. */
  243. GumboSourcePosition value_start;
  244. /** The ending position of the attribute value. */
  245. GumboSourcePosition value_end;
  246. } GumboAttribute;
  247. /**
  248. * Given a vector of GumboAttributes, look up the one with the specified name
  249. * and return it, or NULL if no such attribute exists. This uses a
  250. * case-insensitive match, as HTML is case-insensitive.
  251. */
  252. GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
  253. /**
  254. * Enum denoting the type of node. This determines the type of the node.v
  255. * union.
  256. */
  257. typedef enum {
  258. /** Document node. v will be a GumboDocument. */
  259. GUMBO_NODE_DOCUMENT,
  260. /** Element node. v will be a GumboElement. */
  261. GUMBO_NODE_ELEMENT,
  262. /** Text node. v will be a GumboText. */
  263. GUMBO_NODE_TEXT,
  264. /** CDATA node. v will be a GumboText. */
  265. GUMBO_NODE_CDATA,
  266. /** Comment node. v will be a GumboText, excluding comment delimiters. */
  267. GUMBO_NODE_COMMENT,
  268. /** Text node, where all contents is whitespace. v will be a GumboText. */
  269. GUMBO_NODE_WHITESPACE,
  270. /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
  271. * client libraries will want to ignore the contents of template nodes, as
  272. * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
  273. * here, while clients that want to include template contents should also
  274. * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
  275. GUMBO_NODE_TEMPLATE
  276. } GumboNodeType;
  277. /**
  278. * Forward declaration of GumboNode so it can be used recursively in
  279. * GumboNode.parent.
  280. */
  281. typedef struct GumboInternalNode GumboNode;
  282. /**
  283. * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
  284. */
  285. typedef enum {
  286. GUMBO_DOCTYPE_NO_QUIRKS,
  287. GUMBO_DOCTYPE_QUIRKS,
  288. GUMBO_DOCTYPE_LIMITED_QUIRKS
  289. } GumboQuirksModeEnum;
  290. /**
  291. * Namespaces.
  292. * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
  293. * anything inside an <svg> tag is in the SVG namespace, anything inside the
  294. * <math> tag is in the MathML namespace, and anything else is inside the HTML
  295. * namespace. No other namespaces are supported, so this can be an enum only.
  296. */
  297. typedef enum {
  298. GUMBO_NAMESPACE_HTML,
  299. GUMBO_NAMESPACE_SVG,
  300. GUMBO_NAMESPACE_MATHML
  301. } GumboNamespaceEnum;
  302. /**
  303. * Parse flags.
  304. * We track the reasons for parser insertion of nodes and store them in a
  305. * bitvector in the node itself. This lets client code optimize out nodes that
  306. * are implied by the HTML structure of the document, or flag constructs that
  307. * may not be allowed by a style guide, or track the prevalence of incorrect or
  308. * tricky HTML code.
  309. */
  310. typedef enum {
  311. /**
  312. * A normal node - both start and end tags appear in the source, nothing has
  313. * been reparented.
  314. */
  315. GUMBO_INSERTION_NORMAL = 0,
  316. /**
  317. * A node inserted by the parser to fulfill some implicit insertion rule.
  318. * This is usually set in addition to some other flag giving a more specific
  319. * insertion reason; it's a generic catch-all term meaning "The start tag for
  320. * this node did not appear in the document source".
  321. */
  322. GUMBO_INSERTION_BY_PARSER = 1 << 0,
  323. /**
  324. * A flag indicating that the end tag for this node did not appear in the
  325. * document source. Note that in some cases, you can still have
  326. * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
  327. * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
  328. * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
  329. * exists. This flag will be set only if the end tag is completely missing;
  330. * in some cases, the end tag may be misplaced (eg. a </body> tag with text
  331. * afterwards), which will leave this flag unset and require clients to
  332. * inspect the parse errors for that case.
  333. */
  334. GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
  335. // Value 1 << 2 was for a flag that has since been removed.
  336. /**
  337. * A flag for nodes that are inserted because their presence is implied by
  338. * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
  339. */
  340. GUMBO_INSERTION_IMPLIED = 1 << 3,
  341. /**
  342. * A flag for nodes that are converted from their end tag equivalents. For
  343. * example, </p> when no paragraph is open implies that the parser should
  344. * create a <p> tag and immediately close it, while </br> means the same thing
  345. * as <br>.
  346. */
  347. GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
  348. /** A flag for nodes that are converted from the parse of an <isindex> tag. */
  349. GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
  350. /** A flag for <image> tags that are rewritten as <img>. */
  351. GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
  352. /**
  353. * A flag for nodes that are cloned as a result of the reconstruction of
  354. * active formatting elements. This is set only on the clone; the initial
  355. * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
  356. */
  357. GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
  358. /** A flag for nodes that are cloned by the adoption agency algorithm. */
  359. GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
  360. /** A flag for nodes that are moved by the adoption agency algorithm. */
  361. GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
  362. /**
  363. * A flag for nodes that have been foster-parented out of a table (or
  364. * should've been foster-parented, if verbatim mode is set).
  365. */
  366. GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
  367. } GumboParseFlags;
  368. /**
  369. * Information specific to document nodes.
  370. */
  371. typedef struct {
  372. /**
  373. * An array of GumboNodes, containing the children of this document. This will
  374. * normally consist of the <html> element and any comment nodes found.
  375. * Pointers are owned.
  376. */
  377. GumboVector /* GumboNode* */ children;
  378. // True if there was an explicit doctype token as opposed to it being omitted.
  379. bool has_doctype;
  380. // Fields from the doctype token, copied verbatim.
  381. const char* name;
  382. const char* public_identifier;
  383. const char* system_identifier;
  384. /**
  385. * Whether or not the document is in QuirksMode, as determined by the values
  386. * in the GumboTokenDocType template.
  387. */
  388. GumboQuirksModeEnum doc_type_quirks_mode;
  389. } GumboDocument;
  390. /**
  391. * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE nodes.
  392. * This contains just a block of text and its position.
  393. */
  394. typedef struct {
  395. /**
  396. * The text of this node, after entities have been parsed and decoded. For
  397. * comment/cdata nodes, this does not include the comment delimiters.
  398. */
  399. const char* text;
  400. /**
  401. * The original text of this node, as a pointer into the original buffer. For
  402. * comment/cdata nodes, this includes the comment delimiters.
  403. */
  404. GumboStringPiece original_text;
  405. /**
  406. * The starting position of this node. This corresponds to the position of
  407. * original_text, before entities are decoded.
  408. */
  409. GumboSourcePosition start_pos;
  410. } GumboText;
  411. /**
  412. * The struct used to represent all HTML elements. This contains information
  413. * about the tag, attributes, and child nodes.
  414. */
  415. typedef struct {
  416. /**
  417. * An array of GumboNodes, containing the children of this element. Pointers
  418. * are owned.
  419. */
  420. GumboVector /* GumboNode* */ children;
  421. /** The GumboTag enum for this element. */
  422. GumboTag tag;
  423. /** The GumboNamespaceEnum for this element. */
  424. GumboNamespaceEnum tag_namespace;
  425. /**
  426. * A GumboStringPiece pointing to the original tag text for this element,
  427. * pointing directly into the source buffer. If the tag was inserted
  428. * algorithmically (for example, <head> or <tbody> insertion), this will be a
  429. * zero-length string.
  430. */
  431. GumboStringPiece original_tag;
  432. /**
  433. * A GumboStringPiece pointing to the original end tag text for this element.
  434. * If the end tag was inserted algorithmically, (for example, closing a
  435. * self-closing tag), this will be a zero-length string.
  436. */
  437. GumboStringPiece original_end_tag;
  438. /** The source position for the start of the start tag. */
  439. GumboSourcePosition start_pos;
  440. /** The source position for the start of the end tag. */
  441. GumboSourcePosition end_pos;
  442. /**
  443. * An array of GumboAttributes, containing the attributes for this tag in the
  444. * order that they were parsed. Pointers are owned.
  445. */
  446. GumboVector /* GumboAttribute* */ attributes;
  447. } GumboElement;
  448. /**
  449. * A supertype for GumboDocument, GumboElement and GumboText, so that we can
  450. * include one generic type in lists of children and pick the subtype via `type`.
  451. */
  452. struct GumboInternalNode {
  453. /** The type of node that this is. */
  454. GumboNodeType type;
  455. /** Pointer back to parent node. Not owned. */
  456. GumboNode* parent;
  457. /** The index within the parent's children vector of this node. */
  458. size_t index_within_parent;
  459. /**
  460. * A bitvector of flags containing information about why this element was
  461. * inserted into the parse tree, including a variety of special parse
  462. * situations.
  463. */
  464. GumboParseFlags parse_flags;
  465. /** The actual node data; `type` determines which member is in use. */
  466. union {
  467. GumboDocument document; // For GUMBO_NODE_DOCUMENT.
  468. GumboElement element; // For GUMBO_NODE_ELEMENT and GUMBO_NODE_TEMPLATE.
  469. GumboText text; // For everything else.
  470. } v;
  471. };
  472. /**
  473. * The type for an allocator function. Takes the 'userdata' member of the
  474. * GumboOptions struct as its first argument. Semantics should be the same as
  475. * malloc, i.e. return a block of `size` bytes on success or NULL on failure.
  476. * Allocating a block of 0 bytes behaves as per malloc.
  477. */
  478. // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
  479. typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
  480. /**
  481. * The type for a deallocator function. Takes the 'userdata' member of the
  482. * GumboOptions struct as its first argument.
  483. */
  484. typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
  485. /**
  486. * Input struct containing configuration options for the parser.
  487. * These let you specify alternate memory managers, provide different error
  488. * handling, etc.
  489. * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
  490. */
  491. typedef struct GumboInternalOptions {
  492. /** A memory allocator function. Default: malloc. */
  493. GumboAllocatorFunction allocator;
  494. /** A memory deallocator function. Default: free. */
  495. GumboDeallocatorFunction deallocator;
  496. /**
  497. * An opaque object that's passed in as the first argument to all callbacks
  498. * used by this library. Default: NULL.
  499. */
  500. void* userdata;
  501. /**
  502. * The tab-stop size, for computing positions in source code that uses tabs.
  503. * Default: 8.
  504. */
  505. int tab_stop;
  506. /**
  507. * Whether or not to stop parsing when the first error is encountered.
  508. * Default: false.
  509. */
  510. bool stop_on_first_error;
  511. /**
  512. * The maximum number of errors before the parser stops recording them. This
  513. * is provided so that if the page is totally borked, we don't completely fill
  514. * up the errors vector and exhaust memory with useless redundant errors. Set
  515. * to -1 to disable the limit.
  516. * Default: -1
  517. */
  518. int max_errors;
  519. /**
  520. * The fragment context for parsing:
  521. * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
  522. *
  523. * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
  524. * the regular parsing algorithm. Otherwise, pass the tag enum for the
  525. * intended parent of the parsed fragment. We use just the tag enum rather
  526. * than a full node because that's enough to set all the parsing context we
  527. * need, and it provides some additional flexibility for client code to act as
  528. * if parsing a fragment even when a full HTML tree isn't available.
  529. *
  530. * Default: GUMBO_TAG_LAST
  531. */
  532. GumboTag fragment_context;
  533. /**
  534. * The namespace for the fragment context. This lets client code
  535. * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
  536. * HTML.
  537. * Default: GUMBO_NAMESPACE_HTML
  538. */
  539. GumboNamespaceEnum fragment_namespace;
  540. } GumboOptions;
  541. /** Default options struct; use this with gumbo_parse_with_options. */
  542. extern const GumboOptions kGumboDefaultOptions;
  543. /** The output struct containing the results of the parse. */
  544. typedef struct GumboInternalOutput {
  545. /**
  546. * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
  547. * that contains the entire document as its child.
  548. */
  549. GumboNode* document;
  550. /**
  551. * Pointer to the root node. This is the <html> tag that forms the root of the
  552. * document.
  553. */
  554. GumboNode* root;
  555. /**
  556. * A list of errors that occurred during the parse.
  557. * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
  558. * fleshed out and may change in the future. For this reason, the GumboError
  559. * header isn't part of the public API. Contact us if you need errors
  560. * reported so we can work out something appropriate for your use-case.
  561. */
  562. GumboVector /* GumboError* */ errors;
  563. } GumboOutput;
  564. /**
  565. * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
  566. * live at least as long as the parse tree, as some fields (eg. original_text)
  567. * point directly into the original buffer.
  568. *
  569. * This doesn't support buffers longer than 4 gigabytes.
  570. */
  571. GumboOutput* gumbo_parse(const char* buffer);
  572. /**
  573. * Extended version of gumbo_parse that takes an explicit options structure,
  574. * buffer, and length.
  575. */
  576. GumboOutput* gumbo_parse_with_options(
  577. const GumboOptions* options, const char* buffer, size_t buffer_length);
  578. /** Release the memory used for the parse tree & parse errors. */
  579. void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
  580. #ifdef __cplusplus
  581. }
  582. #endif
  583. #endif // GUMBO_GUMBO_H_