HTMLparser.h 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  /*
   * Summary: interface for an HTML 4.0 non-verifying parser
   * Description: this module implements an HTML 4.0 non-verifying parser
   *              with an API compatible with that of the XML parser. It
   *              should be able to parse "real world" HTML, even if severely
   *              broken from a specification point of view.
   *
   * Copy: See Copyright for the status of this software.
   *
   * Author: Daniel Veillard
   */
  12. #ifndef __HTML_PARSER_H__
  13. #define __HTML_PARSER_H__
  14. #include <libxml/xmlversion.h>
  15. #include <libxml/parser.h>
  16. { LIBXML_HTML_ENABLED
  17. { __cplusplus
  18. extern "C" {
  19. #endif
  20. (*
  21. * Most of the back-end structures from XML and HTML are shared.
  22. *)
  23. typedef xmlParserCtxt htmlParserCtxt;
  24. typedef xmlParserCtxtPtr htmlParserCtxtPtr;
  25. typedef xmlParserNodeInfo htmlParserNodeInfo;
  26. typedef xmlSAXHandler htmlSAXHandler;
  27. typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
  28. typedef xmlParserInput htmlParserInput;
  29. typedef xmlParserInputPtr htmlParserInputPtr;
  30. typedef xmlDocPtr htmlDocPtr;
  31. typedef xmlNodePtr htmlNodePtr;
  /*
   * Internal description of an HTML element, representing HTML 4.01
   * and XHTML 1.0 (which share the same structure).
   *
   * Field order and types are part of the public layout of this struct
   * and must not be rearranged.
   */
  typedef struct _htmlElemDesc htmlElemDesc;
  typedef htmlElemDesc *htmlElemDescPtr;
  struct _htmlElemDesc {
      char *name;           /* The tag name */
      char startTag;        /* Whether the start tag can be implied */
      char endTag;          /* Whether the end tag can be implied */
      char saveEndTag;      /* Whether the end tag should be saved */
      char empty;           /* Is this an empty element ? */
      char depr;            /* Is this a deprecated element ? */
      char dtd;             /* 1: only in Loose DTD, 2: only Frameset one */
      char isinline;        /* is this a block 0 or inline 1 element */
      char *desc;           /* the description */

      /* NRK Jan.2003
       * New fields encapsulating HTML structure
       *
       * Bugs:
       *   This is a very limited representation. It fails to tell us when
       *   an element *requires* subelements (we only have whether they're
       *   allowed or not), and it doesn't tell us where CDATA and PCDATA
       *   are allowed. Some element relationships are not fully represented:
       *   these are flagged with the word MODIFIER
       */
      char **subelts;       /* allowed sub-elements of this element */
      char *defaultsubelt;  /* subelement for suggested auto-repair
                               if necessary or NULL */
      char **attrs_opt;     /* Optional Attributes */
      char **attrs_depr;    /* Additional deprecated attributes */
      char **attrs_req;     /* Required attributes */
  };
  /*
   * Internal description of an HTML entity.
   *
   * Field order is part of the public layout; positional initializers
   * rely on it being { value, name, desc }.
   */
  typedef struct _htmlEntityDesc htmlEntityDesc;
  typedef htmlEntityDesc *htmlEntityDescPtr;
  struct _htmlEntityDesc {
      unsigned int value;   /* the UNICODE value for the character */
      char *name;           /* The entity name */
      char *desc;           /* the description */
  };
  /*
   * There are only a few public functions.
   */
  78. XMLPUBFUN htmlElemDesc * XMLCALL
  79. htmlTagLookup (xmlChar *tag);
  80. XMLPUBFUN htmlEntityDesc * XMLCALL
  81. htmlEntityLookup(xmlChar *name);
  82. XMLPUBFUN htmlEntityDesc * XMLCALL
  83. htmlEntityValueLookup(unsigned int value);
  84. XMLPUBFUN int XMLCALL
  85. htmlIsAutoClosed(htmlDocPtr doc,
  86. htmlNodePtr elem);
  87. XMLPUBFUN int XMLCALL
  88. htmlAutoCloseTag(htmlDocPtr doc,
  89. xmlChar *name,
  90. htmlNodePtr elem);
  91. XMLPUBFUN htmlEntityDesc * XMLCALL
  92. htmlParseEntityRef(htmlParserCtxtPtr ctxt,
  93. xmlChar **str);
  94. XMLPUBFUN int XMLCALL
  95. htmlParseCharRef(htmlParserCtxtPtr ctxt);
  96. XMLPUBFUN void XMLCALL
  97. htmlParseElement(htmlParserCtxtPtr ctxt);
  98. XMLPUBFUN htmlParserCtxtPtr XMLCALL
  99. htmlNewParserCtxt(void);
  100. XMLPUBFUN htmlParserCtxtPtr XMLCALL
  101. htmlCreateMemoryParserCtxt(char *buffer,
  102. int size);
  103. XMLPUBFUN int XMLCALL
  104. htmlParseDocument(htmlParserCtxtPtr ctxt);
  105. XMLPUBFUN htmlDocPtr XMLCALL
  106. htmlSAXParseDoc (xmlChar *cur,
  107. char *encoding,
  108. htmlSAXHandlerPtr sax,
  109. void *userData);
  110. XMLPUBFUN htmlDocPtr XMLCALL
  111. htmlParseDoc (xmlChar *cur,
  112. char *encoding);
  113. XMLPUBFUN htmlDocPtr XMLCALL
  114. htmlSAXParseFile(char *filename,
  115. char *encoding,
  116. htmlSAXHandlerPtr sax,
  117. void *userData);
  118. XMLPUBFUN htmlDocPtr XMLCALL
  119. htmlParseFile (char *filename,
  120. char *encoding);
  121. XMLPUBFUN int XMLCALL
  122. UTF8ToHtml (unsigned char *out,
  123. int *outlen,
  124. unsigned char *in,
  125. int *inlen);
  126. XMLPUBFUN int XMLCALL
  127. htmlEncodeEntities(unsigned char *out,
  128. int *outlen,
  129. unsigned char *in,
  130. int *inlen, int quoteChar);
  131. XMLPUBFUN int XMLCALL
  132. htmlIsScriptAttribute(xmlChar *name);
  133. XMLPUBFUN int XMLCALL
  134. htmlHandleOmittedElem(int val);
  #ifdef LIBXML_PUSH_ENABLED
  /**
   * Interfaces for the Push mode.
   *
   * These two declarations exist only when LIBXML_PUSH_ENABLED is defined.
   * htmlCreatePushParserCtxt builds a context from an optional first chunk
   * of `size` bytes; htmlParseChunk then feeds further chunks, with
   * `terminate` passed alongside each one (presumably non-zero on the last
   * chunk -- confirm against the implementation).
   */
  XMLPUBFUN htmlParserCtxtPtr XMLCALL
      htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                               void *user_data,
                               char *chunk,
                               int size,
                               char *filename,
                               xmlCharEncoding enc);
  XMLPUBFUN int XMLCALL
      htmlParseChunk(htmlParserCtxtPtr ctxt,
                     char *chunk,
                     int size,
                     int terminate);
  #endif /* LIBXML_PUSH_ENABLED */
  152. XMLPUBFUN void XMLCALL
  153. htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
  /*
   * New set of simpler/more flexible APIs
   */
  /**
   * htmlParserOption:
   *
   * This is the set of HTML parser options that can be passed down
   * to the htmlReadDoc() and similar calls.
   *
   * Each option is a distinct single bit, so options can be OR-ed together
   * into the `int options` argument those calls take. The bit positions
   * are deliberately sparse and must not be renumbered.
   */
  typedef enum {
      HTML_PARSE_RECOVER   = 1<<0,  /* Relaxed parsing */
      HTML_PARSE_NOERROR   = 1<<5,  /* suppress error reports */
      HTML_PARSE_NOWARNING = 1<<6,  /* suppress warning reports */
      HTML_PARSE_PEDANTIC  = 1<<7,  /* pedantic error reporting */
      HTML_PARSE_NOBLANKS  = 1<<8,  /* remove blank nodes */
      HTML_PARSE_NONET     = 1<<11, /* Forbid network access */
      HTML_PARSE_COMPACT   = 1<<16  /* compact small text nodes */
  } htmlParserOption;
  172. XMLPUBFUN void XMLCALL
  173. htmlCtxtReset (htmlParserCtxtPtr ctxt);
  174. XMLPUBFUN int XMLCALL
  175. htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
  176. int options);
  177. XMLPUBFUN htmlDocPtr XMLCALL
  178. htmlReadDoc (xmlChar *cur,
  179. char *URL,
  180. char *encoding,
  181. int options);
  182. XMLPUBFUN htmlDocPtr XMLCALL
  183. htmlReadFile (char *URL,
  184. char *encoding,
  185. int options);
  186. XMLPUBFUN htmlDocPtr XMLCALL
  187. htmlReadMemory (char *buffer,
  188. int size,
  189. char *URL,
  190. char *encoding,
  191. int options);
  192. XMLPUBFUN htmlDocPtr XMLCALL
  193. htmlReadFd (int fd,
  194. char *URL,
  195. char *encoding,
  196. int options);
  197. XMLPUBFUN htmlDocPtr XMLCALL
  198. htmlReadIO (xmlInputReadCallback ioread,
  199. xmlInputCloseCallback ioclose,
  200. void *ioctx,
  201. char *URL,
  202. char *encoding,
  203. int options);
  204. XMLPUBFUN htmlDocPtr XMLCALL
  205. htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,
  206. xmlChar *cur,
  207. char *URL,
  208. char *encoding,
  209. int options);
  210. XMLPUBFUN htmlDocPtr XMLCALL
  211. htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
  212. char *filename,
  213. char *encoding,
  214. int options);
  215. XMLPUBFUN htmlDocPtr XMLCALL
  216. htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
  217. char *buffer,
  218. int size,
  219. char *URL,
  220. char *encoding,
  221. int options);
  222. XMLPUBFUN htmlDocPtr XMLCALL
  223. htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
  224. int fd,
  225. char *URL,
  226. char *encoding,
  227. int options);
  228. XMLPUBFUN htmlDocPtr XMLCALL
  229. htmlCtxtReadIO (xmlParserCtxtPtr ctxt,
  230. xmlInputReadCallback ioread,
  231. xmlInputCloseCallback ioclose,
  232. void *ioctx,
  233. char *URL,
  234. char *encoding,
  235. int options);
  /* NRK/Jan2003: further knowledge of HTML structure
   *
   * Status values are bit patterns: HTML_REQUIRED (0xc) includes the
   * HTML_VALID bit (0x4), so a single `status & HTML_VALID` test accepts
   * both valid and required items.
   */
  typedef enum {
      HTML_NA         = 0,   /* something we don't check at all */
      HTML_INVALID    = 0x1,
      HTML_DEPRECATED = 0x2,
      HTML_VALID      = 0x4,
      HTML_REQUIRED   = 0xc  /* VALID bit set so ( & HTML_VALID ) is TRUE */
  } htmlStatus;
  245. (* Using htmlElemDesc rather than name here, to emphasise the fact
  246. that otherwise there's a lookup overhead
  247. *)
  248. XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(htmlElemDesc*, xmlChar*, int) ;
  249. XMLPUBFUN int XMLCALL htmlElementAllowedHere(htmlElemDesc*, xmlChar*) ;
  250. XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(htmlElemDesc*, htmlElemDesc*) ;
  251. XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(htmlNodePtr, int) ;
  /**
   * htmlDefaultSubelement:
   * @elt: HTML element
   *
   * Returns the default subelement for this element
   *
   * The argument and the whole expansion are parenthesized so that an
   * argument such as `p + 1` or `&table[i]` expands correctly.
   */
  #define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
  /**
   * htmlElementAllowedHereDesc:
   * @parent: HTML parent element
   * @elt: HTML element
   *
   * Checks whether an HTML element description may be a
   * direct child of the specified element.
   *
   * Thin wrapper: forwards @parent unchanged and the `name` field of @elt
   * to htmlElementAllowedHere().
   *
   * Returns 1 if allowed; 0 otherwise.
   */
  #define htmlElementAllowedHereDesc(parent,elt) \
      htmlElementAllowedHere((parent), (elt)->name)
  /**
   * htmlRequiredAttrs:
   * @elt: HTML element
   *
   * Returns the attributes required for the specified element.
   *
   * Expands to the `attrs_req` field of @elt; the expansion is fully
   * parenthesized so it composes safely inside larger expressions.
   */
  #define htmlRequiredAttrs(elt) ((elt)->attrs_req)
  278. { __cplusplus
  279. }
  280. #endif
  281. #endif (* LIBXML_HTML_ENABLED *)
  282. #endif (* __HTML_PARSER_H__ *)