HTMLparser.inc 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. (*
  2. * Summary: interface for an HTML 4.0 non-verifying parser
  3. * Description: this module implements an HTML 4.0 non-verifying parser
  4. * with API compatible with the XML parser ones. It should
  5. * be able to parse "real world" HTML, even if severely
  6. * broken from a specification point of view.
  7. *
  8. * Copy: See Copyright for the status of this software.
  9. *
  10. * Author: Daniel Veillard
  11. *)
  12. {$IFDEF LIBXML_HTML_ENABLED}
  13. {$IFDEF POINTER}
  14. htmlElemDescPtr = ^htmlElemDesc; (* pointer to an HTML element description record; result type of htmlTagLookup *)
  15. htmlEntityDescPtr = ^htmlEntityDesc; (* pointer to an HTML entity description record; result type of the htmlEntity*Lookup functions *)
  16. {$ENDIF}
  17. {$IFDEF TYPE}
  18. (*
  19. * Most of the back-end structures from XML and HTML are shared.
  20. *)
  21. htmlParserCtxt = xmlParserCtxt; (* parser context: plain alias of the XML parser context *)
  22. htmlParserCtxtPtr = xmlParserCtxtPtr; (* pointer alias, interchangeable with xmlParserCtxtPtr *)
  23. htmlParserNodeInfo = xmlParserNodeInfo; (* alias of the XML parser node-info type *)
  24. htmlSAXHandler = xmlSAXHandler; (* alias of the XML SAX handler type *)
  25. htmlSAXHandlerPtr = xmlSAXHandlerPtr; (* pointer alias, interchangeable with xmlSAXHandlerPtr *)
  26. htmlParserInput = xmlParserInput; (* alias of the XML parser input type *)
  27. htmlParserInputPtr = xmlParserInputPtr; (* pointer alias, interchangeable with xmlParserInputPtr *)
  28. htmlDocPtr = xmlDocPtr; (* HTML documents use the XML document pointer type *)
  29. htmlNodePtr = xmlNodePtr; (* HTML nodes use the XML node pointer type *)
  30. (*
  31. * Internal description of an HTML element, representing HTML 4.01
  32. * and XHTML 1.0 (which share the same structure).
  33. *)
  34. htmlElemDesc = record (* describes one HTML element; pointers to it are returned by the external htmlTagLookup, so field order and types presumably must match the library's struct - do not reorder *)
  35. name : pchar; (* The tag name *)
  36. startTag : char; (* Whether the start tag can be implied *)
  37. endTag : char; (* Whether the end tag can be implied *)
  38. saveEndTag : char; (* Whether the end tag should be saved *)
  39. empty : char; (* Is this an empty element? *)
  40. depr : char; (* Is this a deprecated element? *)
  41. dtd : char; (* 1: only in the Loose DTD, 2: only in the Frameset DTD *)
  42. isinline : char; (* 0: block element, 1: inline element *)
  43. desc : pchar; (* The description *)
  44. (* NRK Jan.2003
  45. * New fields encapsulating HTML structure
  46. *
  47. * Bugs:
  48. * This is a very limited representation. It fails to tell us when
  49. * an element *requires* subelements (we only have whether they're
  50. * allowed or not), and it doesn't tell us where CDATA and PCDATA
  51. * are allowed. Some element relationships are not fully represented:
  52. * these are flagged with the word MODIFIER
  53. *)
  54. subelts : ppchar; (* Allowed sub-elements of this element *)
  55. defaultsubelt : pchar; (* Sub-element suggested for auto-repair
  56. if necessary, or NULL *)
  57. attrs_opt : ppchar; (* Optional attributes *)
  58. attrs_depr : ppchar; (* Additional deprecated attributes *)
  59. attrs_req : ppchar; (* Required attributes *)
  60. end;
  61. (*
  62. * Internal description of an HTML entity.
  63. *)
  64. htmlEntityDesc = record (* describes one HTML entity; pointers to it are returned by the external htmlEntityLookup / htmlEntityValueLookup, so the layout presumably must match the library's struct *)
  65. value : cuint; (* The Unicode value of the character *)
  66. name : pchar; (* The entity name *)
  67. desc : pchar; (* The description *)
  68. end;
  69. {$ENDIF}
  70. {$IFDEF FUNCTION}
  71. (*
  72. * There are only a few public functions.
  73. *)
  74. function htmlTagLookup(tag: xmlCharPtr): htmlElemDescPtr; EXTDECL; external xml2lib; (* element description for a tag name *)
  75. function htmlEntityLookup(tag: xmlCharPtr): htmlEntityDescPtr; EXTDECL; external xml2lib; (* entity description looked up by entity name *)
  76. function htmlEntityValueLookup(value: cuint): htmlEntityDescPtr; EXTDECL; external xml2lib; (* entity description looked up by character value *)
  77. function htmlIsAutoClosed(doc: htmlDocPtr; elem: htmlNodePtr): cint; EXTDECL; external xml2lib;
  78. function htmlAutoCloseTag(doc: htmlDocPtr; name: xmlCharPtr; elem: htmlNodePtr): cint; EXTDECL; external xml2lib;
  79. function htmlParseEntityRef(ctxt: htmlParserCtxtPtr; str: xmlCharPtrPtr): htmlEntityDescPtr; EXTDECL; external xml2lib; (* str is an out parameter receiving the entity name *)
  80. function htmlParseCharRef(ctxt: htmlParserCtxtPtr): cint; EXTDECL; external xml2lib;
  81. function htmlParseElement(ctxt: htmlParserCtxtPtr): cint; EXTDECL; external xml2lib; (* NOTE: the C prototype returns void; kept as a function for source compatibility, but the result is undefined and must be ignored *)
  82. function htmlNewParserCtxt: htmlParserCtxtPtr; EXTDECL; external xml2lib; (* release the result with htmlFreeParserCtxt *)
  83. function htmlCreateMemoryParserCtxt(buffer: pchar; size: cint): htmlParserCtxtPtr; EXTDECL; external xml2lib; (* size is the length of buffer in bytes *)
  84. function htmlParseDocument(ctxt: htmlParserCtxtPtr): cint; EXTDECL; external xml2lib; (* C: int htmlParseDocument(htmlParserCtxtPtr ctxt) - operates on a parser context, not on a doc/node pair *)
  85. function htmlSAXParseDoc(cur: xmlCharPtr; encoding: pchar; sax: htmlSAXHandlerPtr; userdata: pointer): htmlDocPtr; EXTDECL; external xml2lib;
  86. function htmlParseDoc(cur: xmlCharPtr; encoding: pchar): htmlDocPtr; EXTDECL; external xml2lib;
  87. function htmlSAXParseFile(filename, encoding: pchar; sax: htmlSAXHandlerPtr; userdata: pointer): htmlDocPtr; EXTDECL; external xml2lib;
  88. function htmlParseFile(filename, encoding: pchar): htmlDocPtr; EXTDECL; external xml2lib;
  89. function UTF8ToHtml(_out: pointer; outlen: pcint; _in: pointer; inlen: pcint): cint; EXTDECL; external xml2lib; (* outlen/inlen are int* in C: in/out byte counts, so they must be passed by address *)
  90. function htmlEncodeEntities(_out: pointer; outlen: pcint; _in: pointer; inlen: pcint; quoteChar: cint): cint; EXTDECL; external xml2lib; (* outlen/inlen are int* in C (in/out byte counts); quoteChar stays a plain int *)
  91. function htmlIsScriptAttribute(name: xmlCharPtr): cint; EXTDECL; external xml2lib;
  92. function htmlHandleOmittedElem(val: cint): cint; EXTDECL; external xml2lib;
  93. {$IFDEF LIBXML_PUSH_ENABLED}
  94. (**
  95. * Interfaces for the Push mode.
  96. *)
  97. function htmlCreatePushParserCtxt(sax: htmlSAXHandlerPtr; userdata: pointer; chunk: pchar; size: cint; filename: pchar; enc: xmlCharEncoding): htmlParserCtxtPtr; EXTDECL; external xml2lib; (* creates the context later fed through htmlParseChunk; chunk/size give the initial data *)
  98. function htmlParseChunk(ctxt: htmlParserCtxtPtr; chunk: pchar; size, terminate: cint): cint; EXTDECL; external xml2lib; (* C: int htmlParseChunk(ctxt, chunk, size, terminate) - returns an integer status, not a context pointer *)
  99. {$ENDIF} (* LIBXML_PUSH_ENABLED *)
  100. procedure htmlFreeParserCtxt(ctxt: htmlParserCtxtPtr); EXTDECL; external xml2lib; (* declared outside the LIBXML_PUSH_ENABLED guard, so it is available whether or not push mode is enabled *)
  101. {$ENDIF}
  102. {$IFDEF TYPE}
  103. htmlParserOption = type cint; (* distinct integer type, presumably for the HTML_PARSE_* flags; note the read functions in this file take their options as plain cint *)
  104. {$ENDIF}
  105. {$IFDEF CONST}
  106. (*
  107. * New set of simpler/more flexible APIs
  108. *)
  109. (**
  110. * htmlParserOption:
  111. *
  112. * This is the set of HTML parser options that can be passed down
  113. * to the htmlReadDoc() and similar calls.
  114. *)
  115. HTML_PARSE_RECOVER = 1 shl 0; (* $1: relaxed parsing *)
  116. HTML_PARSE_NOERROR = 1 shl 5; (* $20: suppress error reports *)
  117. HTML_PARSE_NOWARNING= 1 shl 6; (* $40: suppress warning reports *)
  118. HTML_PARSE_PEDANTIC = 1 shl 7; (* $80: pedantic error reporting *)
  119. HTML_PARSE_NOBLANKS = 1 shl 8; (* $100: remove blank nodes *)
  120. HTML_PARSE_NONET = 1 shl 11;(* $800: forbid network access *)
  121. HTML_PARSE_COMPACT = 1 shl 16; (* $10000: compact small text nodes *)
  122. {$ENDIF}
  123. {$IFDEF FUNCTION}
  124. procedure htmlCtxtReset(ctxt: htmlParserCtxtPtr); EXTDECL; external xml2lib; (* resets an existing context *)
  125. function htmlCtxtUseOptions(ctxt: htmlParserCtxtPtr; options: cint): cint; EXTDECL; external xml2lib; (* C: int htmlCtxtUseOptions(ctxt, options); options is a combination of HTML_PARSE_* flags. This was previously bound under the name htmlParseChunk, which is a different C function with four parameters *)
  126. function htmlReadDoc(cur: xmlCharPtr; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* parse from an in-memory xmlChar string *)
  127. function htmlReadFile(URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* parse from a file name or URL *)
  128. function htmlReadMemory(buffer: pchar; size: cint; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* parse from a buffer of size bytes *)
  129. function htmlReadFd(fd: cint; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* parse from a file descriptor *)
  130. function htmlReadIO(ioread: xmlInputReadCallback; ioclose: xmlInputCloseCallback; ioctx: pointer; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* parse through caller-supplied read/close callbacks; ioctx is handed to them *)
  131. function htmlCtxtReadDoc(ctxt: xmlParserCtxtPtr; cur: xmlCharPtr; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* htmlReadDoc variant taking an existing context *)
  132. function htmlCtxtReadFile(ctxt: xmlParserCtxtPtr; filename, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* htmlReadFile variant taking an existing context *)
  133. function htmlCtxtReadMemory(ctxt: xmlParserCtxtPtr; buffer: pchar; size: cint; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* htmlReadMemory variant taking an existing context *)
  134. function htmlCtxtReadFd(ctxt: xmlParserCtxtPtr; fd: cint; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* htmlReadFd variant taking an existing context *)
  135. function htmlCtxtReadIO(ctxt: xmlParserCtxtPtr; ioread: xmlInputReadCallback; ioclose: xmlInputCloseCallback; ioctx: pointer; URL, encoding: pchar; options: cint): htmlDocPtr; EXTDECL; external xml2lib; (* htmlReadIO variant taking an existing context *)
  136. {$ENDIF}
  137. {$IFDEF TYPE}
  138. htmlStatus = type cint; (* distinct integer type used as the result of htmlAttrAllowed and htmlNodeStatus; presumably holds the HTML_NA .. HTML_REQUIRED values *)
  139. {$ENDIF}
  140. {$IFDEF CONST}
  141. (* NRK/Jan2003: further knowledge of HTML structure
  142. *)
  143. HTML_NA = $0; (* something we don't check at all *)
  144. HTML_INVALID = $1; (* bit 0 *)
  145. HTML_DEPRECATED = $2; (* bit 1 *)
  146. HTML_VALID = $4; (* bit 2 *)
  147. HTML_REQUIRED = $c; (* $8 or $4: the HTML_VALID bit is also set, so (status and HTML_VALID) is non-zero *)
  148. {$ENDIF}
  149. {$IFDEF FUNCTION}
  150. (* Using htmlElemDesc rather than name here, to emphasise the fact
  151. that otherwise there's a lookup overhead
  152. *)
  153. function htmlAttrAllowed(desc: htmlElemDescPtr; str: xmlCharPtr; val: cint): htmlStatus; EXTDECL; external xml2lib; (* status of attribute name str on the element described by desc *)
  154. function htmlElementAllowedHere(desc: htmlElemDescPtr; str: xmlCharPtr): cint; EXTDECL; external xml2lib; (* whether an element named str may be a child of the element described by desc *)
  155. function htmlElementStatusHere(desc1, desc2: htmlElemDescPtr): htmlStatus; EXTDECL; external xml2lib; (* C: htmlStatus htmlElementStatusHere(const htmlElemDesc *, const htmlElemDesc *). This was previously bound under the name htmlAttrAllowed, which takes (desc, name, int) *)
  156. function htmlNodeStatus(node: htmlNodePtr; val: cint): htmlStatus; EXTDECL; external xml2lib; (* status of a node in its document tree *)
  157. (**
  158. * htmlDefaultSubelement:
  159. * @elt: HTML element
  160. *
  161. * Returns the default subelement for this element (presumably the
  162. * record's defaultsubelt field; the body is not in this section). Not external. *)
  163. function htmlDefaultSubelement(elt: htmlElemDescPtr): pchar;
  164. (**
  165. * htmlElementAllowedHereDesc:
  166. * @parent: HTML parent element
  167. * @elt: HTML element
  168. *
  169. * Checks whether an HTML element description may be a
  170. * direct child of the specified element.
  171. * Not declared external: the body is expected to be supplied in Pascal elsewhere.
  172. * Returns 1 if allowed; 0 otherwise.
  173. *)
  174. function htmlElementAllowedHereDesc(parent: htmlElemDescPtr; elt: htmlElemDescPtr): cint;
  175. (**
  176. * htmlRequiredAttrs:
  177. * @elt: HTML element
  178. *
  179. * Returns the attributes required for the specified element (presumably attrs_req). Not external.
  180. *)
  181. function htmlRequiredAttrs(elt: htmlElemDescPtr): ppchar;
  182. {$ENDIF}
  183. {$ENDIF} (* LIBXML_HTML_ENABLED *)