sq_gumbo.cpp 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. /// @module gumbo
  2. // Lua bindings for the [Gumbo][] HTML5 parsing library.
  3. // [Gumbo]: https://github.com/google/gumbo-parser
  4. // @copyright 2013 Craig Barnes
  5. // @license ISC
  6. // Ported to Squilu by Domingo Alvarez Duarte
  7. #include "squirrel.h"
  8. #include <string.h>
  9. #include "sqstdblobimpl.h"
  10. SQ_OPT_STRING_STRLEN();
  11. #include "gumbo.h"
  12. #include <stdio.h>
  13. #include <errno.h>
  14. #define MYNAME _SC("gumbo")
  15. #define assert(cond) if (!(cond)) goto error
  16. static SQRESULT build_node(HSQUIRRELVM v, GumboNode* node);
  17. static void reg_string(HSQUIRRELVM v, const char *name, const char *val) {
  18. sq_pushstring(v, name, -1);
  19. if(val) sq_pushstring(v, val, -1);
  20. else sq_pushnull(v);
  21. sq_rawset(v, -3);
  22. }
  23. static void reg_boolean(HSQUIRRELVM v, const char *name, SQBool val) {
  24. sq_pushstring(v, name, -1);
  25. sq_pushbool(v, val);
  26. sq_rawset(v, -3);
  27. }
  28. static inline SQRESULT add_children(HSQUIRRELVM v, GumboVector *children) {
  29. unsigned int tl = 0;
  30. for (unsigned int i = 0, cl = children->length; i < cl; i++) {
  31. switch(build_node(v, (GumboNode*)children->data[i])){
  32. case SQTrue:
  33. sq_arrayset(v, -2, tl++);
  34. break;
  35. case SQFalse:
  36. break;
  37. case SQ_ERROR:
  38. return SQ_ERROR;
  39. }
  40. }
  41. if(tl < children->length) {
  42. sq_arrayresize(v, -1, tl);
  43. }
  44. return SQ_OK;
  45. }
  46. static SQRESULT build_document(HSQUIRRELVM v, GumboDocument *document) {
  47. sq_newtableex(v, 6);
  48. reg_string(v, _SC("name"), document->name);
  49. reg_string(v, _SC("public_identifier"), document->public_identifier);
  50. reg_string(v, _SC("system_identifier"), document->system_identifier);
  51. reg_boolean(v, _SC("has_doctype"), document->has_doctype ? SQTrue : SQFalse);
  52. sq_pushliteral(v, _SC("children"));
  53. sq_newarray(v, document->children.length);
  54. if(add_children(v, &document->children) == SQ_ERROR) return SQ_ERROR;
  55. sq_rawset(v, -3);
  56. return 1;
  57. }
  58. static SQRESULT build_element(HSQUIRRELVM v, GumboElement *element) {
  59. unsigned int nattrs = element->attributes.length;
  60. sq_newtableex(v, nattrs ? 3 : 2);
  61. // Add tag name
  62. sq_pushliteral(v, _SC("tag"));
  63. if (element->tag == GUMBO_TAG_UNKNOWN) {
  64. GumboStringPiece original_tag = element->original_tag;
  65. gumbo_tag_from_original_text(&original_tag);
  66. sq_pushstring(v, original_tag.data, original_tag.length);
  67. } else {
  68. sq_pushstring(v, gumbo_normalized_tagname(element->tag), -1);
  69. }
  70. sq_rawset(v, -3);
  71. // Add attributes
  72. if (nattrs) {
  73. sq_pushliteral(v, _SC("attr"));
  74. sq_newtableex(v, nattrs);
  75. for (unsigned int i = 0; i < nattrs; ++i) {
  76. GumboAttribute *attribute = (GumboAttribute *)element->attributes.data[i];
  77. reg_string(v, attribute->name, attribute->value);
  78. }
  79. sq_rawset(v, -3);
  80. }
  81. sq_pushliteral(v, _SC("children"));
  82. sq_newarray(v, element->children.length);
  83. if(add_children(v, &element->children) == SQ_ERROR) return SQ_ERROR;
  84. sq_rawset(v, -3);
  85. return SQ_OK;
  86. }
  87. static SQRESULT build_node(HSQUIRRELVM v, GumboNode* node) {
  88. switch (node->type) {
  89. case GUMBO_NODE_DOCUMENT:
  90. build_document(v, &node->v.document);
  91. return SQTrue;
  92. case GUMBO_NODE_ELEMENT:
  93. build_element(v, &node->v.element);
  94. return SQTrue;
  95. case GUMBO_NODE_COMMENT:
  96. sq_newtableex(v, 1);
  97. reg_string(v, _SC("comment"), node->v.text.text);
  98. return SQTrue;
  99. case GUMBO_NODE_TEXT:
  100. case GUMBO_NODE_CDATA:
  101. sq_pushstring(v, node->v.text.text, -1);
  102. return SQTrue;
  103. case GUMBO_NODE_WHITESPACE:
  104. return SQFalse;
  105. default:
  106. return sq_throwerror(v, _SC("Invalid node type"));
  107. }
  108. }
  109. static inline SQRESULT parse(HSQUIRRELVM v, const SQChar *input, SQInteger len) {
  110. GumboOutput *output;
  111. output = gumbo_parse_with_options(&kGumboDefaultOptions, input, len);
  112. SQRESULT result = build_node(v, output->document);
  113. if(result == SQ_ERROR) {
  114. gumbo_destroy_output(&kGumboDefaultOptions, output);
  115. return SQ_ERROR;
  116. }
  117. sq_pushliteral(v, _SC("children"));
  118. sq_rawget(v, -2);
  119. sq_pushliteral(v, _SC("root"));
  120. sq_arrayget(v, -2, output->root->index_within_parent);
  121. sq_rawset(v, -4); //set root on main table
  122. sq_poptop(v); //remove children array from stack
  123. gumbo_destroy_output(&kGumboDefaultOptions, output);
  124. return result;
  125. }
  126. /// Parse a string of HTML
  127. // @function parse
  128. // @param document String containing HTML
  129. // @return Abstract syntax tree table
  130. // @see README.md
  131. static SQRESULT gumbo_parse(HSQUIRRELVM v) /** parse(s) */
  132. {
  133. SQ_FUNC_VARS_NO_TOP(v);
  134. SQ_GET_STRING(v, 2, input);
  135. return parse(v, input, input_size);
  136. }
  137. /// Read and parse a HTML file
  138. // @function parse_file
  139. // @param filename Path to HTML file
  140. // @return Abstract syntax tree table
  141. // @throw exception (if opening or reading file fails)
  142. static SQRESULT gumbo_parse_file(HSQUIRRELVM v) /** parse_file(s) */
  143. {
  144. SQRESULT result;
  145. SQ_FUNC_VARS_NO_TOP(v);
  146. SQ_GET_STRING(v, 2, filename);
  147. FILE *file = NULL;
  148. char *input = NULL;
  149. long len;
  150. assert(file = fopen(filename, "rb"));
  151. assert(fseek(file, 0, SEEK_END) != -1);
  152. assert((len = ftell(file)) != -1);
  153. rewind(file);
  154. assert(input = (char*)sq_malloc(len + 1));
  155. assert(fread(input, 1, len, file) == (unsigned long)len);
  156. fclose(file);
  157. input[len] = '\0';
  158. result = parse(v, input, len);
  159. sq_free(input, len+1);
  160. return result;
  161. error: // Return nil and an error message if an assertion fails
  162. if (file) fclose(file);
  163. if (input) sq_free(input, len+1);
  164. return sq_throwerror(v, strerror(errno));
  165. }
  166. #define _DECL_FUNC(name,nparams,tycheck) {_SC(#name),gumbo_##name,nparams,tycheck}
  167. static SQRegFunction gumbo_methods[] =
  168. {
  169. _DECL_FUNC(parse,2,_SC(".s")),
  170. _DECL_FUNC(parse_file,2,_SC(".s")),
  171. {0,0}
  172. };
  173. #undef _DECL_FUNC
  174. #ifdef __cplusplus
  175. extern "C" {
  176. #endif
  177. SQRESULT sqext_register_gumbo(HSQUIRRELVM v)
  178. {
  179. sq_pushstring(v,_SC("gumbo"),-1);
  180. sq_newtable(v);
  181. sq_insert_reg_funcs(v, gumbo_methods);
  182. sq_newslot(v,-3,SQTrue);
  183. return 0;
  184. }
  185. #ifdef __cplusplus
  186. }
  187. #endif