error.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. // Copyright 2010 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Author: [email protected] (Jonathan Tang)
  16. #include "error.h"
  17. #include <assert.h>
  18. #include <stdarg.h>
  19. #include <stdio.h>
  20. #include <string.h>
  21. #include "gumbo.h"
  22. #include "parser.h"
  23. #include "string_buffer.h"
  24. #include "util.h"
  25. #include "vector.h"
  26. // Prints a formatted message to a StringBuffer. This automatically resizes the
  27. // StringBuffer as necessary to fit the message. Returns the number of bytes
  28. // written.
  29. static int print_message(
  30. GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
  31. va_list args;
  32. int remaining_capacity = output->capacity - output->length;
  33. va_start(args, format);
  34. int bytes_written = vsnprintf(
  35. output->data + output->length, remaining_capacity, format, args);
  36. va_end(args);
  37. #ifdef _MSC_VER
  38. if (bytes_written == -1) {
  39. // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
  40. // returning the number of bytes that would've been written had there been
  41. // enough. In this case, we'll double the buffer size and hope it fits when
  42. // we retry (letting it fail and returning 0 if it doesn't), since there's
  43. // no way to smartly resize the buffer.
  44. gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
  45. va_start(args, format);
  46. int result = vsnprintf(
  47. output->data + output->length, remaining_capacity, format, args);
  48. va_end(args);
  49. return result == -1 ? 0 : result;
  50. }
  51. #else
  52. // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
  53. if (bytes_written == -1) {
  54. return 0;
  55. }
  56. #endif
  57. if (bytes_written > remaining_capacity) {
  58. gumbo_string_buffer_reserve(
  59. parser, output->capacity + bytes_written, output);
  60. remaining_capacity = output->capacity - output->length;
  61. va_start(args, format);
  62. bytes_written = vsnprintf(
  63. output->data + output->length, remaining_capacity, format, args);
  64. va_end(args);
  65. }
  66. output->length += bytes_written;
  67. return bytes_written;
  68. }
  69. static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
  70. GumboStringBuffer* output) {
  71. print_message(parser, output, " Currently open tags: ");
  72. for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
  73. if (i) {
  74. print_message(parser, output, ", ");
  75. }
  76. GumboTag tag = (GumboTag) error->tag_stack.data[i];
  77. print_message(parser, output, gumbo_normalized_tagname(tag));
  78. }
  79. gumbo_string_buffer_append_codepoint(parser, '.', output);
  80. }
  81. static void handle_parser_error(GumboParser* parser,
  82. const GumboParserError* error, GumboStringBuffer* output) {
  83. if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
  84. error->input_type != GUMBO_TOKEN_DOCTYPE) {
  85. print_message(
  86. parser, output, "The doctype must be the first token in the document");
  87. return;
  88. }
  89. switch (error->input_type) {
  90. case GUMBO_TOKEN_DOCTYPE:
  91. print_message(parser, output, "This is not a legal doctype");
  92. return;
  93. case GUMBO_TOKEN_COMMENT:
  94. // Should never happen; comments are always legal.
  95. assert(0);
  96. // But just in case...
  97. print_message(parser, output, "Comments aren't legal here");
  98. return;
  99. case GUMBO_TOKEN_CDATA:
  100. case GUMBO_TOKEN_WHITESPACE:
  101. case GUMBO_TOKEN_CHARACTER:
  102. print_message(parser, output, "Character tokens aren't legal here");
  103. return;
  104. case GUMBO_TOKEN_NULL:
  105. print_message(parser, output, "Null bytes are not allowed in HTML5");
  106. return;
  107. case GUMBO_TOKEN_EOF:
  108. if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
  109. print_message(parser, output, "You must provide a doctype");
  110. } else {
  111. print_message(parser, output, "Premature end of file");
  112. print_tag_stack(parser, error, output);
  113. }
  114. return;
  115. case GUMBO_TOKEN_START_TAG:
  116. case GUMBO_TOKEN_END_TAG:
  117. print_message(parser, output, "That tag isn't allowed here");
  118. print_tag_stack(parser, error, output);
  119. // TODO(jdtang): Give more specific messaging.
  120. return;
  121. }
  122. }
  123. // Finds the preceding newline in an original source buffer from a given byte
  124. // location. Returns a character pointer to the character after that, or a
  125. // pointer to the beginning of the string if this is the first line.
  126. static const char* find_last_newline(
  127. const char* original_text, const char* error_location) {
  128. assert(error_location >= original_text);
  129. const char* c = error_location;
  130. for (; c != original_text && *c != '\n'; --c) {
  131. // There may be an error at EOF, which would be a nul byte.
  132. assert(*c || c == error_location);
  133. }
  134. return c == original_text ? c : c + 1;
  135. }
  136. // Finds the next newline in the original source buffer from a given byte
  137. // location. Returns a character pointer to that newline, or a pointer to the
  138. // terminating null byte if this is the last line.
  139. static const char* find_next_newline(
  140. const char* original_text, const char* error_location) {
  141. const char* c = error_location;
  142. for (; *c && *c != '\n'; ++c)
  143. ;
  144. return c;
  145. }
  146. GumboError* gumbo_add_error(GumboParser* parser) {
  147. int max_errors = parser->_options->max_errors;
  148. if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
  149. return NULL;
  150. }
  151. GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
  152. gumbo_vector_add(parser, error, &parser->_output->errors);
  153. return error;
  154. }
  155. void gumbo_error_to_string(
  156. GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
  157. print_message(
  158. parser, output, "@%d:%d: ", error->position.line, error->position.column);
  159. switch (error->type) {
  160. case GUMBO_ERR_UTF8_INVALID:
  161. print_message(
  162. parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
  163. break;
  164. case GUMBO_ERR_UTF8_TRUNCATED:
  165. print_message(parser, output,
  166. "Input stream ends with a truncated UTF8 character 0x%x",
  167. error->v.codepoint);
  168. break;
  169. case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
  170. print_message(
  171. parser, output, "No digits after &# in numeric character reference");
  172. break;
  173. case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
  174. print_message(parser, output,
  175. "The numeric character reference &#%d should be followed "
  176. "by a semicolon",
  177. error->v.codepoint);
  178. break;
  179. case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
  180. print_message(parser, output,
  181. "The numeric character reference &#%d; encodes an invalid "
  182. "unicode codepoint",
  183. error->v.codepoint);
  184. break;
  185. case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
  186. // The textual data came from one of the literal strings in the table, and
  187. // so it'll be null-terminated.
  188. print_message(parser, output,
  189. "The named character reference &%.*s should be followed by a "
  190. "semicolon",
  191. (int) error->v.text.length, error->v.text.data);
  192. break;
  193. case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
  194. print_message(parser, output,
  195. "The named character reference &%.*s; is not a valid entity name",
  196. (int) error->v.text.length, error->v.text.data);
  197. break;
  198. case GUMBO_ERR_DUPLICATE_ATTR:
  199. print_message(parser, output,
  200. "Attribute %s occurs multiple times, at positions %d and %d",
  201. error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
  202. error->v.duplicate_attr.new_index);
  203. break;
  204. case GUMBO_ERR_PARSER:
  205. case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
  206. handle_parser_error(parser, &error->v.parser, output);
  207. break;
  208. default:
  209. print_message(parser, output,
  210. "Tokenizer error with an unimplemented error message");
  211. break;
  212. }
  213. gumbo_string_buffer_append_codepoint(parser, '.', output);
  214. }
  215. void gumbo_caret_diagnostic_to_string(GumboParser* parser,
  216. const GumboError* error, const char* source_text,
  217. GumboStringBuffer* output) {
  218. gumbo_error_to_string(parser, error, output);
  219. const char* line_start = find_last_newline(source_text, error->original_text);
  220. const char* line_end = find_next_newline(source_text, error->original_text);
  221. GumboStringPiece original_line;
  222. original_line.data = line_start;
  223. original_line.length = line_end - line_start;
  224. gumbo_string_buffer_append_codepoint(parser, '\n', output);
  225. gumbo_string_buffer_append_string(parser, &original_line, output);
  226. gumbo_string_buffer_append_codepoint(parser, '\n', output);
  227. gumbo_string_buffer_reserve(
  228. parser, output->length + error->position.column, output);
  229. int num_spaces = error->position.column - 1;
  230. memset(output->data + output->length, ' ', num_spaces);
  231. output->length += num_spaces;
  232. gumbo_string_buffer_append_codepoint(parser, '^', output);
  233. gumbo_string_buffer_append_codepoint(parser, '\n', output);
  234. }
  235. void gumbo_print_caret_diagnostic(
  236. GumboParser* parser, const GumboError* error, const char* source_text) {
  237. GumboStringBuffer text;
  238. gumbo_string_buffer_init(parser, &text);
  239. gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
  240. printf("%.*s", (int) text.length, text.data);
  241. gumbo_string_buffer_destroy(parser, &text);
  242. }
  243. void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
  244. if (error->type == GUMBO_ERR_PARSER ||
  245. error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
  246. gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
  247. } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
  248. gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
  249. }
  250. gumbo_parser_deallocate(parser, error);
  251. }
  252. void gumbo_init_errors(GumboParser* parser) {
  253. gumbo_vector_init(parser, 5, &parser->_output->errors);
  254. }
  255. void gumbo_destroy_errors(GumboParser* parser) {
  256. for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
  257. gumbo_error_destroy(parser, parser->_output->errors.data[i]);
  258. }
  259. gumbo_vector_destroy(parser, &parser->_output->errors);
  260. }