| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- // Copyright 2010 Google Inc. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- // Author: [email protected] (Jonathan Tang)
- #include "error.h"
- #include <assert.h>
- #include <stdarg.h>
- #include <stdio.h>
- #include <string.h>
- #include "gumbo.h"
- #include "parser.h"
- #include "string_buffer.h"
- #include "util.h"
- #include "vector.h"
- static const size_t kMessageBufferSize = 256;
- // Prints a formatted message to a StringBuffer. This automatically resizes the
- // StringBuffer as necessary to fit the message. Returns the number of bytes
- // written.
- static int print_message(GumboParser* parser, GumboStringBuffer* output,
- const char* format, ...) {
- va_list args;
- va_start(args, format);
- int remaining_capacity = output->capacity - output->length;
- int bytes_written = vsnprintf(output->data + output->length,
- remaining_capacity, format, args);
- if (bytes_written > remaining_capacity) {
- gumbo_string_buffer_reserve(
- parser, output->capacity + bytes_written, output);
- remaining_capacity = output->capacity - output->length;
- bytes_written = vsnprintf(output->data + output->length,
- remaining_capacity, format, args);
- }
- output->length += bytes_written;
- va_end(args);
- return bytes_written;
- }
- static void print_tag_stack(
- GumboParser* parser, const GumboParserError* error,
- GumboStringBuffer* output) {
- print_message(parser, output, " Currently open tags: ");
- for (int i = 0; i < error->tag_stack.length; ++i) {
- if (i) {
- print_message(parser, output, ", ");
- }
- GumboTag tag = (GumboTag) error->tag_stack.data[i];
- print_message(parser, output, gumbo_normalized_tagname(tag));
- }
- gumbo_string_buffer_append_codepoint(parser, '.', output);
- }
- static void handle_parser_error(GumboParser* parser,
- const GumboParserError* error,
- GumboStringBuffer* output) {
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
- error->input_type != GUMBO_TOKEN_DOCTYPE) {
- print_message(parser, output,
- "The doctype must be the first token in the document");
- return;
- }
- switch (error->input_type) {
- case GUMBO_TOKEN_DOCTYPE:
- print_message(parser, output, "This is not a legal doctype");
- return;
- case GUMBO_TOKEN_COMMENT:
- // Should never happen; comments are always legal.
- assert(0);
- // But just in case...
- print_message(parser, output, "Comments aren't legal here");
- return;
- case GUMBO_TOKEN_WHITESPACE:
- case GUMBO_TOKEN_CHARACTER:
- print_message(parser, output, "Character tokens aren't legal here");
- return;
- case GUMBO_TOKEN_NULL:
- print_message(parser, output, "Null bytes are not allowed in HTML5");
- return;
- case GUMBO_TOKEN_EOF:
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
- print_message(parser, output, "You must provide a doctype");
- } else {
- print_message(parser, output, "Premature end of file");
- print_tag_stack(parser, error, output);
- }
- return;
- case GUMBO_TOKEN_START_TAG:
- case GUMBO_TOKEN_END_TAG:
- print_message(parser, output, "That tag isn't allowed here");
- print_tag_stack(parser, error, output);
- // TODO(jdtang): Give more specific messaging.
- return;
- }
- }
// Finds the start of the line that contains error_location in the original
// source buffer. A newline character counts as part of the line it ends, so
// an error located on a '\n' reports the line before it, not the one after.
//
// original_text:  start of the source buffer.
// error_location: byte the error refers to; must not precede original_text.
//
// Returns a pointer to the first character of that line, which is
// original_text itself when the error is on the first line. The result is
// never greater than error_location.
static const char* find_last_newline(
    const char* original_text, const char* error_location) {
  assert(error_location >= original_text);
  const char* c = error_location;
  // Inspect the character *before* c: this keeps a newline at error_location
  // from being mistaken for the preceding line break, and handles a buffer
  // whose very first byte is a newline.
  while (c != original_text && c[-1] != '\n') {
    // Only the error location itself may be a nul byte (an error at EOF);
    // everything before it on the line is expected to be real text.
    assert(c[-1] != '\0');
    --c;
  }
  return c;
}
// Locates the end of the line containing error_location: scans forward from
// error_location and returns a pointer to the first '\n' found, or to the
// terminating nul byte when no newline follows. original_text is not used.
static const char* find_next_newline(
    const char* original_text, const char* error_location) {
  (void) original_text;
  // strcspn yields the length of the leading run that contains no '\n'; it
  // stops at the terminating nul on its own.
  return error_location + strcspn(error_location, "\n");
}
- GumboError* gumbo_add_error(GumboParser* parser) {
- int max_errors = parser->_options->max_errors;
- if (max_errors >= 0 && parser->_output->errors.length >= max_errors) {
- return NULL;
- }
- GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
- gumbo_vector_add(parser, error, &parser->_output->errors);
- return error;
- }
- void gumbo_error_to_string(
- GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
- print_message(parser, output, "@%d:%d: ",
- error->position.line, error->position.column);
- switch (error->type) {
- case GUMBO_ERR_UTF8_INVALID:
- print_message(parser, output, "Invalid UTF8 character 0x%x",
- error->v.codepoint);
- break;
- case GUMBO_ERR_UTF8_TRUNCATED:
- print_message(parser, output,
- "Input stream ends with a truncated UTF8 character 0x%x",
- error->v.codepoint);
- break;
- case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
- print_message(parser, output,
- "No digits after &# in numeric character reference");
- break;
- case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
- print_message(parser, output,
- "The numeric character reference &#%d should be followed "
- "by a semicolon", error->v.codepoint);
- break;
- case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
- print_message(parser, output,
- "The numeric character reference &#%d; encodes an invalid "
- "unicode codepoint", error->v.codepoint);
- break;
- case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
- // The textual data came from one of the literal strings in the table, and
- // so it'll be null-terminated.
- print_message(parser, output,
- "The named character reference &%.*s should be followed by a "
- "semicolon", (int) error->v.text.length, error->v.text.data);
- break;
- case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
- print_message(parser, output,
- "The named character reference &%.*s; is not a valid entity name",
- (int) error->v.text.length, error->v.text.data);
- break;
- case GUMBO_ERR_DUPLICATE_ATTR:
- print_message(parser, output,
- "Attribute %s occurs multiple times, at positions %d and %d",
- error->v.duplicate_attr.name,
- error->v.duplicate_attr.original_index,
- error->v.duplicate_attr.new_index);
- break;
- case GUMBO_ERR_PARSER:
- case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
- handle_parser_error(parser, &error->v.parser, output);
- break;
- default:
- print_message(parser, output,
- "Tokenizer error with an unimplemented error message");
- break;
- }
- gumbo_string_buffer_append_codepoint(parser, '.', output);
- }
- void gumbo_caret_diagnostic_to_string(
- GumboParser* parser, const GumboError* error,
- const char* source_text, GumboStringBuffer* output) {
- gumbo_error_to_string(parser, error, output);
- const char* line_start =
- find_last_newline(source_text, error->original_text);
- const char* line_end =
- find_next_newline(source_text, error->original_text);
- GumboStringPiece original_line;
- original_line.data = line_start;
- original_line.length = line_end - line_start;
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
- gumbo_string_buffer_append_string(parser, &original_line, output);
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
- gumbo_string_buffer_reserve(
- parser, output->length + error->position.column, output);
- int num_spaces = error->position.column - 1;
- memset(output->data + output->length, ' ', num_spaces);
- output->length += num_spaces;
- gumbo_string_buffer_append_codepoint(parser, '^', output);
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
- }
- void gumbo_print_caret_diagnostic(
- GumboParser* parser, const GumboError* error, const char* source_text) {
- GumboStringBuffer text;
- gumbo_string_buffer_init(parser, &text);
- gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
- printf("%.*s", (int) text.length, text.data);
- gumbo_string_buffer_destroy(parser, &text);
- }
- void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
- if (error->type == GUMBO_ERR_PARSER ||
- error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
- gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
- } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
- gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
- }
- gumbo_parser_deallocate(parser, error);
- }
- void gumbo_init_errors(GumboParser* parser) {
- gumbo_vector_init(parser, 5, &parser->_output->errors);
- }
- void gumbo_destroy_errors(GumboParser* parser) {
- for (int i = 0; i < parser->_output->errors.length; ++i) {
- gumbo_error_destroy(parser, parser->_output->errors.data[i]);
- }
- gumbo_vector_destroy(parser, &parser->_output->errors);
- }
|