|
@@ -2087,11 +2087,6 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
|
|
|
return ERR_INVALID_DATA;
|
|
|
}
|
|
|
|
|
|
- String aux;
|
|
|
-
|
|
|
- int cstr_size = 0;
|
|
|
- int str_size = 0;
|
|
|
-
|
|
|
/* HANDLE BOM (Byte Order Mark) */
|
|
|
if (p_len < 0 || p_len >= 3) {
|
|
|
bool has_bom = uint8_t(p_utf8[0]) == 0xef && uint8_t(p_utf8[1]) == 0xbb && uint8_t(p_utf8[2]) == 0xbf;
|
|
@@ -2104,162 +2099,160 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- bool decode_error = false;
|
|
|
- bool decode_failed = false;
|
|
|
- {
|
|
|
- const char *ptrtmp = p_utf8;
|
|
|
- const char *ptrtmp_limit = p_len >= 0 ? &p_utf8[p_len] : nullptr;
|
|
|
- int skip = 0;
|
|
|
- uint8_t c_start = 0;
|
|
|
- while (ptrtmp != ptrtmp_limit && *ptrtmp) {
|
|
|
-#if CHAR_MIN == 0
|
|
|
- uint8_t c = *ptrtmp;
|
|
|
-#else
|
|
|
- uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
|
|
|
-#endif
|
|
|
-
|
|
|
- if (skip == 0) {
|
|
|
- if (p_skip_cr && c == '\r') {
|
|
|
- ptrtmp++;
|
|
|
- continue;
|
|
|
- }
|
|
|
- /* Determine the number of characters in sequence */
|
|
|
- if ((c & 0x80) == 0) {
|
|
|
- skip = 0;
|
|
|
- } else if ((c & 0xe0) == 0xc0) {
|
|
|
- skip = 1;
|
|
|
- } else if ((c & 0xf0) == 0xe0) {
|
|
|
- skip = 2;
|
|
|
- } else if ((c & 0xf8) == 0xf0) {
|
|
|
- skip = 3;
|
|
|
- } else if ((c & 0xfc) == 0xf8) {
|
|
|
- skip = 4;
|
|
|
- } else if ((c & 0xfe) == 0xfc) {
|
|
|
- skip = 5;
|
|
|
- } else {
|
|
|
- skip = 0;
|
|
|
- print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
|
|
|
- decode_failed = true;
|
|
|
- }
|
|
|
- c_start = c;
|
|
|
+ if (p_len < 0) {
|
|
|
+ p_len = strlen(p_utf8);
|
|
|
+ }
|
|
|
|
|
|
- if (skip == 1 && (c & 0x1e) == 0) {
|
|
|
- print_unicode_error(vformat("Overlong encoding (%x ...)", c));
|
|
|
- decode_error = true;
|
|
|
- }
|
|
|
- str_size++;
|
|
|
- } else {
|
|
|
- if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) {
|
|
|
- print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c));
|
|
|
- decode_error = true;
|
|
|
- }
|
|
|
- if (c < 0x80 || c > 0xbf) {
|
|
|
- print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true);
|
|
|
- decode_failed = true;
|
|
|
- skip = 0;
|
|
|
- } else {
|
|
|
- --skip;
|
|
|
- }
|
|
|
- }
|
|
|
+ // If every UTF-8 character maps to ASCII, the maximum size is p_len; we add +1 for the null terminator.
|
|
|
+ resize(p_len + 1);
|
|
|
+ char32_t *dst = ptrw();
|
|
|
|
|
|
- cstr_size++;
|
|
|
- ptrtmp++;
|
|
|
- }
|
|
|
+ Error result = Error::OK;
|
|
|
|
|
|
- if (skip) {
|
|
|
- print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true);
|
|
|
- decode_failed = true;
|
|
|
- }
|
|
|
- }
|
|
|
+ const uint8_t *ptrtmp = (uint8_t *)p_utf8;
|
|
|
+ const uint8_t *ptr_limit = (uint8_t *)p_utf8 + p_len;
|
|
|
|
|
|
- if (str_size == 0) {
|
|
|
- clear();
|
|
|
- return OK; // empty string
|
|
|
- }
|
|
|
+ while (ptrtmp < ptr_limit && *ptrtmp) {
|
|
|
+ uint8_t c = *ptrtmp;
|
|
|
|
|
|
- resize(str_size + 1);
|
|
|
- char32_t *dst = ptrw();
|
|
|
- dst[str_size] = 0;
|
|
|
-
|
|
|
- int skip = 0;
|
|
|
- uint32_t unichar = 0;
|
|
|
- while (cstr_size) {
|
|
|
-#if CHAR_MIN == 0
|
|
|
- uint8_t c = *p_utf8;
|
|
|
-#else
|
|
|
- uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
|
|
|
-#endif
|
|
|
+ if (p_skip_cr && c == '\r') {
|
|
|
+ ++ptrtmp;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ uint32_t unicode = _replacement_char;
|
|
|
+ uint32_t size = 1;
|
|
|
|
|
|
- if (skip == 0) {
|
|
|
- if (p_skip_cr && c == '\r') {
|
|
|
- p_utf8++;
|
|
|
- continue;
|
|
|
+ if ((c & 0b10000000) == 0) {
|
|
|
+ unicode = c;
|
|
|
+ if (unicode > 0x7F) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
}
|
|
|
- /* Determine the number of characters in sequence */
|
|
|
- if ((c & 0x80) == 0) {
|
|
|
- *(dst++) = c;
|
|
|
- unichar = 0;
|
|
|
- skip = 0;
|
|
|
- } else if ((c & 0xe0) == 0xc0) {
|
|
|
- unichar = (0xff >> 3) & c;
|
|
|
- skip = 1;
|
|
|
- } else if ((c & 0xf0) == 0xe0) {
|
|
|
- unichar = (0xff >> 4) & c;
|
|
|
- skip = 2;
|
|
|
- } else if ((c & 0xf8) == 0xf0) {
|
|
|
- unichar = (0xff >> 5) & c;
|
|
|
- skip = 3;
|
|
|
- } else if ((c & 0xfc) == 0xf8) {
|
|
|
- unichar = (0xff >> 6) & c;
|
|
|
- skip = 4;
|
|
|
- } else if ((c & 0xfe) == 0xfc) {
|
|
|
- unichar = (0xff >> 7) & c;
|
|
|
- skip = 5;
|
|
|
+ } else if ((c & 0b11100000) == 0b11000000) {
|
|
|
+ if (ptrtmp + 1 >= ptr_limit) {
|
|
|
+ print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
} else {
|
|
|
- *(dst++) = _replacement_char;
|
|
|
- unichar = 0;
|
|
|
- skip = 0;
|
|
|
+ uint8_t c2 = *(ptrtmp + 1);
|
|
|
+
|
|
|
+ if ((c2 & 0b11000000) == 0b10000000) {
|
|
|
+ unicode = (uint32_t)((c & 0b00011111) << 6) | (uint32_t)(c2 & 0b00111111);
|
|
|
+
|
|
|
+ if (unicode < 0x80) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Overlong encoding (%x %x)", c, c2));
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ } else if (unicode > 0x7FF) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ } else {
|
|
|
+ size = 2;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ }
|
|
|
}
|
|
|
- } else {
|
|
|
- if (c < 0x80 || c > 0xbf) {
|
|
|
- *(dst++) = _replacement_char;
|
|
|
- skip = 0;
|
|
|
+ } else if ((c & 0b11110000) == 0b11100000) {
|
|
|
+ uint32_t range_min = (c == 0xE0) ? 0xA0 : 0x80;
|
|
|
+ uint32_t range_max = (c == 0xED) ? 0x9F : 0xBF;
|
|
|
+ uint8_t c2 = (ptrtmp + 1) < ptr_limit ? *(ptrtmp + 1) : 0;
|
|
|
+ uint8_t c3 = (ptrtmp + 2) < ptr_limit ? *(ptrtmp + 2) : 0;
|
|
|
+ bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
|
|
|
+ bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
|
|
|
+
|
|
|
+ if (c2_valid && c3_valid) {
|
|
|
+ unicode = (uint32_t)((c & 0b00001111) << 12) | (uint32_t)((c2 & 0b00111111) << 6) | (uint32_t)(c3 & 0b00111111);
|
|
|
+
|
|
|
+ if (unicode < 0x800) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Overlong encoding (%x %x %x)", c, c2, c3));
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ } else if (unicode > 0xFFFF) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ } else {
|
|
|
+ size = 3;
|
|
|
+ }
|
|
|
} else {
|
|
|
- unichar = (unichar << 6) | (c & 0x3f);
|
|
|
- --skip;
|
|
|
- if (skip == 0) {
|
|
|
- if (unichar == 0) {
|
|
|
- print_unicode_error("NUL character", true);
|
|
|
- decode_failed = true;
|
|
|
- unichar = _replacement_char;
|
|
|
- } else if ((unichar & 0xfffff800) == 0xd800) {
|
|
|
- print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
|
|
|
- decode_failed = true;
|
|
|
- unichar = _replacement_char;
|
|
|
- } else if (unichar > 0x10ffff) {
|
|
|
- print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
|
|
|
- decode_failed = true;
|
|
|
- unichar = _replacement_char;
|
|
|
- }
|
|
|
- *(dst++) = unichar;
|
|
|
+ if (c2 == 0) {
|
|
|
+ print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
|
|
|
+ } else if (c2_valid == false) {
|
|
|
+ print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
|
|
|
+ } else if (c3 == 0) {
|
|
|
+ print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
|
|
|
+ } else {
|
|
|
+ print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
|
|
|
+ // The Unicode Standard, in section 3.9 "Unicode Encoding Forms" (Conformance),
|
|
|
+ // states: "Only when a sequence of two or three bytes is a truncated version of a sequence which is
|
|
|
+ // otherwise well-formed to that point, is more than one byte replaced with a single U+FFFD"
|
|
|
+ // So here we replace the first 2 bytes with one single replacement_char.
|
|
|
+ size = 2;
|
|
|
}
|
|
|
+
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
}
|
|
|
+ } else if ((c & 0b11111000) == 0b11110000) {
|
|
|
+ uint32_t range_min = (c == 0xF0) ? 0x90 : 0x80;
|
|
|
+ uint32_t range_max = (c == 0xF4) ? 0x8F : 0xBF;
|
|
|
+
|
|
|
+ uint8_t c2 = ((ptrtmp + 1) < ptr_limit) ? *(ptrtmp + 1) : 0;
|
|
|
+ uint8_t c3 = ((ptrtmp + 2) < ptr_limit) ? *(ptrtmp + 2) : 0;
|
|
|
+ uint8_t c4 = ((ptrtmp + 3) < ptr_limit) ? *(ptrtmp + 3) : 0;
|
|
|
+
|
|
|
+ bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
|
|
|
+ bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
|
|
|
+ bool c4_valid = c4 && ((c4 & 0b11000000) == 0b10000000);
|
|
|
+
|
|
|
+ if (c2_valid && c3_valid && c4_valid) {
|
|
|
+ unicode = (uint32_t)((c & 0b00000111) << 18) | (uint32_t)((c2 & 0b00111111) << 12) | (uint32_t)((c3 & 0b00111111) << 6) | (uint32_t)(c4 & 0b00111111);
|
|
|
+
|
|
|
+ if (unicode < 0x10000) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Overlong encoding (%x %x %x %x)", c, c2, c3, c4));
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ } else if (unicode > 0x10FFFF) {
|
|
|
+ unicode = _replacement_char;
|
|
|
+ print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ } else {
|
|
|
+ size = 4;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if (c2 == 0) {
|
|
|
+ print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
|
|
|
+ } else if (c2_valid == false) {
|
|
|
+ print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
|
|
|
+ } else if (c3 == 0) {
|
|
|
+ print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
|
|
|
+ } else if (c3_valid == false) {
|
|
|
+ print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
|
|
|
+ size = 2;
|
|
|
+ } else if (c4 == 0) {
|
|
|
+ print_unicode_error(vformat("Missing %x %x %x UTF-8 continuation byte", c, c2, c3), true);
|
|
|
+ } else {
|
|
|
+ print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x %x", c4, c, c2, c3));
|
|
|
+ size = 3;
|
|
|
+ }
|
|
|
+
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
|
|
|
+ result = Error::ERR_INVALID_DATA;
|
|
|
}
|
|
|
|
|
|
- cstr_size--;
|
|
|
- p_utf8++;
|
|
|
- }
|
|
|
- if (skip) {
|
|
|
- *(dst++) = 0x20;
|
|
|
+ (*dst++) = unicode;
|
|
|
+ ptrtmp += size;
|
|
|
}
|
|
|
|
|
|
- if (decode_failed) {
|
|
|
- return ERR_INVALID_DATA;
|
|
|
- } else if (decode_error) {
|
|
|
- return ERR_PARSE_ERROR;
|
|
|
- } else {
|
|
|
- return OK;
|
|
|
- }
|
|
|
+ (*dst++) = 0;
|
|
|
+ resize(dst - ptr());
|
|
|
+
|
|
|
+ return result;
|
|
|
}
|
|
|
|
|
|
CharString String::utf8() const {
|