| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384 |
- /* Copyright 2013 Google Inc. All Rights Reserved.
- Distributed under MIT license.
- See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
- */
- /* Heuristics for deciding about the UTF8-ness of strings. */
- #include "./utf8_util.h"
- #include "../common/types.h"
- #if defined(__cplusplus) || defined(c_plusplus)
- extern "C" {
- #endif
/* Decodes a single symbol from the front of |input| and stores it in
   |*symbol|; the return value is the number of bytes consumed (1 to 4).

   |size| is the number of readable bytes at |input|. input[0] is read
   unconditionally, so callers must pass size >= 1; bytes past the first are
   only inspected when |size| covers them.

   A sequence is accepted only when its continuation bytes have the form
   10xxxxxx and the decoded value is larger than what the next-shorter form
   can express (so over-long encodings are refused). Four-byte sequences must
   additionally decode to at most 0x10ffff. A zero byte is not accepted as
   ASCII.

   Anything that is not accepted consumes exactly one byte and yields
   0x110000 | input[0], i.e. a value above the UTF-8 code space. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  int code;

  if ((lead & 0x80) == 0) {
    /* ASCII: 0xxxxxxx. Zero falls through to the fallback below. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    /* 2-byte UTF8: 110xxxxx 10xxxxxx. */
    if (size >= 2u && (input[1] & 0xc0) == 0x80) {
      code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead & 0xf0) == 0xe0) {
    /* 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx. */
    if (size >= 3u &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80) {
      code = ((lead & 0x0f) << 12) |
             ((input[1] & 0x3f) << 6) |
             (input[2] & 0x3f);
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead & 0xf8) == 0xf0) {
    /* 4-byte UTF8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
    if (size >= 4u &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80 &&
        (input[3] & 0xc0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3f) << 12) |
             ((input[2] & 0x3f) << 6) |
             (input[3] & 0x3f);
      if (code >= 0x10000 && code <= 0x10ffff) {
        *symbol = code;
        return 4;
      }
    }
  }

  /* Not UTF8: emit a special symbol above the UTF8 code space. */
  *symbol = 0x110000 | lead;
  return 1;
}
/* Returns 1 when the number of bytes that parse as UTF8 is strictly greater
   than |min_fraction| * |length|, and 0 otherwise.

   The scan covers |length| bytes starting at logical position |pos|; each
   parse starts at data[(pos + offset) & mask]. Bytes are fed through
   BrotliParseAsUTF8 one symbol at a time, and a symbol counts towards the
   UTF8 total only when its value is below 0x110000. */
int BrotliIsMostlyUTF8(const uint8_t* data, const size_t pos,
    const size_t mask, const size_t length, const double min_fraction) {
  size_t utf8_bytes = 0;
  size_t offset;

  for (offset = 0; offset < length;) {
    int code;
    const size_t consumed = BrotliParseAsUTF8(
        &code, &data[(pos + offset) & mask], length - offset);
    if (code < 0x110000) {
      utf8_bytes += consumed;
    }
    /* Always advance by what the parser consumed (at least one byte). */
    offset += consumed;
  }

  if (utf8_bytes > min_fraction * (double)length) {
    return 1;
  }
  return 0;
}
- #if defined(__cplusplus) || defined(c_plusplus)
- } /* extern "C" */
- #endif
|