utf8_util.c 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. /* Copyright 2013 Google Inc. All Rights Reserved.
  2. Distributed under MIT license.
  3. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  4. */
  5. /* Heuristics for deciding about the UTF8-ness of strings. */
  6. #include "./utf8_util.h"
  7. #include "../common/types.h"
  8. #if defined(__cplusplus) || defined(c_plusplus)
  9. extern "C" {
  10. #endif
  /* Decodes one symbol from the start of |input|, which holds |size| readable
     bytes (the first byte is read unconditionally).

     On a well-formed, shortest-form UTF-8 sequence of 1..4 bytes the decoded
     code point is stored in |*symbol| and the sequence length is returned.
     A NUL byte, an overlong encoding, a truncated sequence, a bad continuation
     byte or a value above 0x10ffff is not accepted: in that case |*symbol| is
     set to 0x110000 | input[0], i.e. a marker above the UTF-8 code space, and
     a single byte is consumed. */
  static size_t BrotliParseAsUTF8(
      int* symbol, const uint8_t* input, size_t size) {
    const int lead = input[0];
    size_t width = 0;     /* Sequence length announced by the lead byte. */
    int lowest = 0;       /* Smallest value that needs |width| bytes. */
    int value = 0;        /* Code point assembled so far. */
    size_t k;

    if ((lead & 0x80) == 0) {
      /* Single-byte (ASCII) case; zero is not treated as text. */
      if (lead > 0) {
        *symbol = lead;
        return 1;
      }
    } else if ((lead & 0xe0) == 0xc0) {
      width = 2;
      lowest = 0x80;
      value = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
      width = 3;
      lowest = 0x800;
      value = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
      width = 4;
      lowest = 0x10000;
      value = lead & 0x07;
    }

    if (width != 0 && size >= width) {
      /* Fold in the continuation bytes, stopping at the first malformed one. */
      for (k = 1; k < width; ++k) {
        if ((input[k] & 0xc0) != 0x80) {
          break;
        }
        value = (value << 6) | (input[k] & 0x3f);
      }
      /* Accept only complete, shortest-form sequences inside the code space. */
      if (k == width && value >= lowest && value <= 0x10ffff) {
        *symbol = value;
        return width;
      }
    }

    /* Not UTF-8: report a marker symbol above the UTF-8 code space. */
    *symbol = 0x110000 | lead;
    return 1;
  }
  /* Scans |length| bytes of |data|, addressing byte number i of the scan as
     data[(pos + i) & mask], and counts the bytes that BrotliParseAsUTF8
     decodes to a symbol below 0x110000.
     Returns 1 if that count is strictly greater than min_fraction * length,
     and 0 otherwise. */
  int BrotliIsMostlyUTF8(const uint8_t* data, const size_t pos,
      const size_t mask, const size_t length, const double min_fraction) {
    size_t utf8_bytes = 0;
    size_t offset;

    for (offset = 0; offset < length;) {
      int code;
      const size_t consumed = BrotliParseAsUTF8(
          &code, &data[(pos + offset) & mask], length - offset);
      /* Symbols at or above 0x110000 mark bytes that did not parse. */
      if (code < 0x110000) {
        utf8_bytes += consumed;
      }
      offset += consumed;
    }

    if (utf8_bytes > min_fraction * (double)length) {
      return 1;
    }
    return 0;
  }
  74. #if defined(__cplusplus) || defined(c_plusplus)
  75. } /* extern "C" */
  76. #endif