| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164 |
- #pragma warning(push)
- #pragma warning(disable: 4245)
- extern "C" {
- #include "utf8proc/utf8proc.c"
- }
- #pragma warning(pop)
- bool rune_is_letter(Rune r) {
- if (r < 0x80) {
- if (r == '_') {
- return true;
- }
- return ((cast(u32)r | 0x20) - 0x61) < 26;
- }
- switch (utf8proc_category(r)) {
- case UTF8PROC_CATEGORY_LU:
- case UTF8PROC_CATEGORY_LL:
- case UTF8PROC_CATEGORY_LT:
- case UTF8PROC_CATEGORY_LM:
- case UTF8PROC_CATEGORY_LO:
- return true;
- }
- return false;
- }
- bool rune_is_digit(Rune r) {
- if (r < 0x80) {
- return (cast(u32)r - '0') < 10;
- }
- return utf8proc_category(r) == UTF8PROC_CATEGORY_ND;
- }
- bool rune_is_letter_or_digit(Rune r) {
- if (r < 0x80) {
- if (r == '_') {
- return true;
- }
- if (((cast(u32)r | 0x20) - 0x61) < 26) {
- return true;
- }
- return (cast(u32)r - '0') < 10;
- }
- switch (utf8proc_category(r)) {
- case UTF8PROC_CATEGORY_LU:
- case UTF8PROC_CATEGORY_LL:
- case UTF8PROC_CATEGORY_LT:
- case UTF8PROC_CATEGORY_LM:
- case UTF8PROC_CATEGORY_LO:
- return true;
- case UTF8PROC_CATEGORY_ND:
- return true;
- }
- return false;
- }
- bool rune_is_whitespace(Rune r) {
- switch (r) {
- case ' ':
- case '\t':
- case '\n':
- case '\r':
- return true;
- }
- return false;
- }
- gb_global u8 const global__utf8_first[256] = {
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6F
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7F
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8F
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9F
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xA0-0xAF
- 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xB0-0xBF
- 0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xC0-0xCF
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xD0-0xDF
- 0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xE0-0xEF
- 0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xF0-0xFF
- };
- typedef struct Utf8AcceptRange {
- u8 lo, hi;
- } Utf8AcceptRange;
- gb_global Utf8AcceptRange const global__utf8_accept_ranges[] = {
- {0x80, 0xbf},
- {0xa0, 0xbf},
- {0x80, 0x9f},
- {0x90, 0xbf},
- {0x80, 0x8f},
- };
- isize utf8_decode(u8 const *str, isize str_len, Rune *codepoint_out) {
- isize width = 0;
- Rune codepoint = GB_RUNE_INVALID;
- if (str_len > 0) {
- u8 s0 = str[0];
- u8 x = global__utf8_first[s0], sz;
- u8 b1, b2, b3;
- Utf8AcceptRange accept;
- if (x >= 0xf0) {
- Rune mask = (cast(Rune)x << 31) >> 31;
- codepoint = (cast(Rune)s0 & (~mask)) | (GB_RUNE_INVALID & mask);
- width = 1;
- goto end;
- }
- if (s0 < 0x80) {
- codepoint = s0;
- width = 1;
- goto end;
- }
- sz = x&7;
- accept = global__utf8_accept_ranges[x>>4];
- if (str_len < gb_size_of(sz))
- goto invalid_codepoint;
- b1 = str[1];
- if (b1 < accept.lo || accept.hi < b1)
- goto invalid_codepoint;
- if (sz == 2) {
- codepoint = (cast(Rune)s0&0x1f)<<6 | (cast(Rune)b1&0x3f);
- width = 2;
- goto end;
- }
- b2 = str[2];
- if (!gb_is_between(b2, 0x80, 0xbf))
- goto invalid_codepoint;
- if (sz == 3) {
- codepoint = (cast(Rune)s0&0x1f)<<12 | (cast(Rune)b1&0x3f)<<6 | (cast(Rune)b2&0x3f);
- width = 3;
- goto end;
- }
- b3 = str[3];
- if (!gb_is_between(b3, 0x80, 0xbf))
- goto invalid_codepoint;
- codepoint = (cast(Rune)s0&0x07)<<18 | (cast(Rune)b1&0x3f)<<12 | (cast(Rune)b2&0x3f)<<6 | (cast(Rune)b3&0x3f);
- width = 4;
- goto end;
- invalid_codepoint:
- codepoint = GB_RUNE_INVALID;
- width = 1;
- }
- end:
- if (codepoint_out) *codepoint_out = codepoint;
- return width;
- }
|