|
@@ -65,3 +65,100 @@ bool rune_is_whitespace(Rune r) {
|
|
|
}
|
|
|
return false;
|
|
|
}
|
|
|
+
|
|
|
+
|
|
|
+gb_global u8 const global__utf8_first[256] = {
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6F
|
|
|
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7F
|
|
|
+ 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8F
|
|
|
+ 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9F
|
|
|
+ 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xA0-0xAF
|
|
|
+ 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xB0-0xBF
|
|
|
+ 0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xC0-0xCF
|
|
|
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xD0-0xDF
|
|
|
+ 0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xE0-0xEF
|
|
|
+ 0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xF0-0xFF
|
|
|
+};
|
|
|
+
|
|
|
+typedef struct Utf8AcceptRange {
|
|
|
+ u8 lo, hi;
|
|
|
+} Utf8AcceptRange;
|
|
|
+
|
|
|
+gb_global Utf8AcceptRange const global__utf8_accept_ranges[] = {
|
|
|
+ {0x80, 0xbf},
|
|
|
+ {0xa0, 0xbf},
|
|
|
+ {0x80, 0x9f},
|
|
|
+ {0x90, 0xbf},
|
|
|
+ {0x80, 0x8f},
|
|
|
+};
|
|
|
+
|
|
|
+
|
|
|
+isize utf8_decode(u8 const *str, isize str_len, Rune *codepoint_out) {
|
|
|
+ isize width = 0;
|
|
|
+ Rune codepoint = GB_RUNE_INVALID;
|
|
|
+
|
|
|
+ if (str_len > 0) {
|
|
|
+ u8 s0 = str[0];
|
|
|
+ u8 x = global__utf8_first[s0], sz;
|
|
|
+ u8 b1, b2, b3;
|
|
|
+ Utf8AcceptRange accept;
|
|
|
+ if (x >= 0xf0) {
|
|
|
+ Rune mask = (cast(Rune)x << 31) >> 31;
|
|
|
+ codepoint = (cast(Rune)s0 & (~mask)) | (GB_RUNE_INVALID & mask);
|
|
|
+ width = 1;
|
|
|
+ goto end;
|
|
|
+ }
|
|
|
+ if (s0 < 0x80) {
|
|
|
+ codepoint = s0;
|
|
|
+ width = 1;
|
|
|
+ goto end;
|
|
|
+ }
|
|
|
+
|
|
|
+ sz = x&7;
|
|
|
+ accept = global__utf8_accept_ranges[x>>4];
|
|
|
+ if (str_len < gb_size_of(sz))
|
|
|
+ goto invalid_codepoint;
|
|
|
+
|
|
|
+ b1 = str[1];
|
|
|
+ if (b1 < accept.lo || accept.hi < b1)
|
|
|
+ goto invalid_codepoint;
|
|
|
+
|
|
|
+ if (sz == 2) {
|
|
|
+ codepoint = (cast(Rune)s0&0x1f)<<6 | (cast(Rune)b1&0x3f);
|
|
|
+ width = 2;
|
|
|
+ goto end;
|
|
|
+ }
|
|
|
+
|
|
|
+ b2 = str[2];
|
|
|
+ if (!gb_is_between(b2, 0x80, 0xbf))
|
|
|
+ goto invalid_codepoint;
|
|
|
+
|
|
|
+ if (sz == 3) {
|
|
|
+ codepoint = (cast(Rune)s0&0x1f)<<12 | (cast(Rune)b1&0x3f)<<6 | (cast(Rune)b2&0x3f);
|
|
|
+ width = 3;
|
|
|
+ goto end;
|
|
|
+ }
|
|
|
+
|
|
|
+ b3 = str[3];
|
|
|
+ if (!gb_is_between(b3, 0x80, 0xbf))
|
|
|
+ goto invalid_codepoint;
|
|
|
+
|
|
|
+ codepoint = (cast(Rune)s0&0x07)<<18 | (cast(Rune)b1&0x3f)<<12 | (cast(Rune)b2&0x3f)<<6 | (cast(Rune)b3&0x3f);
|
|
|
+ width = 4;
|
|
|
+ goto end;
|
|
|
+
|
|
|
+ invalid_codepoint:
|
|
|
+ codepoint = GB_RUNE_INVALID;
|
|
|
+ width = 1;
|
|
|
+ }
|
|
|
+
|
|
|
+end:
|
|
|
+ if (codepoint_out) *codepoint_out = codepoint;
|
|
|
+ return width;
|
|
|
+}
|