glob.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. #ifndef GLOB_H_
  2. #define GLOB_H_
  3. #include <stdint.h>
  4. #include <stdbool.h>
  5. #include <string.h>
  6. #if !defined(GLOB_MALLOC) && !defined(GLOB_FREE)
  7. #include <stdlib.h>
  8. #define GLOB_MALLOC malloc
  9. #define GLOB_FREE free
  10. #elif !defined(GLOB_MALLOC) || !defined(GLOB_FREE)
  11. #error "You must define both GLOB_MALLOC and GLOB_FREE"
  12. #endif
  13. // Matched - falsy
  14. // Not matched for any reason - truthy
  15. typedef enum {
  16. GLOB_OOM_ERROR = -4,
  17. GLOB_ENCODING_ERROR = -3,
  18. GLOB_SYNTAX_ERROR = -2,
  19. GLOB_UNMATCHED = -1,
  20. GLOB_MATCHED = 0,
  21. } Glob_Result;
  22. const char *glob_result_display(Glob_Result result);
  23. Glob_Result glob_utf8(const char *pattern, const char *text);
  24. Glob_Result glob_utf32(const uint32_t *pattern, const uint32_t *text);
  25. // TODO: implement glob_utf16
  26. // TODO: support for non-NULL-terminated (a.k.a sized) strings
  27. #endif // GLOB_H_
  28. #ifdef GLOB_IMPLEMENTATION
  29. // HERE STARTS ConvertUTF CODE //////////////////////////////
  30. /*
  31. * Copyright 2001-2004 Unicode, Inc.
  32. *
  33. * Disclaimer
  34. *
  35. * This source code is provided as is by Unicode, Inc. No claims are
  36. * made as to fitness for any particular purpose. No warranties of any
  37. * kind are expressed or implied. The recipient agrees to determine
  38. * applicability of information provided. If this file has been
  39. * purchased on magnetic or optical media from Unicode, Inc., the
  40. * sole remedy for any claim will be exchange of defective media
  41. * within 90 days of receipt.
  42. *
  43. * Limitations on Rights to Redistribute This Code
  44. *
  45. * Unicode, Inc. hereby grants the right to freely use the information
  46. * supplied in this file in the creation of products supporting the
  47. * Unicode Standard, and to make copies of this file in any form
  48. * for internal or external distribution as long as this notice
  49. * remains attached.
  50. */
  51. /* ---------------------------------------------------------------------
  52. The following 4 definitions are compiler-specific.
  53. The C standard does not guarantee that wchar_t has at least
  54. 16 bits, so wchar_t is no less portable than unsigned short!
  55. All should be unsigned values to avoid sign extension during
  56. bit mask & shift operations.
  57. ------------------------------------------------------------------------ */
  58. typedef unsigned int UTF32; /* at least 32 bits */
  59. typedef unsigned short UTF16; /* at least 16 bits */
  60. typedef unsigned char UTF8; /* typically 8 bits */
  61. typedef unsigned char Boolean; /* 0 or 1 */
  62. /* Some fundamental constants */
  63. #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
  64. #define UNI_MAX_BMP (UTF32)0x0000FFFF
  65. #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
  66. #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
  67. #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
  68. #define UNI_SUR_HIGH_START (UTF32)0xD800
  69. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  70. #define UNI_SUR_LOW_START (UTF32)0xDC00
  71. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  72. typedef enum {
  73. conversionOK, /* conversion successful */
  74. sourceExhausted, /* partial character in source, but hit end */
  75. targetExhausted, /* insuff. room in target for conversion */
  76. sourceIllegal /* source sequence is illegal/malformed */
  77. } ConversionResult;
  78. typedef enum {
  79. strictConversion = 0,
  80. lenientConversion
  81. } ConversionFlags;
  82. /*
  83. * Index into the table below with the first byte of a UTF-8 sequence to
  84. * get the number of trailing bytes that are supposed to follow it.
  85. * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
  86. * left as-is for anyone who may want to do such conversion, which was
  87. * allowed in earlier algorithms.
  88. */
  89. static const char trailingBytesForUTF8[256] = {
  90. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  91. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  92. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  93. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  94. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  95. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  96. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  97. 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  98. };
  99. /*
  100. * Magic values subtracted from a buffer value during UTF8 conversion.
  101. * This table contains as many values as there might be trailing bytes
  102. * in a UTF-8 sequence.
  103. */
  104. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  105. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  106. /*
  107. * Utility routine to tell whether a sequence of bytes is legal UTF-8.
  108. * This must be called with the length pre-determined by the first byte.
  109. * If not calling this from ConvertUTF8to*, then the length can be set by:
  110. * length = trailingBytesForUTF8[*source]+1;
  111. * and the sequence is illegal right away if there aren't that many bytes
  112. * available.
  113. * If presented with a length > 4, this returns false. The Unicode
  114. * definition of UTF-8 goes up to 4-byte sequences.
  115. */
  116. static Boolean isLegalUTF8(const UTF8 *source, int length) {
  117. UTF8 a;
  118. const UTF8 *srcptr = source+length;
  119. switch (length) {
  120. default: return false;
  121. /* Everything else falls through when "true"... */
  122. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  123. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  124. case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  125. switch (*source) {
  126. /* no fall-through in this inner switch */
  127. case 0xE0: if (a < 0xA0) return false; break;
  128. case 0xED: if (a > 0x9F) return false; break;
  129. case 0xF0: if (a < 0x90) return false; break;
  130. case 0xF4: if (a > 0x8F) return false; break;
  131. default: if (a < 0x80) return false;
  132. }
  133. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  134. }
  135. if (*source > 0xF4) return false;
  136. return true;
  137. }
  138. ConversionResult ConvertUTF8toUTF32 (
  139. const UTF8** sourceStart, const UTF8* sourceEnd,
  140. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
  141. ConversionResult result = conversionOK;
  142. const UTF8* source = *sourceStart;
  143. UTF32* target = *targetStart;
  144. while (source < sourceEnd) {
  145. UTF32 ch = 0;
  146. unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
  147. if (source + extraBytesToRead >= sourceEnd) {
  148. result = sourceExhausted; break;
  149. }
  150. /* Do this check whether lenient or strict */
  151. if (! isLegalUTF8(source, extraBytesToRead+1)) {
  152. result = sourceIllegal;
  153. break;
  154. }
  155. /*
  156. * The cases all fall through. See "Note A" below.
  157. */
  158. switch (extraBytesToRead) {
  159. case 5: ch += *source++; ch <<= 6;
  160. case 4: ch += *source++; ch <<= 6;
  161. case 3: ch += *source++; ch <<= 6;
  162. case 2: ch += *source++; ch <<= 6;
  163. case 1: ch += *source++; ch <<= 6;
  164. case 0: ch += *source++;
  165. }
  166. ch -= offsetsFromUTF8[extraBytesToRead];
  167. if (target >= targetEnd) {
  168. source -= (extraBytesToRead+1); /* Back up the source pointer! */
  169. result = targetExhausted; break;
  170. }
  171. if (ch <= UNI_MAX_LEGAL_UTF32) {
  172. /*
  173. * UTF-16 surrogate values are illegal in UTF-32, and anything
  174. * over Plane 17 (> 0x10FFFF) is illegal.
  175. */
  176. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  177. if (flags == strictConversion) {
  178. source -= (extraBytesToRead+1); /* return to the illegal value itself */
  179. result = sourceIllegal;
  180. break;
  181. } else {
  182. *target++ = UNI_REPLACEMENT_CHAR;
  183. }
  184. } else {
  185. *target++ = ch;
  186. }
  187. } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
  188. result = sourceIllegal;
  189. *target++ = UNI_REPLACEMENT_CHAR;
  190. }
  191. }
  192. *sourceStart = source;
  193. *targetStart = target;
  194. return result;
  195. }
  196. // HERE ENDS ConvertUTF CODE //////////////////////////////
  197. static Glob_Result decode_utf8_with_malloc(const char *in, uint32_t **out)
  198. {
  199. size_t n = strlen(in);
  200. *out = GLOB_MALLOC(sizeof(uint32_t)*(n + 1));
  201. if (*out == NULL) return GLOB_OOM_ERROR;
  202. memset(*out, 0, sizeof(uint32_t)*(n + 1));
  203. uint32_t *out_end = *out;
  204. ConversionResult result = ConvertUTF8toUTF32(
  205. (const UTF8**) &in, (const UTF8*) (in + n),
  206. (UTF32**) &out_end, (UTF32*) out_end + n, 0);
  207. if (result != conversionOK) return GLOB_ENCODING_ERROR;
  208. return 0;
  209. }
  210. const char *glob_result_display(Glob_Result result)
  211. {
  212. switch (result) {
  213. case GLOB_UNMATCHED: return "GLOB_UNMATCHED";
  214. case GLOB_MATCHED: return "GLOB_MATCHED";
  215. case GLOB_SYNTAX_ERROR: return "GLOB_SYNTAX_ERROR";
  216. case GLOB_ENCODING_ERROR: return "GLOB_ENCODING_ERROR";
  217. case GLOB_OOM_ERROR: return "GLOB_OOM_ERROR";
  218. }
  219. return NULL;
  220. }
  221. Glob_Result glob_utf8(const char *pattern, const char *text)
  222. {
  223. Glob_Result result = 0;
  224. uint32_t *pattern_utf32 = NULL;
  225. uint32_t *text_utf32 = NULL;
  226. result = decode_utf8_with_malloc(pattern, &pattern_utf32);
  227. if (result) goto defer;
  228. result = decode_utf8_with_malloc(text, &text_utf32);
  229. if (result) goto defer;
  230. result = glob_utf32(pattern_utf32, text_utf32);
  231. defer:
  232. GLOB_FREE(pattern_utf32);
  233. GLOB_FREE(text_utf32);
  234. return result;
  235. }
  236. Glob_Result glob_utf32(const uint32_t *pattern, const uint32_t *text)
  237. {
  238. while (*pattern != '\0' && *text != '\0') {
  239. switch (*pattern) {
  240. case '?': {
  241. pattern += 1;
  242. text += 1;
  243. }
  244. break;
  245. case '*': {
  246. Glob_Result result = glob_utf32(pattern + 1, text);
  247. if (result != GLOB_UNMATCHED) return result;
  248. text += 1;
  249. }
  250. break;
  251. case '[': {
  252. bool matched = false;
  253. bool negate = false;
  254. pattern += 1; // skipping [
  255. if (*pattern == '\0') return GLOB_SYNTAX_ERROR; // unclosed [
  256. if (*pattern == '!') {
  257. negate = true;
  258. pattern += 1;
  259. if (*pattern == '\0') return GLOB_SYNTAX_ERROR; // unclosed [
  260. }
  261. uint32_t prev = *pattern;
  262. matched |= prev == *text;
  263. pattern += 1;
  264. while (*pattern != ']' && *pattern != '\0') {
  265. switch (*pattern) {
  266. case '-': {
  267. pattern += 1;
  268. switch (*pattern) {
  269. case ']':
  270. matched |= '-' == *text;
  271. break;
  272. case '\0':
  273. return GLOB_SYNTAX_ERROR; // unclosed [
  274. default: {
  275. matched |= prev <= *text && *text <= *pattern;
  276. prev = *pattern;
  277. pattern += 1;
  278. }
  279. }
  280. }
  281. break;
  282. default: {
  283. prev = *pattern;
  284. matched |= prev == *text;
  285. pattern += 1;
  286. }
  287. }
  288. }
  289. if (*pattern != ']') return GLOB_SYNTAX_ERROR; // unclosed [
  290. if (negate) matched = !matched;
  291. if (!matched) return GLOB_UNMATCHED;
  292. pattern += 1;
  293. text += 1;
  294. }
  295. break;
  296. case '\\':
  297. pattern += 1;
  298. if (*pattern == '\0') return GLOB_SYNTAX_ERROR; // unfinished escape
  299. // fallthrough
  300. default: {
  301. if (*pattern == *text) {
  302. pattern += 1;
  303. text += 1;
  304. } else {
  305. return GLOB_UNMATCHED;
  306. }
  307. }
  308. }
  309. }
  310. if (*text == '\0') {
  311. while (*pattern == '*') pattern += 1;
  312. if (*pattern == '\0') return GLOB_MATCHED;
  313. }
  314. return GLOB_UNMATCHED;
  315. }
  316. #endif // GLOB_IMPLEMENTATION