瀏覽代碼

Bake ConvertUTF into glob.h

rexim 2 年之前
父節點
當前提交
ba954ad88d
共有 5 個文件被更改,包括 253 次插入61 次删除
  1. 3 4
      README.md
  2. 1 1
      build.sh
  3. 1 1
      coverage.sh
  4. 247 16
      glob.h
  5. 1 39
      test_glob.c

+ 3 - 4
README.md

@@ -3,15 +3,14 @@
 ## Usage
 
 ```c
+#include <stdio.h>
+#include <string.h>
 #define GLOB_IMPLEMENTATION
 #include "glob.h"
 
-// Your custom UTF8 decode. Check test_glob.c for an example of that.
-uint32_t *decode_utf8(const char *message);
-
 int main(void)
 {
-    if (glob(decode_utf8("*.c"), decode_utf8("main.c")) == GLOB_MATCHED) {
+    if (!glob_utf8("*.c", "main.c")) {
         printf("OK\n");
     } else {
         printf("FAIL\n");

+ 1 - 1
build.sh

@@ -2,4 +2,4 @@
 
 set -xe
 
-clang -Wall -Wextra -Wswitch-enum -ggdb -o test_glob test_glob.c ConvertUTF.c
+clang -Wall -Wextra -Wswitch-enum -ggdb -o test_glob test_glob.c

+ 1 - 1
coverage.sh

@@ -4,7 +4,7 @@ set -xe
 
 # Source-based Code Coverage with Clang: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
 
-clang -Wall -Wextra -Wswitch-enum -fprofile-instr-generate -fcoverage-mapping -ggdb -o test_glob test_glob.c ConvertUTF.c
+clang -Wall -Wextra -Wswitch-enum -fprofile-instr-generate -fcoverage-mapping -ggdb -o test_glob test_glob.c
 ./test_glob
 llvm-profdata merge -sparse ./default.profraw -o default.profdata
 llvm-cov show ./test_glob -instr-profile=default.profdata glob.h

+ 247 - 16
glob.h

@@ -5,34 +5,265 @@
 #include <stdint.h>
 #include <stdbool.h>
 
+#if !defined(GLOB_MALLOC) && !defined(GLOB_FREE)
+#include <stdlib.h>
+#define GLOB_MALLOC malloc
+#define GLOB_FREE free
+#else
+#error "You must define both GLOB_MALLOC and GLOB_FREE"
+#endif
+
+// Matched - falsy
+// Not matched for any reason - truthy
 typedef enum {
-    GLOB_UNMATCHED = 0,
-    GLOB_MATCHED,
-    GLOB_SYNTAX_ERROR,
+    GLOB_ENCODING_ERROR = -3,
+    GLOB_SYNTAX_ERROR   = -2,
+    GLOB_UNMATCHED      = -1,
+    GLOB_MATCHED        =  0,
 } Glob_Result;
 
 const char *glob_result_display(Glob_Result result);
-Glob_Result glob(const uint32_t *pattern, const uint32_t *text);
+Glob_Result glob_utf8(const char *pattern, const char *text);
+Glob_Result glob_utf32(const uint32_t *pattern, const uint32_t *text);
+// TODO: implement glob_utf16
 
 #endif // GLOB_H_
 
 #ifdef GLOB_IMPLEMENTATION
 
+// HERE STARTS ConvertUTF CODE //////////////////////////////
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ * 
+ * Disclaimer
+ * 
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ * 
+ * Limitations on Rights to Redistribute This Code
+ * 
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/* ---------------------------------------------------------------------
+    The following 4 definitions are compiler-specific.
+    The C standard does not guarantee that wchar_t has at least
+    16 bits, so wchar_t is no less portable than unsigned short!
+    All should be unsigned values to avoid sign extension during
+    bit mask & shift operations.
+------------------------------------------------------------------------ */
+
+typedef unsigned int	UTF32;	/* at least 32 bits */
+typedef unsigned short	UTF16;	/* at least 16 bits */
+typedef unsigned char	UTF8;	/* typically 8 bits */
+typedef unsigned char	Boolean; /* 0 or 1 */
+
+/* Some fundamental constants */
+#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
+#define UNI_MAX_BMP (UTF32)0x0000FFFF
+#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
+#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
+#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
+
+#define UNI_SUR_HIGH_START  (UTF32)0xD800
+#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
+#define UNI_SUR_LOW_START   (UTF32)0xDC00
+#define UNI_SUR_LOW_END     (UTF32)0xDFFF
+
+typedef enum {
+	conversionOK, 		/* conversion successful */
+	sourceExhausted,	/* partial character in source, but hit end */
+	targetExhausted,	/* insuff. room in target for conversion */
+	sourceIllegal		/* source sequence is illegal/malformed */
+} ConversionResult;
+
+typedef enum {
+	strictConversion = 0,
+	lenientConversion
+} ConversionFlags;
+
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to
+ * get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
+ * left as-is for anyone who may want to do such conversion, which was
+ * allowed in earlier algorithms.
+ */
+static const char trailingBytesForUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/*
+ * Magic values subtracted from a buffer value during UTF8 conversion.
+ * This table contains as many values as there might be trailing bytes
+ * in a UTF-8 sequence.
+ */
+static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
+		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+
+/*
+ * Utility routine to tell whether a sequence of bytes is legal UTF-8.
+ * This must be called with the length pre-determined by the first byte.
+ * If not calling this from ConvertUTF8to*, then the length can be set by:
+ *  length = trailingBytesForUTF8[*source]+1;
+ * and the sequence is illegal right away if there aren't that many bytes
+ * available.
+ * If presented with a length > 4, this returns false.  The Unicode
+ * definition of UTF-8 goes up to 4-byte sequences.
+ */
+
+static Boolean isLegalUTF8(const UTF8 *source, int length) {
+    UTF8 a;
+    const UTF8 *srcptr = source+length;
+    switch (length) {
+    default: return false;
+	/* Everything else falls through when "true"... */
+    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+	switch (*source) {
+	    /* no fall-through in this inner switch */
+	    case 0xE0: if (a < 0xA0) return false; break;
+	    case 0xED: if (a > 0x9F) return false; break;
+	    case 0xF0: if (a < 0x90) return false; break;
+	    case 0xF4: if (a > 0x8F) return false; break;
+	    default:   if (a < 0x80) return false;
+	}
+
+    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+    }
+    if (*source > 0xF4) return false;
+    return true;
+}
+
+ConversionResult ConvertUTF8toUTF32 (
+	const UTF8** sourceStart, const UTF8* sourceEnd, 
+	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF8* source = *sourceStart;
+    UTF32* target = *targetStart;
+    while (source < sourceEnd) {
+	UTF32 ch = 0;
+	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+	if (source + extraBytesToRead >= sourceEnd) {
+	    result = sourceExhausted; break;
+	}
+	/* Do this check whether lenient or strict */
+	if (! isLegalUTF8(source, extraBytesToRead+1)) {
+	    result = sourceIllegal;
+	    break;
+	}
+	/*
+	 * The cases all fall through. See "Note A" below.
+	 */
+	switch (extraBytesToRead) {
+	    case 5: ch += *source++; ch <<= 6;
+	    case 4: ch += *source++; ch <<= 6;
+	    case 3: ch += *source++; ch <<= 6;
+	    case 2: ch += *source++; ch <<= 6;
+	    case 1: ch += *source++; ch <<= 6;
+	    case 0: ch += *source++;
+	}
+	ch -= offsetsFromUTF8[extraBytesToRead];
+
+	if (target >= targetEnd) {
+	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
+	    result = targetExhausted; break;
+	}
+	if (ch <= UNI_MAX_LEGAL_UTF32) {
+	    /*
+	     * UTF-16 surrogate values are illegal in UTF-32, and anything
+	     * over Plane 17 (> 0x10FFFF) is illegal.
+	     */
+	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+		if (flags == strictConversion) {
+		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
+		    result = sourceIllegal;
+		    break;
+		} else {
+		    *target++ = UNI_REPLACEMENT_CHAR;
+		}
+	    } else {
+		*target++ = ch;
+	    }
+	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+	    result = sourceIllegal;
+	    *target++ = UNI_REPLACEMENT_CHAR;
+	}
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+// HERE ENDS ConvertUTF CODE //////////////////////////////
+
+static ConversionResult decode_utf8_with_malloc(const char *in, uint32_t **out)
+{
+    size_t n = strlen(in);
+    *out = GLOB_MALLOC(sizeof(uint32_t)*(n + 1));
+    assert(*out != NULL && "Buy more RAM lol");
+    memset(*out, 0, sizeof(uint32_t)*(n + 1));
+    uint32_t *out_end = *out;
+
+    ConversionResult result = ConvertUTF8toUTF32(
+                                  (const UTF8**) &in, (const UTF8*) (in + n),
+                                  (UTF32**) &out_end, (UTF32*) out_end + n, 0);
+    return result;
+}
+
 const char *glob_result_display(Glob_Result result)
 {
     switch (result) {
-    case GLOB_UNMATCHED:
-        return "GLOB_UNMATCHED";
-    case GLOB_MATCHED:
-        return "GLOB_MATCHED";
-    case GLOB_SYNTAX_ERROR:
-        return "GLOB_SYNTAX_ERROR";
-    default:
-        assert(0 && "unreachable");
+    case GLOB_UNMATCHED:      return "GLOB_UNMATCHED";
+    case GLOB_MATCHED:        return "GLOB_MATCHED";
+    case GLOB_SYNTAX_ERROR:   return "GLOB_SYNTAX_ERROR";
+    case GLOB_ENCODING_ERROR: return "GLOB_ENCODING_ERROR";
+    default: assert(0 && "unreachable");
     }
 }
 
-Glob_Result glob(const uint32_t *pattern, const uint32_t *text)
+Glob_Result glob_utf8(const char *pattern, const char *text)
+{
+    Glob_Result result = 0;
+
+    uint32_t *pattern_utf32 = NULL;
+    uint32_t *text_utf32 = NULL;
+    if (decode_utf8_with_malloc(pattern, &pattern_utf32) != conversionOK) {
+        result = GLOB_ENCODING_ERROR;
+        goto defer;
+    }
+    if (decode_utf8_with_malloc(text, &text_utf32) != conversionOK) {
+        result = GLOB_ENCODING_ERROR;
+        goto defer;
+    }
+
+    result = glob_utf32(pattern_utf32, text_utf32);
+
+defer:
+    GLOB_FREE(pattern_utf32);
+    GLOB_FREE(text_utf32);
+    return result;
+}
+
+Glob_Result glob_utf32(const uint32_t *pattern, const uint32_t *text)
 {
     while (*pattern != '\0' && *text != '\0') {
         switch (*pattern) {
@@ -43,8 +274,8 @@ Glob_Result glob(const uint32_t *pattern, const uint32_t *text)
         break;
 
         case '*': {
-            Glob_Result result = glob(pattern + 1, text);
-            if (result) return result;
+            Glob_Result result = glob_utf32(pattern + 1, text);
+            if (result != GLOB_UNMATCHED) return result;
             text += 1;
         }
         break;
@@ -118,7 +349,7 @@ Glob_Result glob(const uint32_t *pattern, const uint32_t *text)
 
     if (*text == '\0') {
         while (*pattern == '*') pattern += 1;
-        return *pattern == '\0';
+        if (*pattern == '\0') return GLOB_MATCHED;
     }
 
     return GLOB_UNMATCHED;

+ 1 - 39
test_glob.c

@@ -5,55 +5,17 @@
 #include <stdbool.h>
 #include <string.h>
 
-#include "ConvertUTF.h"
 #define GLOB_IMPLEMENTATION
 #include "glob.h"
 
-uint32_t *decode_utf8(const char *message)
-{
-    size_t n = strlen(message);
-    uint32_t *out = malloc(sizeof(*out)*(n + 1));
-    assert(out != NULL && "Buy more RAM lol");
-    memset(out, 0, sizeof(*out)*(n + 1));
-    uint32_t *out_end = out;
-
-    ConversionResult result = ConvertUTF8toUTF32(
-                                  (const UTF8**) &message, (const UTF8*) (message + n),
-                                  (UTF32**) &out_end, (UTF32*) out + n, 0);
-    switch (result) {
-        case conversionOK: return out;
-        case sourceExhausted: {
-            free(out);
-            fprintf(stderr, "ERROR: partial character in source, but hit end");
-            return NULL;
-        }
-        break;
-        case targetExhausted: assert(0 && "unreachable");
-        case sourceIllegal: {
-            free(out);
-            fprintf(stderr, "ERROR: source sequence is illegal/malformed");
-            return NULL;
-        } break;
-    }
-    assert(0 && "unreachable");
-}
-
 void check_glob_located(const char *file, int line, const char *pattern, const char *text, Glob_Result expected)
 {
-    uint32_t *pattern_utf32 = decode_utf8(pattern);
-    if (pattern_utf32 == NULL) exit(1);
-    uint32_t *text_utf32 = decode_utf8(text);
-    if (text_utf32 == NULL) exit(1);
-
-    Glob_Result actual = glob(pattern_utf32, text_utf32);
+    Glob_Result actual = glob_utf8(pattern, text);
     printf("%12s <=> %-12s => %s\n", pattern, text, glob_result_display(actual));
     if (actual != expected) {
         printf("%s:%d: FAILURE! Expected %s", file, line, glob_result_display(expected));
         exit(1);
     }
-
-    free(pattern_utf32);
-    free(text_utf32);
 }
 
 #define check_glob(pattern, text, expected) check_glob_located(__FILE__, __LINE__, pattern, text, expected)