Browse Source

Update llvm binaries to latest version; Update utf8proc;

Ginger Bill 8 years ago
parent
commit
fb2d611dcd
9 changed files with 159 additions and 86 deletions
  1. BIN
      bin/llc.exe
  2. BIN
      bin/lli.exe
  3. BIN
      bin/opt.exe
  4. 2 2
      build.bat
  5. 0 1
      core/strconv.odin
  6. 1 0
      src/unicode.c
  7. 66 44
      src/utf8proc/utf8proc.c
  8. 80 24
      src/utf8proc/utf8proc.h
  9. 10 15
      src/utf8proc/utf8proc_data.c

BIN
bin/llc.exe


BIN
bin/lli.exe


BIN
bin/opt.exe


+ 2 - 2
build.bat

@@ -44,8 +44,8 @@ del *.ilk > NUL 2> NUL
 
 cl %compiler_settings% "src\main.c" ^
 	/link %linker_settings% -OUT:%exe_name% ^
-	&& odin build code/Jaze/src/main.odin
-	rem && odin run code/demo.odin
+	&& odin run code/demo.odin
+	rem && odin build code/Jaze/src/main.odin
 	rem && odin build_dll code/example.odin ^
 	rem odin run code/demo.odin
 

+ 0 - 1
core/strconv.odin

@@ -25,7 +25,6 @@ append_bool :: proc(buf: []byte, b: bool) -> string {
 }
 
 append_uint :: proc(buf: []byte, u: u64, base: int) -> string {
-	using Int_Flag;
 	return append_bits(buf, u, base, false, digits, 0);
 }
 append_int :: proc(buf: []byte, i: i64, base: int) -> string {

+ 1 - 0
src/unicode.c

@@ -6,6 +6,7 @@
 
 #pragma warning(pop)
 
+
 bool rune_is_letter(Rune r) {
 	if ((r < 0x80 && gb_char_is_alpha(cast(char)r)) ||
 	    r == '_') {

+ 66 - 44
src/utf8proc/utf8proc.c

@@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
   if (uc < 0x00) {
     return 0;
   } else if (uc < 0x80) {
-    dst[0] = uc;
+    dst[0] = (utf8proc_uint8_t) uc;
     return 1;
   } else if (uc < 0x800) {
-    dst[0] = 0xC0 + (uc >> 6);
-    dst[1] = 0x80 + (uc & 0x3F);
+    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
     return 2;
   // Note: we allow encoding 0xd800-0xdfff here, so as not to change
   // the API, however, these are actually invalid in UTF-8
   } else if (uc < 0x10000) {
-    dst[0] = 0xE0 + (uc >> 12);
-    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
-    dst[2] = 0x80 + (uc & 0x3F);
+    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
     return 3;
   } else if (uc < 0x110000) {
-    dst[0] = 0xF0 + (uc >> 18);
-    dst[1] = 0x80 + ((uc >> 12) & 0x3F);
-    dst[2] = 0x80 + ((uc >> 6) & 0x3F);
-    dst[3] = 0x80 + (uc & 0x3F);
+    dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+    dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+    dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
     return 4;
   } else return 0;
 }
@@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
    if (uc < 0x00) {
       return 0;
    } else if (uc < 0x80) {
-      dst[0] = uc;
+      dst[0] = (utf8proc_uint8_t)uc;
       return 1;
    } else if (uc < 0x800) {
-      dst[0] = 0xC0 + (uc >> 6);
-      dst[1] = 0x80 + (uc & 0x3F);
+      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
       return 2;
    } else if (uc == 0xFFFF) {
-       dst[0] = 0xFF;
+       dst[0] = (utf8proc_uint8_t)0xFF;
        return 1;
    } else if (uc == 0xFFFE) {
-       dst[0] = 0xFE;
+       dst[0] = (utf8proc_uint8_t)0xFE;
        return 1;
    } else if (uc < 0x10000) {
-      dst[0] = 0xE0 + (uc >> 12);
-      dst[1] = 0x80 + ((uc >> 6) & 0x3F);
-      dst[2] = 0x80 + (uc & 0x3F);
+      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
       return 3;
    } else if (uc < 0x110000) {
-      dst[0] = 0xF0 + (uc >> 18);
-      dst[1] = 0x80 + ((uc >> 12) & 0x3F);
-      dst[2] = 0x80 + ((uc >> 6) & 0x3F);
-      dst[3] = 0x80 + (uc & 0x3F);
+      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
       return 4;
    } else return 0;
 }
@@ -383,7 +383,7 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
 }
 
 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
-  return (utf8proc_category_t)utf8proc_get_property(c)->category;
+  return utf8proc_get_property(c)->category;
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
@@ -391,11 +391,9 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
   return s[utf8proc_category(c)];
 }
 
-
-
 #define utf8proc_decompose_lump(replacement_uc) \
-  return utf8proc_decompose_char((utf8proc_int32_t)(replacement_uc), dst, bufsize, \
-  (utf8proc_option_t)((utf8proc_int32_t)options & ~UTF8PROC_LUMP), last_boundclass)
+  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
+  options & ~UTF8PROC_LUMP, last_boundclass)
 
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
   const utf8proc_property_t *property;
@@ -458,12 +456,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
       category == UTF8PROC_CATEGORY_ME) return 0;
   }
   if (options & UTF8PROC_CASEFOLD) {
-    if ((utf8proc_int16_t)property->casefold_seqindex != UINT16_MAX) {
+    if (property->casefold_seqindex != UINT16_MAX) {
       return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
     }
   }
   if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
-    if ((utf8proc_int16_t)property->decomp_seqindex != UINT16_MAX &&
+    if (property->decomp_seqindex != UINT16_MAX &&
         (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
       return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
     }
@@ -485,6 +483,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
   utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
+) {
+    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
 ) {
   /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
   utf8proc_ssize_t wpos = 0;
@@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
         rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
         if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
       }
+      if (custom_func != NULL) {
+        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
+      }
       decomp_result = utf8proc_decompose_char(
         uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
         &boundclass
@@ -545,9 +554,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
   return wpos;
 }
 
-UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
-  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
-     ASSERT: 'buffer' has one spare byte of free space at the end! */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
   if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
     utf8proc_ssize_t rpos;
     utf8proc_ssize_t wpos = 0;
@@ -621,7 +629,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
           starter_property = unsafe_get_property(*starter);
         }
         if (starter_property->comb_index < 0x8000 &&
-            (utf8proc_int16_t)current_property->comb_index != UINT16_MAX &&
+            current_property->comb_index != UINT16_MAX &&
             current_property->comb_index >= 0x8000) {
           int sidx = starter_property->comb_index;
           int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
@@ -655,6 +663,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
     }
     length = wpos;
   }
+  return length;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
+     ASSERT: 'buffer' has one spare byte of free space at the end! */
+  length = utf8proc_normalize_utf32(buffer, length, options);
+  if (length < 0) return length;
   {
     utf8proc_ssize_t rpos, wpos = 0;
     utf8proc_int32_t uc;
@@ -676,15 +692,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
+) {
+    return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
 ) {
   utf8proc_int32_t *buffer;
   utf8proc_ssize_t result;
   *dstptr = NULL;
-  result = utf8proc_decompose(str, strlen, NULL, 0, options);
+  result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
   if (result < 0) return result;
   buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
   if (!buffer) return UTF8PROC_ERROR_NOMEM;
-  result = utf8proc_decompose(str, strlen, buffer, result, options);
+  result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
   if (result < 0) {
     free(buffer);
     return result;
@@ -705,29 +728,28 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
 
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
   utf8proc_uint8_t *retval;
-  utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
-    UTF8PROC_DECOMPOSE));
+  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+    UTF8PROC_DECOMPOSE);
   return retval;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
   utf8proc_uint8_t *retval;
-  utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
-    UTF8PROC_COMPOSE));
+  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+    UTF8PROC_COMPOSE);
   return retval;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
   utf8proc_uint8_t *retval;
-  utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
-    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT));
+  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
   return retval;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
   utf8proc_uint8_t *retval;
-  utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
-    UTF8PROC_COMPOSE | UTF8PROC_COMPAT));
+  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
   return retval;
 }
-

+ 80 - 24
src/utf8proc/utf8proc.h

@@ -71,14 +71,15 @@
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
 #define UTF8PROC_VERSION_MAJOR 2
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 0
+#define UTF8PROC_VERSION_MINOR 1
 /** The PATCH version (increased for fixes that do not change the API). */
-#define UTF8PROC_VERSION_PATCH 2
+#define UTF8PROC_VERSION_PATCH 0
 /** @} */
 
 #include <stdlib.h>
-#include <sys/types.h>
-#ifdef _MSC_VER
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// MSVC prior to 2013 lacked stdbool.h and inttypes.h
 typedef signed char utf8proc_int8_t;
 typedef unsigned char utf8proc_uint8_t;
 typedef short utf8proc_int16_t;
@@ -93,12 +94,18 @@ typedef int utf8proc_ssize_t;
 typedef unsigned int utf8proc_size_t;
 #  endif
 #  ifndef __cplusplus
+// emulate C99 bool
 typedef unsigned char utf8proc_bool;
-// enum {false, true};
+#    ifndef __bool_true_false_are_defined
+#      define false 0
+#      define true 1
+#      define __bool_true_false_are_defined 1
+#    endif
 #  else
 typedef bool utf8proc_bool;
 #  endif
 #else
+#  include <stddef.h>
 #  include <stdbool.h>
 #  include <inttypes.h>
 typedef int8_t utf8proc_int8_t;
@@ -108,22 +115,12 @@ typedef uint16_t utf8proc_uint16_t;
 typedef int32_t utf8proc_int32_t;
 typedef uint32_t utf8proc_uint32_t;
 typedef size_t utf8proc_size_t;
-typedef ssize_t utf8proc_ssize_t;
+typedef ptrdiff_t utf8proc_ssize_t;
 typedef bool utf8proc_bool;
 #endif
 #include <limits.h>
 
-#ifdef _WIN32
-#  ifdef UTF8PROC_EXPORTS
-#    define UTF8PROC_DLLEXPORT __declspec(dllexport)
-#  else
-#    define UTF8PROC_DLLEXPORT /*__declspec(dllimport)*/
-#  endif
-#elif __GNUC__ >= 4
-#  define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
-#else
-#  define UTF8PROC_DLLEXPORT
-#endif
+#define UTF8PROC_DLLEXPORT
 
 #ifdef __cplusplus
 extern "C" {
@@ -134,7 +131,7 @@ extern "C" {
 #endif
 
 #ifndef UINT16_MAX
-#  define UINT16_MAX ~(utf8proc_uint16_t)0
+#  define UINT16_MAX 65535U
 #endif
 
 /**
@@ -373,6 +370,13 @@ typedef enum {
   UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZJW */
 } utf8proc_boundclass_t;
 
+/**
+ * Function pointer type passed to @ref utf8proc_map_custom and
+ * @ref utf8proc_decompose_custom, which is used to specify a user-defined
+ * mapping of codepoints to be applied in conjunction with other mappings.
+ */
+typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
+
 /**
  * Array containing the byte lengths of a UTF-8 encoded codepoint based
  * on the first byte.
@@ -480,6 +484,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
  * `buffer` (which must contain at least `bufsize` entries).  In case of
  * success, the number of codepoints written is returned; in case of an
  * error, a negative error code is returned (@ref utf8proc_errmsg).
+ * See @ref utf8proc_decompose_custom to supply additional transformations.
  *
  * If the number of written codepoints would be bigger than `bufsize`, the
  * required buffer size is returned, while the buffer will be overwritten with
@@ -490,9 +495,47 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
   utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
 );
 
+/**
+ * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
+/**
+ * Normalizes the sequence of `length` codepoints pointed to by `buffer`
+ * in-place (i.e., the result is also stored in `buffer`).
+ *
+ * @param buffer the (native-endian UTF-32) unicode codepoints to normalize.
+ * @param length the length (in codepoints) of the buffer.
+ * @param options a bitwise or (`|`) of one or more of the following flags:
+ * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
+ * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
+ * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
+ * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
+ * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
+ *                           codepoints
+ * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
+ *                           the unicode versioning stability
+ *
+ * @return
+ * In case of success, the length (in codepoints) of the normalized UTF-32 string is
+ * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ *
+ * @warning The entries of the array pointed to by `buffer` have to be in the
+ *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
+
 /**
  * Reencodes the sequence of `length` codepoints pointed to by `buffer`
  * as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
  *
  * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
  * @param length the length (in codepoints) of the buffer.
@@ -505,10 +548,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  *                           codepoints
  * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
  *                           the unicode versioning stability
+ * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
  *
  * @return
- * In case of success, the length (in bytes) of the resulting UTF-8 string is
- * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ * In case of success, the length (in bytes) of the resulting nul-terminated
+ * UTF-8 string is returned; otherwise, a negative error code is returned
+ * (@ref utf8proc_errmsg).
  *
  * @warning The amount of free space pointed to by `buffer` must
  *          exceed the amount of the input data by one byte, and the
@@ -595,7 +640,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
  * in any case the result will be NULL terminated (though it might
  * contain NULL characters within the string if `str` contained NULL
  * characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.
+ * functions defined above, and regarded as described.  See also
+ * @ref utf8proc_map_custom to supply a custom codepoint transformation.
  *
  * In case of success the length of the new string is returned,
  * otherwise a negative error code is returned.
@@ -607,6 +653,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 );
 
+/**
+ * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
 /** @name Unicode normalization
  *
  * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
@@ -619,9 +676,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
 /** NFC normalization (@ref UTF8PROC_COMPOSE). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 /** @} */
 
@@ -630,4 +687,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 #endif
 
 #endif
-

+ 10 - 15
src/utf8proc/utf8proc_data.c

@@ -1,7 +1,4 @@
-#pragma warning(push)
-#pragma warning(disable: 4838)
-
-const utf8proc_uint16_t utf8proc_sequences[] = {
+static const utf8proc_uint16_t utf8proc_sequences[] = {
   97, 98, 99, 100, 101, 102, 103,
   104, 105, 106, 107, 108, 109, 110, 111,
   112, 113, 114, 115, 116, 117, 118, 119,
@@ -1179,7 +1176,7 @@ const utf8proc_uint16_t utf8proc_sequences[] = {
   56603, 55354, 56604, 55354, 56605, 55354, 56606, 55354,
   56607, 55354, 56608, 55354, 56609, };
 
-const utf8proc_uint16_t utf8proc_stage1table[] = {
+static const utf8proc_uint16_t utf8proc_stage1table[] = {
   0, 256, 512, 768, 1024, 1280, 1536,
   1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584,
   3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632,
@@ -1726,7 +1723,7 @@ const utf8proc_uint16_t utf8proc_stage1table[] = {
   18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432,
   38656, };
 
-const utf8proc_uint16_t utf8proc_stage2table[] = {
+static const utf8proc_uint16_t utf8proc_stage2table[] = {
   1, 2, 2, 2, 2, 2, 2,
   2, 2, 3, 4, 3, 5, 6, 2,
   2, 2, 2, 2, 2, 2, 2, 2,
@@ -5899,7 +5896,7 @@ const utf8proc_uint16_t utf8proc_stage2table[] = {
   540, 540, 540, 1180, 0, 0, 0, 0,
   0, 1154, 1154, 1154, 1154, 1154, 1154, 1154,
   1154, 1154, 1154, 0, 0, 0, 0, 1103,
-  1158, 0, 0, 0, 0, 0, 0, 0,
+  1103, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0,
@@ -6593,7 +6590,7 @@ const utf8proc_uint16_t utf8proc_stage2table[] = {
   3984, 3984, 3984, 3984, 3984, 3984, 3984, 0,
   0, };
 
-const utf8proc_property_t utf8proc_properties[] = {
+static const utf8proc_property_t utf8proc_properties[] = {
   {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_CC, 0, UTF8PROC_BIDI_CLASS_BN, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, true, false, true, 0, 0, UTF8PROC_BOUNDCLASS_CONTROL},
   {UTF8PROC_CATEGORY_CC, 0, UTF8PROC_BIDI_CLASS_BN, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, true, 0, 0, UTF8PROC_BOUNDCLASS_CONTROL},
@@ -7850,7 +7847,7 @@ const utf8proc_property_t utf8proc_properties[] = {
   {UTF8PROC_CATEGORY_MN, 122, UTF8PROC_BIDI_CLASS_NSM, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 0, 0, UTF8PROC_BOUNDCLASS_EXTEND},
   {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_COMPAT, 9523, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_COMPAT, 9525, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
-  {UTF8PROC_CATEGORY_PO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_NOBREAK, 1335, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
+  {UTF8PROC_CATEGORY_PO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_NOBREAK, 1335, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_MN, 216, UTF8PROC_BIDI_CLASS_NSM, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 0, 0, UTF8PROC_BOUNDCLASS_EXTEND},
   {UTF8PROC_CATEGORY_PS, 0, UTF8PROC_BIDI_CLASS_ON, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, true, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_PE, 0, UTF8PROC_BIDI_CLASS_ON, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, true, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -10478,7 +10475,7 @@ const utf8proc_property_t utf8proc_properties[] = {
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1470, UINT16_MAX, 1470, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1478, UINT16_MAX, 1478, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5132, UINT16_MAX, 5132, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
-  {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1480, UINT16_MAX, 1480, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
+  {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1480, UINT16_MAX, 1480, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5133, UINT16_MAX, 5133, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5134, UINT16_MAX, 5134, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1482, UINT16_MAX, 1482, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -12168,7 +12165,7 @@ const utf8proc_property_t utf8proc_properties[] = {
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6787, UINT16_MAX, 6787, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6789, UINT16_MAX, 6789, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6791, UINT16_MAX, 6791, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
-  {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6793, UINT16_MAX, 6793, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
+  {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6793, UINT16_MAX, 6793, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6795, UINT16_MAX, 6795, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6797, UINT16_MAX, 6797, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6799, UINT16_MAX, 6799, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -12204,7 +12201,7 @@ const utf8proc_property_t utf8proc_properties[] = {
   {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9104, UINT16_MAX, 9104, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9106, UINT16_MAX, 9106, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9108, UINT16_MAX, 9108, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
-  {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9110, UINT16_MAX, 9110, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
+  {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9110, UINT16_MAX, 9110, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9112, UINT16_MAX, 9112, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9114, UINT16_MAX, 9114, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
   {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9116, UINT16_MAX, 9116, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -13423,7 +13420,7 @@ const utf8proc_property_t utf8proc_properties[] = {
   {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, 0, 7975, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
 };
 
-const utf8proc_uint16_t utf8proc_combinations[] = {
+static const utf8proc_uint16_t utf8proc_combinations[] = {
   0, 46, 192, 193, 194, 195, 196, 197, 0,
   256, 258, 260, 550, 461, 0, 0, 512,
   514, 0, 0, 0, 0, 0, 0, 0,
@@ -14386,5 +14383,3 @@ const utf8proc_uint16_t utf8proc_combinations[] = {
 72, 75,
   1, 53694, 1, 53696,
 };
-
-#pragma warning(pop)