|
@@ -0,0 +1,223 @@
|
|
|
+// LICENSE
|
|
|
+//
|
|
|
+// This software is dual-licensed to the public domain and under the following
|
|
|
+// license: you are granted a perpetual, irrevocable license to copy, modify,
|
|
|
+// publish, and distribute this file as you see fit.
|
|
|
+//
|
|
|
+// VERSION
|
|
|
+// 0.2.0 (2017-02-18) Scored matches perform exhaustive search for best score
|
|
|
+// 0.1.0 (2016-03-28) Initial release
|
|
|
+//
|
|
|
+// AUTHOR
|
|
|
+// Forrest Smith
|
|
|
+//
|
|
|
+// NOTES
|
|
|
+// Compiling
|
|
|
+// You MUST add '#define FTS_FUZZY_MATCH_IMPLEMENTATION' before including this header in ONE source file to create the implementation.
|
|
|
+//
|
|
|
+// fuzzy_match_simple(...)
|
|
|
+// Returns true if each character in pattern is found sequentially within str
|
|
|
+//
|
|
|
+// fuzzy_match(...)
|
|
|
+// Returns true if pattern is found AND calculates a score.
|
|
|
+// Performs exhaustive search via recursion to find all possible matches and match with highest score.
|
|
|
+// Score values have no intrinsic meaning. The possible score range is not normalized and varies with pattern.
|
|
|
+// Recursion is limited internally (default=10) to prevent degenerate cases (pattern="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
|
|
|
+// Uses uint8_t for match indices. Therefore patterns are limited to 256 characters.
|
|
|
+// Score system should be tuned for YOUR use case. Words, sentences, file names, or method names all prefer different tuning.
|
|
|
+
|
|
|
+
|
|
|
+#ifndef FTS_FUZZY_MATCH_H
|
|
|
+#define FTS_FUZZY_MATCH_H
|
|
|
+
|
|
|
+
|
|
|
+#include <cstdint> // uint8_t
|
|
|
+#include <ctype.h> // ::tolower, ::toupper
|
|
|
+#include <cstring> // memcpy
|
|
|
+
|
|
|
+#include <cstdio>
|
|
|
+
|
|
|
+// Public interface
|
|
|
namespace fts {

    /// Case-insensitive subsequence test.
    /// @param pattern  NUL-terminated characters to look for, in order.
    /// @param str      NUL-terminated string to search.
    /// @return true when every character of `pattern` occurs in `str` in the same order.
    static bool fuzzy_match_simple(const char* pattern, const char* str);

    /// Scored fuzzy match that does not report match positions.
    /// @param pattern   NUL-terminated characters to look for, in order.
    /// @param str       NUL-terminated string to search.
    /// @param outScore  Receives the score of the best match found.
    /// @return true when `pattern` was matched.
    static bool fuzzy_match(const char* pattern, const char* str, int& outScore);

    /// Scored fuzzy match that also reports where each pattern character matched.
    /// @param pattern     NUL-terminated characters to look for, in order.
    /// @param str         NUL-terminated string to search.
    /// @param outScore    Receives the score of the best match found.
    /// @param matches     Caller-owned buffer receiving the index in `str` of each matched character.
    /// @param maxMatches  Capacity of `matches`, in elements.
    /// @return true when `pattern` was matched.
    static bool fuzzy_match(const char* pattern, const char* str, int& outScore, uint8_t* matches, int maxMatches);

}
|
|
|
+
|
|
|
+
|
|
|
+#ifdef FTS_FUZZY_MATCH_IMPLEMENTATION
|
|
|
namespace fts {

    // "Private" implementation details
    namespace fuzzy_internal {

        // Size of the scratch buffers used to record match positions.
        constexpr int kMaxMatches = 256;

        // Forward declaration; the definition follows the public interface below.
        static bool fuzzy_match_recursive(const char* pattern, const char* str, int& outScore, const char* strBegin,
            uint8_t const* srcMatches, uint8_t* newMatches, int maxMatches, int nextMatch,
            int& recursionCount, int recursionLimit);

        // Case-insensitive comparison of two characters.
        // The <ctype.h> functions require a value representable as unsigned char
        // (or EOF); passing a negative plain char is undefined behaviour, hence the casts.
        static bool equal_ignore_case(char a, char b) {
            return ::tolower(static_cast<unsigned char>(a)) == ::tolower(static_cast<unsigned char>(b));
        }

        // Scores one complete match.
        //   strBegin   - start of the searched string
        //   strLength  - number of characters in the searched string
        //   matches    - index in the searched string of each matched pattern character
        //   matchCount - number of valid entries in matches (must be >= 1)
        static int compute_score(const char* strBegin, int strLength, uint8_t const* matches, int matchCount) {
            const int sequential_bonus = 15;            // bonus for adjacent matches
            const int separator_bonus = 30;             // bonus if match occurs after a separator
            const int camel_bonus = 30;                 // bonus if match is uppercase and prev is lower
            const int first_letter_bonus = 15;          // bonus if the first letter is matched

            const int leading_letter_penalty = -5;      // penalty applied for every letter in str before the first match
            const int max_leading_letter_penalty = -15; // maximum penalty for leading letters
            const int unmatched_letter_penalty = -1;    // penalty for every letter that doesn't match

            // Initialize score
            int score = 100;

            // Apply leading letter penalty (capped)
            int penalty = leading_letter_penalty * matches[0];
            if (penalty < max_leading_letter_penalty)
                penalty = max_leading_letter_penalty;
            score += penalty;

            // Apply unmatched penalty
            score += unmatched_letter_penalty * (strLength - matchCount);

            // Apply ordering bonuses
            for (int i = 0; i < matchCount; ++i) {
                const uint8_t currIdx = matches[i];

                // Sequential: this match directly follows the previous one
                if (i > 0 && currIdx == matches[i - 1] + 1)
                    score += sequential_bonus;

                if (currIdx > 0) {
                    // Bonuses based on the character preceding the match
                    const unsigned char neighbor = static_cast<unsigned char>(strBegin[currIdx - 1]);
                    const unsigned char curr = static_cast<unsigned char>(strBegin[currIdx]);

                    // Camel case
                    if (::islower(neighbor) && ::isupper(curr))
                        score += camel_bonus;

                    // Separator
                    if (neighbor == '_' || neighbor == ' ')
                        score += separator_bonus;
                }
                else {
                    // First letter of str
                    score += first_letter_bonus;
                }
            }

            return score;
        }
    }

    // Public interface

    // Returns true if each character in pattern is found sequentially within str
    // (case-insensitive). An empty pattern matches any str.
    static bool fuzzy_match_simple(char const* pattern, char const* str) {
        while (*pattern != '\0' && *str != '\0') {
            if (fuzzy_internal::equal_ignore_case(*pattern, *str))
                ++pattern;
            ++str;
        }

        return *pattern == '\0';
    }

    // Scored match that also reports match positions.
    //   pattern    - NUL-terminated characters to look for, in order
    //   str        - NUL-terminated string to search
    //   outScore   - receives the best score; only written when true is returned
    //   matches    - receives the index in str of each matched character; when there is
    //                room, the slot after the last match is set to 0 to mark the end
    //   maxMatches - capacity of matches; nothing is written at or beyond this index
    // Returns false for an empty pattern, when pattern is not found, or when
    // matches is too small to hold every match.
    static bool fuzzy_match(char const* pattern, char const* str, int& outScore, uint8_t* matches, int maxMatches) {
        int recursionCount = 0;
        const int recursionLimit = 10;

        return fuzzy_internal::fuzzy_match_recursive(pattern, str, outScore, str, nullptr, matches, maxMatches, 0, recursionCount, recursionLimit);
    }

    // Scored match for callers that do not need the match positions.
    static bool fuzzy_match(char const* pattern, char const* str, int& outScore) {
        uint8_t matches[fuzzy_internal::kMaxMatches];
        return fuzzy_match(pattern, str, outScore, matches, static_cast<int>(sizeof(matches)));
    }

    // Private implementation
    namespace fuzzy_internal {

        // Greedily matches pattern against str; at every matching character it also
        // recurses on the alternative of skipping that character, and keeps whichever
        // complete match scores highest.
        //   pattern / str   - remaining pattern and remaining text (NUL-terminated)
        //   outScore        - receives the winning score when true is returned
        //   strBegin        - start of the full text; match indices are relative to it
        //   srcMatches      - positions already matched by the caller (may be null);
        //                     the first nextMatch entries are copied into matches
        //   matches         - output buffer for match positions
        //   maxMatches      - capacity of matches
        //   nextMatch       - number of positions already matched
        //   recursionCount  - running count of calls, shared across the whole search
        //   recursionLimit  - the call returns false once recursionCount reaches it
        static bool fuzzy_match_recursive(const char* pattern, const char* str, int& outScore,
            const char* strBegin, uint8_t const* srcMatches, uint8_t* matches, int maxMatches,
            int nextMatch, int& recursionCount, int recursionLimit)
        {
            // Count recursions
            ++recursionCount;
            if (recursionCount >= recursionLimit)
                return false;

            // Detect end of strings
            if (*pattern == '\0' || *str == '\0')
                return false;

            // Recursion params
            bool recursiveMatch = false;
            uint8_t bestRecursiveMatches[kMaxMatches];
            int bestRecursiveScore = 0;

            // Loop through pattern and str looking for a match
            bool first_match = true;
            while (*pattern != '\0' && *str != '\0') {

                // Found match
                if (equal_ignore_case(*pattern, *str)) {

                    // Supplied matches buffer was too short
                    if (nextMatch >= maxMatches)
                        return false;

                    // Copy the caller's earlier matches into our buffer the first time we need it
                    if (first_match && srcMatches) {
                        std::memcpy(matches, srcMatches, static_cast<std::size_t>(nextMatch));
                        first_match = false;
                    }

                    // Recursive call that "skips" this match
                    uint8_t recursiveMatches[kMaxMatches];
                    int recursiveScore = 0;
                    if (fuzzy_match_recursive(pattern, str + 1, recursiveScore, strBegin, matches, recursiveMatches, kMaxMatches, nextMatch, recursionCount, recursionLimit)) {

                        // Pick best recursive score
                        if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
                            std::memcpy(bestRecursiveMatches, recursiveMatches, sizeof(bestRecursiveMatches));
                            bestRecursiveScore = recursiveScore;
                        }
                        recursiveMatch = true;
                    }

                    // Advance
                    matches[nextMatch++] = static_cast<uint8_t>(str - strBegin);

                    // Clear the slot after the latest match so the caller can tell which
                    // match is the last one. nextMatch already points at that slot; only
                    // write it when it lies inside the buffer.
                    if (nextMatch < maxMatches)
                        matches[nextMatch] = 0;

                    ++pattern;
                }
                ++str;
            }

            // Determine if full pattern was matched
            const bool matched = (*pattern == '\0');

            // Calculate score
            if (matched) {
                // Iterate str to end to learn the full length of the text
                while (*str != '\0')
                    ++str;

                outScore = compute_score(strBegin, static_cast<int>(str - strBegin), matches, nextMatch);
            }

            // Return best result
            if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
                // Recursive score is better than "this".
                // Never copy more than the scratch buffer holds, even if the caller's buffer is larger.
                const int copyCount = maxMatches < kMaxMatches ? maxMatches : kMaxMatches;
                std::memcpy(matches, bestRecursiveMatches, static_cast<std::size_t>(copyCount));
                outScore = bestRecursiveScore;
                return true;
            }

            // Either "this" score is the best one, or there was no match at all
            return matched;
        }
    }

} // namespace fts
|
|
|
+
|
|
|
+#endif // FTS_FUZZY_MATCH_IMPLEMENTATION
|
|
|
+
|
|
|
+#endif // FTS_FUZZY_MATCH_H
|