/////////////////////////////////////////////////////////////////////////////// // Copyright (c) Electronic Arts Inc. All rights reserved. /////////////////////////////////////////////////////////////////////////////// #include #include #include ///////////////////////////////////////////////////////////////////////////// // EATEXTUTIL_MIN / EATEXTUTIL_MAX // #define EATEXTUTIL_MIN(a, b) ((a) < (b) ? (a) : (b)) #define EATEXTUTIL_MAX(a, b) ((a) > (b) ? (a) : (b)) namespace EA { namespace StdC { extern uint8_t utf8lengthTable[256]; /////////////////////////////////////////////////////////////////////////////// // UTF8Validate // // There are multiple definitions of what a valid UTF8 string is. UTF8 allows // the ability to encode the same UTF16 character in multiple ways. This in // one sense is a legal UTF8 array. However, for some security reasons it is // sometimes considered that a UTF8 array is illegal (or at least 'unsafe') // if it encodes some character with more bytes than needed. Actually the // Unicode standard v3.0 says that these 'insecure' UTF8 sequences are // formally illegal to generate but not illegal to interpret. // See "http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html" // // We take the high-security approach here, though it is slower. We could write // a simpler function that does a non-security check with the simple table // of info here: // 0x00-0x7f are single standalone bytes. // 0xc2-0xFD are first byte of a multi-byte sequence. // 0xc2-0xdf are first byte of a pair. // 0xe0-0xef are first byte of a triplet. // 0x00-0xf7 are first byte of a quadruplet. // 0xf8-0xfb are first byte of a 5-tuplet. // 0xfc-0xfd are first byte of a 6-tuplet. // 0xfe-0xff are invalid bytes anywhere in a UTF8 string. // 0x80-0xbf are the second-sixth byte of a multi-byte sequence, though not all values are valid for all such bytes. // // See 'http://www.cl.cam.ac.uk/~mgk25/unicode.html' or search for "UTF8 FAQ" // on the Internet for more details on UTF8 and Unicode. // EASTDC_API bool UTF8Validate(const char* pText, size_t nLength) { const uint8_t* pSource8 = (const uint8_t*)pText; const uint8_t* const pSource8End = pSource8 + nLength; while(pSource8 < pSource8End) { if(pSource8[0] < 0x80) ++pSource8; else if(pSource8[0] < 0xC2) break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't. else if(pSource8[0] < 0xE0) // If 2 input chars result in 1 output char... { if(pSource8End - pSource8 >= 2) { if(!((pSource8[1] ^ 0x80) < 0x40)) break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't. pSource8 += 2; } else break; //The input string is not long enough to finish reading the current character. } else if(pSource8[0] < 0xF0) // If 3 input chars result in 1 output char... { if((pSource8End - pSource8) >= 3) { if(!(((pSource8[1] ^ 0x80) < 0x40) && ((pSource8[2] ^ 0x80) < 0x40) && (pSource8[0] >= 0xE1 || pSource8[1] >= 0xA0))) break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't. pSource8 += 3; } else break; //The input string is not long enough to finish reading the current character. } else if(pSource8[0] < 0xF8) // If 4 input chars result in 1 output char... { if((pSource8End - pSource8) >= 4) { if(!(((pSource8[1] ^ 0x80) < 0x40) && ((pSource8[2] ^ 0x80) < 0x40) && ((pSource8[3] ^ 0x80) < 0x40) && (pSource8[0] >= 0xF1 || pSource8[1] >= 0x90))) break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't. pSource8 += 4; } else break; //The input string is not long enough to finish reading the current character. } else if(pSource8[0] < 0xFC) // If 5 input chars result in 1 output char... { if((pSource8End - pSource8) >= 5) { if(!(((pSource8[1] ^ 0x80) < 0x40) && ((pSource8[2] ^ 0x80) < 0x40) && ((pSource8[3] ^ 0x80) < 0x40) && ((pSource8[4] ^ 0x80) < 0x40) && (pSource8[0] >= 0xf9 || pSource8[1] >= 0x88))) break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't. pSource8 += 5; } else break; //The input string is not long enough to finish reading the current character. } else if(pSource8[0] < 0xFE) // If 6 input chars result in 1 output char... { if((pSource8End - pSource8) >= 6) { if(!(((pSource8[1] ^ 0x80) < 0x40) && ((pSource8[2] ^ 0x80) < 0x40) && ((pSource8[3] ^ 0x80) < 0x40) && ((pSource8[4] ^ 0x80) < 0x40) && ((pSource8[5] ^ 0x80) < 0x40) && (pSource8[0] >= 0xfd || pSource8[1] >= 0x84))) break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't. pSource8 += 6; } else break; //The input string is not long enough to finish reading the current character. } else //Else the current input char is invalid. break; } return (pSource8 == pSource8End); // The return value is OK if we successfully processed all characters. } // Returns the pointer p incremented by n multibyte characters. // The string must be a valid UTF8 string or else the behavior is undefined. // If the string is not known to be valid, then it should be first validated independently // or a validating version of this function should be used instead. EASTDC_API char* UTF8Increment(const char* p, size_t n) { while(n--) { // To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp const int c = (uint8_t)*p; if (c <= 0xc1) // Actually, any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char. p += 1; else if(c <= 0xdf) p += 2; else if(c <= 0xef) p += 3; else if(c <= 0xf7) p += 4; else if(c <= 0xfb) p += 5; else if(c <= 0xfd) p += 6; else p += 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer. } return (char*)p; } // Returns the pointer p decremented by n multibyte characters. // The string must be decrementable by the given number of characters or else // the behavior becomes undefined. // The string must be a valid UTF8 string or else the behavior is undefined. // If the string is not known to be valid, then it should be first validated independently // or a validating version of this function should be used instead. EASTDC_API char* UTF8Decrement(const char* p, size_t n) { while(n) { if(!UTF8IsFollowByte(*--p)) --n; } return (char*)p; } // Returns number of Unicode characters are in the UTF8-encoded string. // Return value will be <= Strlen(pString). // The string p must be 0-terminated or the behavior of this function is undefined. // The string must be a valid UTF8 string or else the behavior is undefined. // If the string is not known to be valid, then it should be first validated independently // or a validating version of this function should be used instead. EASTDC_API size_t UTF8Length(const char* p) { size_t n = 0; while(*p) { if((*p & 0xc0) != 0x80) // If this is a leading char... ++n; ++p; } return n; } // Returns number of characters that would be in a UTF8-encoded string. // Return value will be >= Strlen(pString). // The string p must be 0-terminated or the behavior of this function is undefined. EASTDC_API size_t UTF8Length(const char16_t* p) { size_t n = 0; uint32_t c; while((c = *p++) != 0) { if(c < 0x00000080) n += 1; else if(c < 0x00000800) n += 2; else // if(c < 0x00010000) n += 3; } return n; } // Returns number of characters that would be in a UTF8-encoded string. // Return value will be >= Strlen(pString). // The string p must be 0-terminated or the behavior of this function is undefined. // Assumes the input values are valid, else the return value will be wrong. EASTDC_API size_t UTF8Length(const char32_t* p) { size_t n = 0; uint32_t c; while((c = (uint32_t)*p++) != 0) { if(c < 0x00000080) n += 1; else if(c < 0x00000800) n += 2; else if(c < 0x00010000) n += 3; else if(c < 0x00200000) n += 4; else if(c < 0x04000000) n += 5; else if(c <= 0x7fffffff) n += 6; else n += 1; // Error } return n; } /////////////////////////////////////////////////////////////////////////////// // UTF8CharSize // // Returns the byte length of the UTF8 multibyte char pointed to by p. // The input p must point to the beginning of a UTF8 multibyte sequence, // else the return value is 1. // // 0x00-0x80 are single bytes. // 0x81-0xc1 are invalid values for a leading UTF8 char. // 0xc2-0xdf are first byte of a pair. // 0xe0-0xef are first byte of a triplet. // 0xf0-0xf7 are first byte of a quadruplet. // 0xf8-0xfb are first byte of a 5-tuplet. // 0xfc-0xfd are first byte of a 6-tuplet. // 0xfe-0xff are invalid values for a leading UTF8 char. // EASTDC_API size_t UTF8CharSize(const char* p) { // To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp const int c = (uint8_t)*p; if (c <= 0xc1) // Any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char. return 1; else if(c <= 0xdf) return 2; else if(c <= 0xef) return 3; else if(c <= 0xf7) // This refers to a unicode point > char16_t return 4; else if(c <= 0xfb) // This refers to a unicode point > char16_t return 5; else if(c <= 0xfd) // This refers to a unicode point > char16_t return 6; return 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer. } EASTDC_API size_t UTF8CharSize(char16_t c) { if(c < 0x00000080) return 1; else if(c < 0x00000800) return 2; else // if(c < 0x00010000) return 3; // The following would be used if the input was 32 bit instead of 16 bit. //else if(c < 0x00010000) // return 3; //else if(c < 0x00200000) // return 4; //else if(c < 0x04000000) // return 5; //else if(c <= 0x7fffffff) // return 6; // //return 1; // Error } EASTDC_API size_t UTF8CharSize(char32_t c) { if((uint32_t)c < 0x00000080) return 1; else if((uint32_t)c < 0x00000800) return 2; else if((uint32_t)c < 0x00010000) return 3; else if((uint32_t)c < 0x00200000) return 4; else if((uint32_t)c < 0x04000000) return 5; else if((uint32_t)c < 0x80000000) return 6; return 1; // Error } EASTDC_API char16_t UTF8ReadChar(const char* p, const char** ppEnd) { char16_t c = 0; const char* pCurrent; uint8_t cChar0((uint8_t)*p), cChar1, cChar2, cChar3; //assert((cChar0 != 0xFE) && (cChar0 != 0xFF)); // No byte can contain 0xFE or 0xFF if(cChar0 < 0x80) { c = cChar0; pCurrent = p + 1; } else { //assert((cChar0 & 0xC0) == 0xC0); // The top two bits need to be equal to 1 if((cChar0 & 0xE0) == 0xC0) { c = (char16_t)((cChar0 & 0x1F) << 6); cChar1 = static_cast(p[1]); //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx c |= cChar1 & 0x3F; //assert(c >= 0x0080 && c < 0x0800); // Check that we have the smallest coding pCurrent = p + 2; } else if((cChar0 & 0xF0) == 0xE0) { c = (char16_t)((cChar0 & 0xF) << 12); cChar1 = static_cast(p[1]); //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx c |= (cChar1 & 0x3F) << 6; cChar2 = static_cast(p[2]); //assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx c |= cChar2 & 0x3F; //assert(c >= 0x00000800 && c < 0x00010000); // Check that we have the smallest coding pCurrent = p + 3; } else { //assert((cChar0 & 0xf8) == 0xf0); // We handle the unicode but not UCS-4 c = (char16_t)((cChar0 & 0x7) << 18); cChar1 = static_cast(p[1]); //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx c |= (char16_t)((cChar1 & 0x3F) << 12); cChar2 = static_cast(p[2]); //assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx c |= (cChar2 & 0x3F) << 6; cChar3 = static_cast(p[3]); //assert((cChar3 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx c |= cChar3 & 0x3F; //assert(c >= 0x00010000 && c <= 0x0010FFFF); // Check that we have the smallest coding, Unicode and not ucs-4 pCurrent = p + 4; } } if(ppEnd) *ppEnd = pCurrent; return c; } // This function assumes that there is enough space at p to write the char. // At most three bytes are needed to write a char16_t value and 6 bytes are // needed to write a char32_t value. EASTDC_API char* UTF8WriteChar(char* p, char16_t c) { if(c < 0x80) { *p++ = (char)(uint8_t)c; } else if(c < 0x0800) { *p++ = (char)(uint8_t)((c >> 6) | 0xC0); *p++ = (char)(uint8_t)((c & 0x3F) | 0x80); } else // if(c < 0x00010000) { *p++ = (char)(uint8_t)((c >> 12) | 0xE0); *p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80); *p++ = (char)(uint8_t)((c & 0x3F) | 0x80); } //else //{ // *p++ = (char)(uint8_t)((c >> 18) | 0xF0); // *p++ = (char)(uint8_t)(((c >> 12) & 0x3F) | 0x80); // *p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80); // *p++ = (char)(uint8_t)((c & 0x3F) | 0x80); //} return p; } // This function assumes that there is enough space at p to write the char. // At most three bytes are needed to write a char32_t value and 6 bytes are // needed to write a char32_t value. EASTDC_API char* UTF8WriteChar(char* p, char32_t c) { if((uint32_t)c < 0x80) { *p++ = (char)(uint8_t)c; } else if((uint32_t)c < 0x0800) { *p++ = (char)(uint8_t)((c >> 6) | 0xC0); *p++ = (char)(uint8_t)((c & 0x3F) | 0x80); } else if((uint32_t)c < 0x00010000) { *p++ = (char)(uint8_t)((c >> 12) | 0xE0); *p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80); *p++ = (char)(uint8_t)((c & 0x3F) | 0x80); } else { *p++ = (char)(uint8_t)((c >> 18) | 0xF0); *p++ = (char)(uint8_t)(((c >> 12) & 0x3F) | 0x80); *p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80); *p++ = (char)(uint8_t)((c & 0x3F) | 0x80); } return p; } /// UTF8TrimPartialChar /// /// Trim the string to the last valid UTF8 character. This function has no effect on a UTF8 string that has /// entirely valid UTF8 content. It only trims the string if there is an incomplete UTF8 sequence at the /// end. The resulting string will always be a valid UTF8 string, whereas the input string may not be. /// Returns the strlen of the trimmed string. size_t UTF8TrimPartialChar(char* pString, size_t nLength) { size_t validPos = 0; while(validPos < nLength) { uint8_t ch = (uint8_t)pString[validPos]; size_t length = utf8lengthTable[ch]; // length = 0 means invalid UTF8 marker if((length == 0) || ((validPos + length) > nLength)) break; else validPos += length; } pString[validPos] = 0; return validPos; } /////////////////////////////////////////////////////////////////////////////// // UTF8ReplaceInvalidChar // // This function replaces all invalidate UTF8 characters with the user provided // 8-bit replacement. The returned character array is guaranteed null-terminated. // EASTDC_API char* UTF8ReplaceInvalidChar(const char* pIn, size_t nLength, char* pOut, char replaceWith) { size_t validPos = 0; while(validPos < nLength) { uint8_t ch = (uint8_t)pIn[validPos]; size_t length = utf8lengthTable[ch]; // length = 0 means invalid UTF8 marker if((length == 0) || ((validPos + length) > nLength)) { pOut[validPos++] = replaceWith; } else { for(auto i = validPos; i < validPos + length; i++) pOut[i] = pIn[i]; validPos += length; } } pOut[validPos] = 0; return pOut + validPos; } /////////////////////////////////////////////////////////////////////////////// // MatchPattern // // This function is recursively called on substrings. // Used by the WildcardMatch function. // template bool MatchPattern(const CharT* pElement, const CharT* pPattern) { if((*pPattern == (CharT)'*') && !pPattern[1]) return true; // The pattern is set to match everything, so return true. else if(!*pElement && *pPattern) return false; // The element is empty but the pattern is not, so return false. else if(!*pElement) return true; // The element and pattern are both empty, so we are done. Return true. else { if(*pPattern == (CharT)'*') { if(MatchPattern(pElement, pPattern+1)) // What this section does is try to match source segments to return true; // the '*' portion of the pattern. As many parts of the source that else // can be assigned to the '*' portion of the pattern are done. If return MatchPattern(pElement+1, pPattern); // not possible, we pop out of the whole thing. } else if(*pPattern == (CharT)'?') return MatchPattern(pElement+1, pPattern+1); // The pattern accepts any character here, so move onto the next character. else { if(*pElement == *pPattern) return MatchPattern(pElement+1, pPattern+1); // The current element and pattern chars match, so move onto next character. else return false; // The current element char simply doesn't match the pattern char, so return false. } } // return true; // This should never get executed, but some compilers might not be smart enough to realize it. } /////////////////////////////////////////////////////////////////////////////// // WildcardMatch // // We go through extra effort below to avoid doing memory allocation in most cases. // EASTDC_API bool WildcardMatch(const char* pString, const char* pPattern, bool bCaseSensitive) { if(bCaseSensitive) return MatchPattern(pString, pPattern); else { // Do efficient string conversion to lower case... char pStringLBuffer[384]; char* pStringL; char* pStringLAllocated; size_t nStringLLength = Strlen(pString); if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1)) { pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char[]") char[nStringLLength + 1]; pStringL = pStringLAllocated; } else { pStringLAllocated = NULL; pStringL = pStringLBuffer; } Strcpy(pStringL, pString); Strlwr(pStringL); // Do efficient pattern conversion to lower case... char pPatternLBuffer[32]; char* pPatternL; char* pPatternLAllocated; size_t nPatternLLength = Strlen(pPattern); if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1)) { pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char[]") char[nPatternLLength + 1]; pPatternL = pPatternLAllocated; } else { pPatternLAllocated = NULL; pPatternL = pPatternLBuffer; } Strcpy(pPatternL, pPattern); Strlwr(pPatternL); const bool bResult = MatchPattern(pStringL, pPatternL); delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect. delete[] pPatternLAllocated; return bResult; } } /////////////////////////////////////////////////////////////////////////////// // WildcardMatch // // We go through extra effort below to avoid doing memory allocation in most cases. // EASTDC_API bool WildcardMatch(const char16_t* pString, const char16_t* pPattern, bool bCaseSensitive) { if(bCaseSensitive) return MatchPattern(pString, pPattern); else { // Do efficient string conversion to lower case... char16_t pStringLBuffer[384]; char16_t* pStringL; char16_t* pStringLAllocated; size_t nStringLLength = Strlen(pString); if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1)) { pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char16[]") char16_t[nStringLLength + 1]; pStringL = pStringLAllocated; } else { pStringLAllocated = NULL; pStringL = pStringLBuffer; } Strcpy(pStringL, pString); Strlwr(pStringL); // Do efficient pattern conversion to lower case... char16_t pPatternLBuffer[32]; char16_t* pPatternL; char16_t* pPatternLAllocated; size_t nPatternLLength = Strlen(pPattern); if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1)) { pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char16[]") char16_t[nPatternLLength + 1]; pPatternL = pPatternLAllocated; } else { pPatternLAllocated = NULL; pPatternL = pPatternLBuffer; } Strcpy(pPatternL, pPattern); Strlwr(pPatternL); const bool bResult = MatchPattern(pStringL, pPatternL); delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect. delete[] pPatternLAllocated; return bResult; } } /////////////////////////////////////////////////////////////////////////////// // WildcardMatch // // We go through extra effort below to avoid doing memory allocation in most cases. // EASTDC_API bool WildcardMatch(const char32_t* pString, const char32_t* pPattern, bool bCaseSensitive) { if(bCaseSensitive) return MatchPattern(pString, pPattern); else { // Do efficient string conversion to lower case... char32_t pStringLBuffer[384]; char32_t* pStringL; char32_t* pStringLAllocated; size_t nStringLLength = Strlen(pString); if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1)) { pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char32[]") char32_t[nStringLLength + 1]; pStringL = pStringLAllocated; } else { pStringLAllocated = NULL; pStringL = pStringLBuffer; } Strcpy(pStringL, pString); Strlwr(pStringL); // Do efficient pattern conversion to lower case... char32_t pPatternLBuffer[32]; char32_t* pPatternL; char32_t* pPatternLAllocated; size_t nPatternLLength = Strlen(pPattern); if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1)) { pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char32[]") char32_t[nPatternLLength + 1]; pPatternL = pPatternLAllocated; } else { pPatternLAllocated = NULL; pPatternL = pPatternLBuffer; } Strcpy(pPatternL, pPattern); Strlwr(pPatternL); const bool bResult = MatchPattern(pStringL, pPatternL); delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect. delete[] pPatternLAllocated; return bResult; } } ////////////////////////////////////////////////////////////////////////// // GetTextLine // EASTDC_API const char* GetTextLine(const char* pText, const char* pTextEnd, const char** ppNewText) { if(pText < pTextEnd) { while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n')) ++pText; if(ppNewText) { *ppNewText = pText; if(*ppNewText < pTextEnd) { if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n')) ++*ppNewText; } } } else if(ppNewText) *ppNewText = pTextEnd; return pText; } ////////////////////////////////////////////////////////////////////////// // GetTextLine // EASTDC_API const char16_t* GetTextLine(const char16_t* pText, const char16_t* pTextEnd, const char16_t** ppNewText) { if(pText < pTextEnd) { while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n')) ++pText; if(ppNewText) { *ppNewText = pText; if(*ppNewText < pTextEnd) { if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n')) ++*ppNewText; } } } else if(ppNewText) *ppNewText = pTextEnd; return pText; } ////////////////////////////////////////////////////////////////////////// // GetTextLine // EASTDC_API const char32_t* GetTextLine(const char32_t* pText, const char32_t* pTextEnd, const char32_t** ppNewText) { if(pText < pTextEnd) { while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n')) ++pText; if(ppNewText) { *ppNewText = pText; if(*ppNewText < pTextEnd) { if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n')) ++*ppNewText; } } } else if(ppNewText) *ppNewText = pTextEnd; return pText; } EASTDC_API bool ParseDelimitedText(const char* pText, const char* pTextEnd, char cDelimiter, const char*& pToken, const char*& pTokenEnd, const char** ppNewText) { int nQuoteLevel = 0; bool bDelimiterFound = false; // We remove leading spaces. for(pToken = pText; pToken < pTextEnd; ++pToken) { if((*pToken != ' ') && (*pToken != '\t')) break; } for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd) { const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd); if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace. bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t')); else bDelimiterFound = (*pTokenEnd == cDelimiter); if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character... { if(!bDelimiterFound) ++pTokenEnd; const bool bInQuotes = ((nQuoteLevel & 1) != 0); if(!bInQuotes || bLastCharacter) // If not within a quoted section... { if(ppNewText) *ppNewText = pTokenEnd; if((cDelimiter != ' ') && (pTokenEnd != pTextEnd)) { // Eliminate spaces before the trailing delimiter. while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t'))) pTokenEnd--; } if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"')) { pToken++; pTokenEnd--; } return true; } } else if(*pTokenEnd == '"') nQuoteLevel++; } if(ppNewText) *ppNewText = pTokenEnd; return false; } ////////////////////////////////////////////////////////////////////////// // ParseDelimitedText // // This function takes a line text that has fields separated by delimiters // and parses the line into the component fields. It is common to read // command lines like this or to parse ini file settings like this. // EASTDC_API bool ParseDelimitedText(const char16_t* pText, const char16_t* pTextEnd, char16_t cDelimiter, const char16_t*& pToken, const char16_t*& pTokenEnd, const char16_t** ppNewText) { int nQuoteLevel = 0; bool bDelimiterFound = false; // We remove leading spaces. for(pToken = pText; pToken < pTextEnd; ++pToken) { if((*pToken != ' ') && (*pToken != '\t')) break; } for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd) { const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd); if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace. bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t')); else bDelimiterFound = (*pTokenEnd == cDelimiter); if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character... { if(!bDelimiterFound) ++pTokenEnd; const bool bInQuotes = ((nQuoteLevel & 1) != 0); if(!bInQuotes || bLastCharacter) // If not within a quoted section... { if(ppNewText) *ppNewText = pTokenEnd; if((cDelimiter != ' ') && (pTokenEnd != pTextEnd)) { // Eliminate spaces before the trailing delimiter. while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t'))) pTokenEnd--; } if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"')) { pToken++; pTokenEnd--; } return true; } } else if(*pTokenEnd == '"') nQuoteLevel++; } if(ppNewText) *ppNewText = pTokenEnd; return false; } ////////////////////////////////////////////////////////////////////////// // ParseDelimitedText // // This function takes a line text that has fields separated by delimiters // and parses the line into the component fields. It is common to read // command lines like this or to parse ini file settings like this. // EASTDC_API bool ParseDelimitedText(const char32_t* pText, const char32_t* pTextEnd, char32_t cDelimiter, const char32_t*& pToken, const char32_t*& pTokenEnd, const char32_t** ppNewText) { int nQuoteLevel = 0; bool bDelimiterFound = false; // We remove leading spaces. for(pToken = pText; pToken < pTextEnd; ++pToken) { if((*pToken != ' ') && (*pToken != '\t')) break; } for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd) { const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd); if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace. bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t')); else bDelimiterFound = (*pTokenEnd == cDelimiter); if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character... { if(!bDelimiterFound) ++pTokenEnd; const bool bInQuotes = ((nQuoteLevel & 1) != 0); if(!bInQuotes || bLastCharacter) // If not within a quoted section... { if(ppNewText) *ppNewText = pTokenEnd; if((cDelimiter != ' ') && (pTokenEnd != pTextEnd)) { // Eliminate spaces before the trailing delimiter. while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t'))) pTokenEnd--; } if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"')) { pToken++; pTokenEnd--; } return true; } } else if(*pTokenEnd == '"') nQuoteLevel++; } if(ppNewText) *ppNewText = pTokenEnd; return false; } /////////////////////////////////////////////////////////////////////////////// // ConvertBinaryDataToASCIIArray // // Since every binary byte converts to exactly 2 ascii bytes, the ASCII // array must have space for at least twice the amount of bytes // as 'nBinaryDataLength' + 1. // EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char* pASCIIArray) { const uint8_t* pBinaryData = (uint8_t*)pBinaryData_; const uint8_t* pEnd = pBinaryData + nBinaryDataLength; while(pBinaryData < pEnd) { *pASCIIArray = (char)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15. if(*pASCIIArray > '9') *pASCIIArray += 7; // Convert the ':' to 'A', for example. pASCIIArray++; *pASCIIArray = (char)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15. if(*pASCIIArray > '9') *pASCIIArray += 7; // Convert the ':' to 'A', for example. pASCIIArray++; pBinaryData++; } *pASCIIArray = '\0'; } EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char16_t* pASCIIArray) { const uint8_t* pBinaryData = (uint8_t*)pBinaryData_; const uint8_t* pEnd = pBinaryData + nBinaryDataLength; while(pBinaryData < pEnd) { *pASCIIArray = (char16_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15. if(*pASCIIArray > '9') *pASCIIArray += 7; // Convert the ':' to 'A', for example. pASCIIArray++; *pASCIIArray = (char16_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15. if(*pASCIIArray > '9') *pASCIIArray += 7; // Convert the ':' to 'A', for example. pASCIIArray++; pBinaryData++; } *pASCIIArray = '\0'; } EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char32_t* pASCIIArray) { const uint8_t* pBinaryData = (uint8_t*)pBinaryData_; const uint8_t* pEnd = pBinaryData + nBinaryDataLength; while(pBinaryData < pEnd) { *pASCIIArray = (char32_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15. if(*pASCIIArray > '9') *pASCIIArray += 7; // Convert the ':' to 'A', for example. pASCIIArray++; *pASCIIArray = (char32_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15. if(*pASCIIArray > '9') *pASCIIArray += 7; // Convert the ':' to 'A', for example. pASCIIArray++; pBinaryData++; } *pASCIIArray = '\0'; } ////////////////////////////////////////////////////////////////////////////// // ConvertASCIIArrayToBinaryData (8 bit version) // // We have a boolean return value because it is possible that the ascii data is // corrupt. We check for this corruption and return false if so, while converting // all corrupt bytes to valid ones. // EASTDC_API bool ConvertASCIIArrayToBinaryData(const char* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData) { uint8_t* pBinaryData8 = (uint8_t*)pBinaryData; const char* pEnd = pASCIIArray + nASCIIArrayLength; char cTemp; bool bReturnValue(true); while(pASCIIArray < pEnd) { *pBinaryData8 = 0; for(int j = 4; j >= 0; j -= 4) { cTemp = *pASCIIArray; if(cTemp < '0') // Do some bounds checking. { cTemp = '0'; bReturnValue = false; } else if(cTemp > 'F') // Do some bounds checking. { if(cTemp >= 'a' && cTemp <= 'f') cTemp -= 39; // Convert 'a' to ':'. else { cTemp = '0'; bReturnValue = false; } } else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking. { cTemp = '0'; bReturnValue = false; } else if(cTemp >= 'A') cTemp -= 7; *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j)); pASCIIArray++; } pBinaryData8++; } return bReturnValue; } ////////////////////////////////////////////////////////////////////////////// // ConvertASCIIArrayToBinaryData (16 bit version) // // We have a boolean return value because it is possible that the ascii data is // corrupt. We check for this corruption and return false if so, while converting // all corrupt bytes to valid ones. // EASTDC_API bool ConvertASCIIArrayToBinaryData(const char16_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData) { uint8_t* pBinaryData8 = (uint8_t*)pBinaryData; const char16_t* pEnd = pASCIIArray + nASCIIArrayLength; char16_t cTemp; bool bReturnValue(true); while(pASCIIArray < pEnd) { *pBinaryData8 = 0; for(int j = 4; j >= 0; j -= 4) { cTemp = *pASCIIArray; if(cTemp < '0') // Do some bounds checking. { cTemp = '0'; bReturnValue = false; } else if(cTemp > 'F') // Do some bounds checking. { if(cTemp >= 'a' && cTemp <= 'f') cTemp -= 39; // Convert 'a' to ':'. else { cTemp = '0'; bReturnValue = false; } } else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking. { cTemp = '0'; bReturnValue = false; } else if(cTemp >= 'A') cTemp -= 7; *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j)); pASCIIArray++; } pBinaryData8++; } return bReturnValue; } ////////////////////////////////////////////////////////////////////////////// // ConvertASCIIArrayToBinaryData (32 bit version) // // We have a boolean return value because it is possible that the ascii data is // corrupt. We check for this corruption and return false if so, while converting // all corrupt bytes to valid ones. // EASTDC_API bool ConvertASCIIArrayToBinaryData(const char32_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData) { uint8_t* pBinaryData8 = (uint8_t*)pBinaryData; const char32_t* pEnd = pASCIIArray + nASCIIArrayLength; char32_t cTemp; bool bReturnValue(true); while(pASCIIArray < pEnd) { *pBinaryData8 = 0; for(int j = 4; j >= 0; j -= 4) { cTemp = *pASCIIArray; if(cTemp < '0') // Do some bounds checking. { cTemp = '0'; bReturnValue = false; } else if(cTemp > 'F') // Do some bounds checking. { if(cTemp >= 'a' && cTemp <= 'f') cTemp -= 39; // Convert 'a' to ':'. else { cTemp = '0'; bReturnValue = false; } } else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking. { cTemp = '0'; bReturnValue = false; } else if(cTemp >= 'A') cTemp -= 7; *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j)); pASCIIArray++; } pBinaryData8++; } return bReturnValue; } ////////////////////////////////////////////////////////////////////////////// // SplitTokenDelimited (8 bit version) // EASTDC_API bool SplitTokenDelimited(const char* pSource, size_t nSourceLength, char cDelimiter, char* pToken, size_t nTokenLength, const char** ppNewSource) { // terminate the token (so it appears empty if we don't find anything) if(pToken && nTokenLength) *pToken = 0; if(pSource && nSourceLength && *pSource) { // look for the delimiter for(size_t i = 0; i < nSourceLength && *pSource; i++) { const char cTemp(*pSource); // update new source pointer if present if(ppNewSource) (*ppNewSource)++; if(cTemp == cDelimiter) // If there is a delimiter match... break; // We are done. else { // keep moving characters into the token until we find the delimiter or reached the end of the token string if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null { *pToken = cTemp; // add the character pToken++; // increment the token pointer *pToken = 0; // insert terminating null character } pSource++; // increment source pointer } } return true; } return false; } ////////////////////////////////////////////////////////////////////////////// // SplitTokenDelimited (16 bit version) // // Implemented by Blazej Stompel and Paul Pedriana // EASTDC_API bool SplitTokenDelimited(const char16_t* pSource, size_t nSourceLength, char16_t cDelimiter, char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource) { // terminate the token (so it appears empty if we don't find anything) if(pToken && nTokenLength) *pToken = 0; if(pSource && nSourceLength && *pSource) { // look for the delimiter for(size_t i = 0; i < nSourceLength && *pSource; i++) { const char16_t cTemp(*pSource); // update new source pointer if present if(ppNewSource) (*ppNewSource)++; if(cTemp == cDelimiter) // If there is a delimiter match... break; // We are done. else { // keep moving characters into the token until we find the delimiter or reached the end of the token string if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null { *pToken = cTemp; // add the character pToken++; // increment the token pointer *pToken = 0; // insert terminating null character } pSource++; // increment source pointer } } return true; } return false; } ////////////////////////////////////////////////////////////////////////////// // SplitTokenDelimited (32 bit version) // // Implemented by Blazej Stompel and Paul Pedriana // EASTDC_API bool SplitTokenDelimited(const char32_t* pSource, size_t nSourceLength, char32_t cDelimiter, char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource) { // terminate the token (so it appears empty if we don't find anything) if(pToken && nTokenLength) *pToken = 0; if(pSource && nSourceLength && *pSource) { // look for the delimiter for(size_t i = 0; i < nSourceLength && *pSource; i++) { const char32_t cTemp(*pSource); // update new source pointer if present if(ppNewSource) (*ppNewSource)++; if(cTemp == cDelimiter) // If there is a delimiter match... break; // We are done. else { // keep moving characters into the token until we find the delimiter or reached the end of the token string if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null { *pToken = cTemp; // add the character pToken++; // increment the token pointer *pToken = 0; // insert terminating null character } pSource++; // increment source pointer } } return true; } return false; } ////////////////////////////////////////////////////////////////////////////// // SplitTokenSeparated (8 bit version) // EASTDC_API bool SplitTokenSeparated(const char* pSource, size_t nSourceLength, char c, char* pToken, size_t nTokenLength, const char** ppNewSource) { // terminate the token (so it appears empty if we don't find anything) if(pToken && nTokenLength) *pToken = '\0'; if(pSource) { // keep track of how many characters we have written to the token buffer size_t nTokenIndex = 0; // keep track whether we found the token and if we are done reading it bool bFoundToken = false; bool bReadToken = false; // look for the separators for(size_t i = 0; i < nSourceLength; i++) { // get the character const char cTemp(*pSource); // quit if we found the terminating null character if(cTemp != '\0') { // is the character not a separator ? if(cTemp != c) { // we have a token bFoundToken = true; // were we done reading the token ? if(bReadToken) return true; else { // add the character to the token if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null { // add the character *pToken = cTemp; // increment the token pointer pToken++; // and index nTokenIndex++; // insert terminating null character *pToken = '\0'; } } } else { // the character is a separator - if we found our token then we are done reading it if(bFoundToken) bReadToken = true; } // update new source pointer if present if(ppNewSource) (*ppNewSource)++; // increment source pointer pSource++; } else { // we have reached the end of the string break; } } return bFoundToken; } return false; } ////////////////////////////////////////////////////////////////////////////// // SplitTokenSeparated (16 bit version) // // Implemented by Blazej Stompel // // Unit test can be found in Foundation\Test\UnitTests // EASTDC_API bool SplitTokenSeparated(const char16_t* pSource, size_t nSourceLength, char16_t c, char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource) { // terminate the token (so it appears empty if we don't find anything) if(pToken && nTokenLength) *pToken = '\0'; if(pSource) { // keep track of how many characters we have written to the token buffer size_t nTokenIndex = 0; // keep track whether we found the token and if we are done reading it bool bFoundToken = false; bool bReadToken = false; // look for the separators for(size_t i = 0; i < nSourceLength; i++) { // get the character const char16_t cTemp(*pSource); // quit if we found the terminating null character if(cTemp != '\0') { // is the character not a separator ? if(cTemp != c) { // we have a token bFoundToken = true; // were we done reading the token ? if(bReadToken) return true; else { // add the character to the token if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null { // add the character *pToken = cTemp; // increment the token pointer pToken++; // and index nTokenIndex++; // insert terminating null character *pToken = '\0'; } } } else { // the character is a separator - if we found our token then we are done reading it if(bFoundToken) bReadToken = true; } // update new source pointer if present if(ppNewSource) (*ppNewSource)++; // increment source pointer pSource++; } else { // we have reached the end of the string break; } } return bFoundToken; } return false; } ////////////////////////////////////////////////////////////////////////////// // SplitTokenSeparated (32 bit version) // // Implemented by Blazej Stompel // // Unit test can be found in Foundation\Test\UnitTests // EASTDC_API bool SplitTokenSeparated(const char32_t* pSource, size_t nSourceLength, char32_t c, char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource) { // terminate the token (so it appears empty if we don't find anything) if(pToken && nTokenLength) *pToken = '\0'; if(pSource) { // keep track of how many characters we have written to the token buffer size_t nTokenIndex = 0; // keep track whether we found the token and if we are done reading it bool bFoundToken = false; bool bReadToken = false; // look for the separators for(size_t i = 0; i < nSourceLength; i++) { // get the character const char32_t cTemp(*pSource); // quit if we found the terminating null character if(cTemp != '\0') { // is the character not a separator ? if(cTemp != c) { // we have a token bFoundToken = true; // were we done reading the token ? if(bReadToken) return true; else { // add the character to the token if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null { // add the character *pToken = cTemp; // increment the token pointer pToken++; // and index nTokenIndex++; // insert terminating null character *pToken = '\0'; } } } else { // the character is a separator - if we found our token then we are done reading it if(bFoundToken) bReadToken = true; } // update new source pointer if present if(ppNewSource) (*ppNewSource)++; // increment source pointer pSource++; } else { // we have reached the end of the string break; } } return bFoundToken; } return false; } /////////////////////////////////////////////////////////////////////////////// // Boyer-Moore string search // // This is the "turbo" implementation defined at http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140. // Boyer-Moore is a very fast string search compared to most others, including // those in the STL. However, you need to be searching a string of at least 100 // chars and have a search pattern of at least 3 characters for the speed to show, // as Boyer-Moore has a startup precalculation that costs some cycles. // This startup precalculation is proportional to the size of your search pattern // and the size of the alphabet in use. Thus, doing Boyer-Moore searches on the // entire Unicode alphabet is going to incur a fairly expensive precalculation cost. // // This is a private function used by BoyerMooreSearch. // static void BoyerMooreBadCharacterCalc(const char* pPattern, int nPatternLength, int* pAlphabetBuffer, int nAlphabetBufferSize) { int i; for(i = 0; i < nAlphabetBufferSize; ++i) pAlphabetBuffer[i] = nPatternLength; for(i = 0; i < (nPatternLength - 1); ++i) pAlphabetBuffer[(int)pPattern[i]] = (nPatternLength - i) - 1; } // This is a private function used by BoyerMooreSearch. // static void BoyerMooreGoodSuffixCalc(const char* pPattern, int nPatternLength, int* pPatternBuffer1, int* pPatternBuffer2) { int i; int j = 0; int f = 0; int g = nPatternLength - 1; pPatternBuffer2[nPatternLength - 1] = nPatternLength; for(i = nPatternLength - 2; i >= 0; --i) { if((i > g) && pPatternBuffer2[((i + nPatternLength) - 1) - f] < (i - g)) pPatternBuffer2[i] = pPatternBuffer2[((i + nPatternLength) - 1) - f]; else { if(i < g) g = i; f = i; while((g >= 0) && (pPattern[g] == pPattern[((g + nPatternLength) - 1) - f])) --g; pPatternBuffer2[i] = f - g; } } for(i = 0; i < nPatternLength; ++i) pPatternBuffer1[i] = nPatternLength; for(i = nPatternLength - 1; i >= -1; --i) { if((i == -1) || (pPatternBuffer2[i] == (i + 1))) { for(; j < (nPatternLength - 1) - i; ++j) { if(pPatternBuffer1[j] == nPatternLength) pPatternBuffer1[j] = (nPatternLength - 1) - i; } } } for(i = 0; i <= nPatternLength - 2; ++i) pPatternBuffer1[(nPatternLength - 1) - pPatternBuffer2[i]] = (nPatternLength - 1) - i; } // Argument specification. // // patternBuffer1 is a user-supplied buffer and must be at least as long as the search pattern. // patternBuffer2 is a user-supplied buffer and must be at least as long as the search pattern. // alphabetBuffer is a user-supplied buffer and must be at least as long as the highest character value used in the searched string and search pattern. // EASTDC_API int BoyerMooreSearch(const char* pPattern, int nPatternLength, const char* pSearchString, int nSearchStringLength, int* pPatternBuffer1, int* pPatternBuffer2, int* pAlphabetBuffer, int nAlphabetBufferSize) { // Do precalculations BoyerMooreGoodSuffixCalc(pPattern, nPatternLength, pPatternBuffer1, pPatternBuffer2); BoyerMooreBadCharacterCalc(pPattern, nPatternLength, pAlphabetBuffer, nAlphabetBufferSize); // Do search for(int j = 0, shift = nPatternLength, u = 0; j <= (nSearchStringLength - nPatternLength); j += shift) { int i = nPatternLength - 1; while((i >= 0) && (pPattern[i] == pSearchString[i + j])) { --i; if((u != 0) && (i == (nPatternLength - 1) - shift)) i -= u; } if(i < 0) { return j; // Only used if we were iterating multiple found items: //shift = pPatternBuffer1[0]; //u = nPatternLength - shift; } else { const int v = nPatternLength - 1 - i; const int turboShift = u - v; const int bcShift = pAlphabetBuffer[(int)pSearchString[i + j]] - nPatternLength + 1 + i; shift = EATEXTUTIL_MAX(turboShift, bcShift); shift = EATEXTUTIL_MAX(shift, pPatternBuffer1[i]); if(shift == pPatternBuffer1[i]) u = EATEXTUTIL_MIN(nPatternLength - shift, v); else { if(turboShift < bcShift) shift = EATEXTUTIL_MAX(shift, u + 1); u = 0; } } } return nPatternLength; } #undef EATEXTUTIL_MIN #undef EATEXTUTIL_MAX } // namespace StdC } // namespace EA