| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840 |
- ///////////////////////////////////////////////////////////////////////////////
- // Copyright (c) Electronic Arts Inc. All rights reserved.
- ///////////////////////////////////////////////////////////////////////////////
- #include <EAStdC/internal/Config.h>
- #include <EAStdC/EATextUtil.h>
- #include <EAStdC/EAString.h>
/////////////////////////////////////////////////////////////////////////////
// EATEXTUTIL_MIN / EATEXTUTIL_MAX
//
// Expression macros that yield the lesser / greater of their two operands.
// When the operands compare equal, the second operand is the one selected.
// These are plain textual macros: the selected operand is expanded twice,
// so arguments with side effects (e.g. i++) must not be passed.
//
#define EATEXTUTIL_MIN(a, b) (((a) < (b)) ? (a) : (b))
#define EATEXTUTIL_MAX(a, b) (((a) > (b)) ? (a) : (b))
- namespace EA
- {
- namespace StdC
- {
- extern uint8_t utf8lengthTable[256];
- ///////////////////////////////////////////////////////////////////////////////
- // UTF8Validate
- //
- // There are multiple definitions of what a valid UTF8 string is. UTF8 allows
- // the ability to encode the same UTF16 character in multiple ways. This in
- // one sense is a legal UTF8 array. However, for some security reasons it is
- // sometimes considered that a UTF8 array is illegal (or at least 'unsafe')
- // if it encodes some character with more bytes than needed. Actually the
- // Unicode standard v3.0 says that these 'insecure' UTF8 sequences are
- // formally illegal to generate but not illegal to interpret.
- // See "http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html"
- //
- // We take the high-security approach here, though it is slower. We could write
- // a simpler function that does a non-security check with the simple table
- // of info here:
- // 0x00-0x7f are single standalone bytes.
- // 0xc2-0xFD are first byte of a multi-byte sequence.
- // 0xc2-0xdf are first byte of a pair.
- // 0xe0-0xef are first byte of a triplet.
- // 0x00-0xf7 are first byte of a quadruplet.
- // 0xf8-0xfb are first byte of a 5-tuplet.
- // 0xfc-0xfd are first byte of a 6-tuplet.
- // 0xfe-0xff are invalid bytes anywhere in a UTF8 string.
- // 0x80-0xbf are the second-sixth byte of a multi-byte sequence, though not all values are valid for all such bytes.
- //
- // See 'http://www.cl.cam.ac.uk/~mgk25/unicode.html' or search for "UTF8 FAQ"
- // on the Internet for more details on UTF8 and Unicode.
- //
- EASTDC_API bool UTF8Validate(const char8_t* pText, size_t nLength)
- {
- const uint8_t* pSource8 = (const uint8_t*)pText;
- const uint8_t* const pSource8End = pSource8 + nLength;
- while(pSource8 < pSource8End)
- {
- if(pSource8[0] < 0x80)
- ++pSource8;
- else if(pSource8[0] < 0xC2)
- break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
- else if(pSource8[0] < 0xE0) // If 2 input chars result in 1 output char...
- {
- if(pSource8End - pSource8 >= 2)
- {
- if(!((pSource8[1] ^ 0x80) < 0x40))
- break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
- pSource8 += 2;
- }
- else
- break; //The input string is not long enough to finish reading the current character.
- }
- else if(pSource8[0] < 0xF0) // If 3 input chars result in 1 output char...
- {
- if((pSource8End - pSource8) >= 3)
- {
- if(!(((pSource8[1] ^ 0x80) < 0x40) &&
- ((pSource8[2] ^ 0x80) < 0x40) &&
- (pSource8[0] >= 0xE1 || pSource8[1] >= 0xA0)))
- break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
- pSource8 += 3;
- }
- else
- break; //The input string is not long enough to finish reading the current character.
- }
- else if(pSource8[0] < 0xF8) // If 4 input chars result in 1 output char...
- {
- if((pSource8End - pSource8) >= 4)
- {
- if(!(((pSource8[1] ^ 0x80) < 0x40) &&
- ((pSource8[2] ^ 0x80) < 0x40) &&
- ((pSource8[3] ^ 0x80) < 0x40) &&
- (pSource8[0] >= 0xF1 || pSource8[1] >= 0x90)))
- break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
- pSource8 += 4;
- }
- else
- break; //The input string is not long enough to finish reading the current character.
- }
- else if(pSource8[0] < 0xFC) // If 5 input chars result in 1 output char...
- {
- if((pSource8End - pSource8) >= 5)
- {
- if(!(((pSource8[1] ^ 0x80) < 0x40) &&
- ((pSource8[2] ^ 0x80) < 0x40) &&
- ((pSource8[3] ^ 0x80) < 0x40) &&
- ((pSource8[4] ^ 0x80) < 0x40) &&
- (pSource8[0] >= 0xf9 || pSource8[1] >= 0x88)))
- break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
- pSource8 += 5;
- }
- else
- break; //The input string is not long enough to finish reading the current character.
- }
- else if(pSource8[0] < 0xFE) // If 6 input chars result in 1 output char...
- {
- if((pSource8End - pSource8) >= 6)
- {
- if(!(((pSource8[1] ^ 0x80) < 0x40) &&
- ((pSource8[2] ^ 0x80) < 0x40) &&
- ((pSource8[3] ^ 0x80) < 0x40) &&
- ((pSource8[4] ^ 0x80) < 0x40) &&
- ((pSource8[5] ^ 0x80) < 0x40) &&
- (pSource8[0] >= 0xfd || pSource8[1] >= 0x84)))
- break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
- pSource8 += 6;
- }
- else
- break; //The input string is not long enough to finish reading the current character.
- }
- else //Else the current input char is invalid.
- break;
- }
- return (pSource8 == pSource8End); // The return value is OK if we successfully processed all characters.
- }
- // Returns the pointer p incremented by n multibyte characters.
- // The string must be a valid UTF8 string or else the behavior is undefined.
- // If the string is not known to be valid, then it should be first validated independently
- // or a validating version of this function should be used instead.
- EASTDC_API char8_t* UTF8Increment(const char8_t* p, size_t n)
- {
- while(n--)
- {
- // To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp
- const int c = (uint8_t)*p;
- if (c <= 0xc1) // Actually, any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char.
- p += 1;
- else if(c <= 0xdf)
- p += 2;
- else if(c <= 0xef)
- p += 3;
- else if(c <= 0xf7)
- p += 4;
- else if(c <= 0xfb)
- p += 5;
- else if(c <= 0xfd)
- p += 6;
- else
- p += 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer.
- }
- return (char8_t*)p;
- }
- // Returns the pointer p decremented by n multibyte characters.
- // The string must be decrementable by the given number of characters or else
- // the behavior becomes undefined.
- // The string must be a valid UTF8 string or else the behavior is undefined.
- // If the string is not known to be valid, then it should be first validated independently
- // or a validating version of this function should be used instead.
- EASTDC_API char8_t* UTF8Decrement(const char8_t* p, size_t n)
- {
- while(n)
- {
- if(!UTF8IsFollowByte(*--p))
- --n;
- }
- return (char8_t*)p;
- }
- // Returns number of Unicode characters are in the UTF8-encoded string.
- // Return value will be <= Strlen(pString).
- // The string p must be 0-terminated or the behavior of this function is undefined.
- // The string must be a valid UTF8 string or else the behavior is undefined.
- // If the string is not known to be valid, then it should be first validated independently
- // or a validating version of this function should be used instead.
- EASTDC_API size_t UTF8Length(const char8_t* p)
- {
- size_t n = 0;
- while(*p)
- {
- if((*p & 0xc0) != 0x80) // If this is a leading char...
- ++n;
- ++p;
- }
- return n;
- }
- // Returns number of characters that would be in a UTF8-encoded string.
- // Return value will be >= Strlen(pString).
- // The string p must be 0-terminated or the behavior of this function is undefined.
- EASTDC_API size_t UTF8Length(const char16_t* p)
- {
- size_t n = 0;
- uint32_t c;
- while((c = *p++) != 0)
- {
- if(c < 0x00000080)
- n += 1;
- else if(c < 0x00000800)
- n += 2;
- else // if(c < 0x00010000)
- n += 3;
- }
- return n;
- }
- // Returns number of characters that would be in a UTF8-encoded string.
- // Return value will be >= Strlen(pString).
- // The string p must be 0-terminated or the behavior of this function is undefined.
- // Assumes the input values are valid, else the return value will be wrong.
- EASTDC_API size_t UTF8Length(const char32_t* p)
- {
- size_t n = 0;
- uint32_t c;
- while((c = (uint32_t)*p++) != 0)
- {
- if(c < 0x00000080)
- n += 1;
- else if(c < 0x00000800)
- n += 2;
- else if(c < 0x00010000)
- n += 3;
- else if(c < 0x00200000)
- n += 4;
- else if(c < 0x04000000)
- n += 5;
- else if(c <= 0x7fffffff)
- n += 6;
- else
- n += 1; // Error
- }
- return n;
- }
- ///////////////////////////////////////////////////////////////////////////////
- // UTF8CharSize
- //
- // Returns the byte length of the UTF8 multibyte char pointed to by p.
- // The input p must point to the beginning of a UTF8 multibyte sequence,
- // else the return value is 1.
- //
- // 0x00-0x80 are single bytes.
- // 0x81-0xc1 are invalid values for a leading UTF8 char.
- // 0xc2-0xdf are first byte of a pair.
- // 0xe0-0xef are first byte of a triplet.
- // 0xf0-0xf7 are first byte of a quadruplet.
- // 0xf8-0xfb are first byte of a 5-tuplet.
- // 0xfc-0xfd are first byte of a 6-tuplet.
- // 0xfe-0xff are invalid values for a leading UTF8 char.
- //
- EASTDC_API size_t UTF8CharSize(const char8_t* p)
- {
- // To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp
- const int c = (uint8_t)*p;
- if (c <= 0xc1) // Any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char.
- return 1;
- else if(c <= 0xdf)
- return 2;
- else if(c <= 0xef)
- return 3;
- else if(c <= 0xf7) // This refers to a unicode point > char16_t
- return 4;
- else if(c <= 0xfb) // This refers to a unicode point > char16_t
- return 5;
- else if(c <= 0xfd) // This refers to a unicode point > char16_t
- return 6;
- return 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer.
- }
- EASTDC_API size_t UTF8CharSize(char16_t c)
- {
- if(c < 0x00000080)
- return 1;
- else if(c < 0x00000800)
- return 2;
- else // if(c < 0x00010000)
- return 3;
- // The following would be used if the input was 32 bit instead of 16 bit.
- //else if(c < 0x00010000)
- // return 3;
- //else if(c < 0x00200000)
- // return 4;
- //else if(c < 0x04000000)
- // return 5;
- //else if(c <= 0x7fffffff)
- // return 6;
- //
- //return 1; // Error
- }
- EASTDC_API size_t UTF8CharSize(char32_t c)
- {
- if((uint32_t)c < 0x00000080)
- return 1;
- else if((uint32_t)c < 0x00000800)
- return 2;
- else if((uint32_t)c < 0x00010000)
- return 3;
- else if((uint32_t)c < 0x00200000)
- return 4;
- else if((uint32_t)c < 0x04000000)
- return 5;
- else if((uint32_t)c < 0x80000000)
- return 6;
-
- return 1; // Error
- }
- EASTDC_API char16_t UTF8ReadChar(const char8_t* p, const char8_t** ppEnd)
- {
- char16_t c = 0;
- const char8_t* pCurrent;
- uint8_t cChar0((uint8_t)*p), cChar1, cChar2, cChar3;
- //assert((cChar0 != 0xFE) && (cChar0 != 0xFF)); // No byte can contain 0xFE or 0xFF
- if(cChar0 < 0x80)
- {
- c = cChar0;
- pCurrent = p + 1;
- }
- else
- {
- //assert((cChar0 & 0xC0) == 0xC0); // The top two bits need to be equal to 1
- if((cChar0 & 0xE0) == 0xC0)
- {
- c = (char16_t)((cChar0 & 0x1F) << 6);
- cChar1 = static_cast<uint8_t>(p[1]);
- //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
- c |= cChar1 & 0x3F;
- //assert(c >= 0x0080 && c < 0x0800); // Check that we have the smallest coding
- pCurrent = p + 2;
- }
- else if((cChar0 & 0xF0) == 0xE0)
- {
- c = (char16_t)((cChar0 & 0xF) << 12);
- cChar1 = static_cast<uint8_t>(p[1]);
- //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
- c |= (cChar1 & 0x3F) << 6;
- cChar2 = static_cast<uint8_t>(p[2]);
- //assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
- c |= cChar2 & 0x3F;
- //assert(c >= 0x00000800 && c < 0x00010000); // Check that we have the smallest coding
- pCurrent = p + 3;
- }
- else
- {
- //assert((cChar0 & 0xf8) == 0xf0); // We handle the unicode but not UCS-4
- c = (char16_t)((cChar0 & 0x7) << 18);
- cChar1 = static_cast<uint8_t>(p[1]);
- //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
- c |= (char16_t)((cChar1 & 0x3F) << 12);
- cChar2 = static_cast<uint8_t>(p[2]);
- //assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
- c |= (cChar2 & 0x3F) << 6;
- cChar3 = static_cast<uint8_t>(p[3]);
- //assert((cChar3 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
- c |= cChar3 & 0x3F;
- //assert(c >= 0x00010000 && c <= 0x0010FFFF); // Check that we have the smallest coding, Unicode and not ucs-4
- pCurrent = p + 4;
- }
- }
- if(ppEnd)
- *ppEnd = pCurrent;
- return c;
- }
- // This function assumes that there is enough space at p to write the char.
- // At most three bytes are needed to write a char16_t value and 6 bytes are
- // needed to write a char32_t value.
- EASTDC_API char8_t* UTF8WriteChar(char8_t* p, char16_t c)
- {
- if(c < 0x80)
- {
- *p++ = (char8_t)(uint8_t)c;
- }
- else if(c < 0x0800)
- {
- *p++ = (char8_t)(uint8_t)((c >> 6) | 0xC0);
- *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
- }
- else // if(c < 0x00010000)
- {
- *p++ = (char8_t)(uint8_t)((c >> 12) | 0xE0);
- *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
- *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
- }
- //else
- //{
- // *p++ = (char8_t)(uint8_t)((c >> 18) | 0xF0);
- // *p++ = (char8_t)(uint8_t)(((c >> 12) & 0x3F) | 0x80);
- // *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
- // *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
- //}
- return p;
- }
- // This function assumes that there is enough space at p to write the char.
- // At most three bytes are needed to write a char32_t value and 6 bytes are
- // needed to write a char32_t value.
- EASTDC_API char8_t* UTF8WriteChar(char8_t* p, char32_t c)
- {
- if((uint32_t)c < 0x80)
- {
- *p++ = (char8_t)(uint8_t)c;
- }
- else if((uint32_t)c < 0x0800)
- {
- *p++ = (char8_t)(uint8_t)((c >> 6) | 0xC0);
- *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
- }
- else if((uint32_t)c < 0x00010000)
- {
- *p++ = (char8_t)(uint8_t)((c >> 12) | 0xE0);
- *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
- *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
- }
- else
- {
- *p++ = (char8_t)(uint8_t)((c >> 18) | 0xF0);
- *p++ = (char8_t)(uint8_t)(((c >> 12) & 0x3F) | 0x80);
- *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
- *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
- }
- return p;
- }
- /// UTF8TrimPartialChar
- ///
- /// Trim the string to the last valid UTF8 character. This function has no effect on a UTF8 string that has
- /// entirely valid UTF8 content. It only trims the string if there is an incomplete UTF8 sequence at the
- /// end. The resulting string will always be a valid UTF8 string, whereas the input string may not be.
- /// Returns the strlen of the trimmed string.
- size_t UTF8TrimPartialChar(char8_t* pString, size_t nLength)
- {
- size_t validPos = 0;
- while(validPos < nLength)
- {
- uint8_t ch = (uint8_t)pString[validPos];
- size_t length = utf8lengthTable[ch];
-
- // length = 0 means invalid UTF8 marker
- if((length == 0) || ((validPos + length) > nLength))
- break;
- else
- validPos += length;
- }
- pString[validPos] = 0;
- return validPos;
- }
- ///////////////////////////////////////////////////////////////////////////////
- // UTF8ReplaceInvalidChar
- //
- // This function replaces all invalidate UTF8 characters with the user provided
- // 8-bit replacement. The returned character array is guaranteed null-terminated.
- //
- EASTDC_API char8_t* UTF8ReplaceInvalidChar(const char8_t* pIn, size_t nLength, char8_t* pOut, char8_t replaceWith)
- {
- size_t validPos = 0;
- while(validPos < nLength)
- {
- uint8_t ch = (uint8_t)pIn[validPos];
- size_t length = utf8lengthTable[ch];
-
- // length = 0 means invalid UTF8 marker
- if((length == 0) || ((validPos + length) > nLength))
- {
- pOut[validPos++] = replaceWith;
- }
- else
- {
- for(auto i = validPos; i < validPos + length; i++)
- pOut[i] = pIn[i];
- validPos += length;
- }
- }
- pOut[validPos] = 0;
- return pOut + validPos;
- }
///////////////////////////////////////////////////////////////////////////////
// MatchPattern
//
// Returns true if the 0-terminated string pElement matches the 0-terminated
// wildcard pattern pPattern in its entirety.
//     '*' in the pattern matches any run of zero or more characters.
//     '?' in the pattern matches exactly one character.
//     Any other pattern character must equal the element character.
// There is no escape mechanism; '*' and '?' in the pattern are always wildcards.
//
// Used by the WildcardMatch function.
//
// The match is done iteratively with a single backtrack point (the most recent
// '*'), so no recursion is involved and the cost is bounded by the product of
// the two string lengths. Any number of consecutive '*' behaves like one '*'.
//
template <class CharT>
bool MatchPattern(const CharT* pElement, const CharT* pPattern)
{
    const CharT* pStarPattern = NULL; // Position of the most recent '*' in the pattern, if any.
    const CharT* pStarElement = NULL; // Element position at which that '*' currently stops matching.

    while(*pElement)
    {
        if(*pPattern == (CharT)'*')
        {
            // Remember the star and first try letting it match nothing.
            pStarPattern = pPattern++;
            pStarElement = pElement;
        }
        else if((*pPattern == (CharT)'?') || (*pPattern == *pElement))
        {
            // Single character match. (*pElement is non-zero here, so an exhausted
            // pattern can never satisfy the equality test.)
            ++pPattern;
            ++pElement;
        }
        else if(pStarPattern)
        {
            // Mismatch: let the most recent star absorb one more element character and retry.
            pPattern = pStarPattern + 1;
            pElement = ++pStarElement;
        }
        else
        {
            return false; // Mismatch with no star to fall back on.
        }
    }

    // The element is used up; whatever remains of the pattern must be able to match nothing.
    while(*pPattern == (CharT)'*')
        ++pPattern;

    return !*pPattern;
}
- ///////////////////////////////////////////////////////////////////////////////
- // WildcardMatch
- //
- // We go through extra effort below to avoid doing memory allocation in most cases.
- //
- EASTDC_API bool WildcardMatch(const char8_t* pString, const char8_t* pPattern, bool bCaseSensitive)
- {
- if(bCaseSensitive)
- return MatchPattern(pString, pPattern);
- else
- {
- // Do efficient string conversion to lower case...
- char8_t pStringLBuffer[384];
- char8_t* pStringL;
- char8_t* pStringLAllocated;
- size_t nStringLLength = Strlen(pString);
- if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
- {
- pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char[]") char[nStringLLength + 1];
- pStringL = pStringLAllocated;
- }
- else
- {
- pStringLAllocated = NULL;
- pStringL = pStringLBuffer;
- }
- Strcpy(pStringL, pString);
- Strlwr(pStringL);
- // Do efficient pattern conversion to lower case...
- char8_t pPatternLBuffer[32];
- char8_t* pPatternL;
- char8_t* pPatternLAllocated;
- size_t nPatternLLength = Strlen(pPattern);
- if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
- {
- pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char[]") char[nPatternLLength + 1];
- pPatternL = pPatternLAllocated;
- }
- else
- {
- pPatternLAllocated = NULL;
- pPatternL = pPatternLBuffer;
- }
- Strcpy(pPatternL, pPattern);
- Strlwr(pPatternL);
- const bool bResult = MatchPattern(pStringL, pPatternL);
- delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
- delete[] pPatternLAllocated;
- return bResult;
- }
- }
- ///////////////////////////////////////////////////////////////////////////////
- // WildcardMatch
- //
- // We go through extra effort below to avoid doing memory allocation in most cases.
- //
- EASTDC_API bool WildcardMatch(const char16_t* pString, const char16_t* pPattern, bool bCaseSensitive)
- {
- if(bCaseSensitive)
- return MatchPattern(pString, pPattern);
- else
- {
- // Do efficient string conversion to lower case...
- char16_t pStringLBuffer[384];
- char16_t* pStringL;
- char16_t* pStringLAllocated;
- size_t nStringLLength = Strlen(pString);
- if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
- {
- pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char16[]") char16_t[nStringLLength + 1];
- pStringL = pStringLAllocated;
- }
- else
- {
- pStringLAllocated = NULL;
- pStringL = pStringLBuffer;
- }
- Strcpy(pStringL, pString);
- Strlwr(pStringL);
- // Do efficient pattern conversion to lower case...
- char16_t pPatternLBuffer[32];
- char16_t* pPatternL;
- char16_t* pPatternLAllocated;
- size_t nPatternLLength = Strlen(pPattern);
- if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
- {
- pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char16[]") char16_t[nPatternLLength + 1];
- pPatternL = pPatternLAllocated;
- }
- else
- {
- pPatternLAllocated = NULL;
- pPatternL = pPatternLBuffer;
- }
- Strcpy(pPatternL, pPattern);
- Strlwr(pPatternL);
- const bool bResult = MatchPattern(pStringL, pPatternL);
- delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
- delete[] pPatternLAllocated;
- return bResult;
- }
- }
- ///////////////////////////////////////////////////////////////////////////////
- // WildcardMatch
- //
- // We go through extra effort below to avoid doing memory allocation in most cases.
- //
- EASTDC_API bool WildcardMatch(const char32_t* pString, const char32_t* pPattern, bool bCaseSensitive)
- {
- if(bCaseSensitive)
- return MatchPattern(pString, pPattern);
- else
- {
- // Do efficient string conversion to lower case...
- char32_t pStringLBuffer[384];
- char32_t* pStringL;
- char32_t* pStringLAllocated;
- size_t nStringLLength = Strlen(pString);
- if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
- {
- pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char32[]") char32_t[nStringLLength + 1];
- pStringL = pStringLAllocated;
- }
- else
- {
- pStringLAllocated = NULL;
- pStringL = pStringLBuffer;
- }
- Strcpy(pStringL, pString);
- Strlwr(pStringL);
- // Do efficient pattern conversion to lower case...
- char32_t pPatternLBuffer[32];
- char32_t* pPatternL;
- char32_t* pPatternLAllocated;
- size_t nPatternLLength = Strlen(pPattern);
- if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
- {
- pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char32[]") char32_t[nPatternLLength + 1];
- pPatternL = pPatternLAllocated;
- }
- else
- {
- pPatternLAllocated = NULL;
- pPatternL = pPatternLBuffer;
- }
- Strcpy(pPatternL, pPattern);
- Strlwr(pPatternL);
- const bool bResult = MatchPattern(pStringL, pPatternL);
- delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
- delete[] pPatternLAllocated;
- return bResult;
- }
- }
- //////////////////////////////////////////////////////////////////////////
- // GetTextLine
- //
- EASTDC_API const char8_t* GetTextLine(const char8_t* pText, const char8_t* pTextEnd, const char8_t** ppNewText)
- {
- if(pText < pTextEnd)
- {
- while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
- ++pText;
- if(ppNewText)
- {
- *ppNewText = pText;
- if(*ppNewText < pTextEnd)
- {
- if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
- ++*ppNewText;
- }
- }
- }
- else if(ppNewText)
- *ppNewText = pTextEnd;
- return pText;
- }
- //////////////////////////////////////////////////////////////////////////
- // GetTextLine
- //
- EASTDC_API const char16_t* GetTextLine(const char16_t* pText, const char16_t* pTextEnd, const char16_t** ppNewText)
- {
- if(pText < pTextEnd)
- {
- while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
- ++pText;
- if(ppNewText)
- {
- *ppNewText = pText;
- if(*ppNewText < pTextEnd)
- {
- if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
- ++*ppNewText;
- }
- }
- }
- else if(ppNewText)
- *ppNewText = pTextEnd;
- return pText;
- }
- //////////////////////////////////////////////////////////////////////////
- // GetTextLine
- //
- EASTDC_API const char32_t* GetTextLine(const char32_t* pText, const char32_t* pTextEnd, const char32_t** ppNewText)
- {
- if(pText < pTextEnd)
- {
- while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
- ++pText;
- if(ppNewText)
- {
- *ppNewText = pText;
- if(*ppNewText < pTextEnd)
- {
- if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
- ++*ppNewText;
- }
- }
- }
- else if(ppNewText)
- *ppNewText = pTextEnd;
- return pText;
- }
- EASTDC_API bool ParseDelimitedText(const char8_t* pText, const char8_t* pTextEnd, char8_t cDelimiter,
- const char8_t*& pToken, const char8_t*& pTokenEnd, const char8_t** ppNewText)
- {
- int nQuoteLevel = 0;
- bool bDelimiterFound = false;
- // We remove leading spaces.
- for(pToken = pText; pToken < pTextEnd; ++pToken)
- {
- if((*pToken != ' ') && (*pToken != '\t'))
- break;
- }
- for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
- {
- const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
- if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
- bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
- else
- bDelimiterFound = (*pTokenEnd == cDelimiter);
- if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
- {
- if(!bDelimiterFound)
- ++pTokenEnd;
- const bool bInQuotes = ((nQuoteLevel & 1) != 0);
- if(!bInQuotes || bLastCharacter) // If not within a quoted section...
- {
- if(ppNewText)
- *ppNewText = pTokenEnd;
- if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
- {
- // Eliminate spaces before the trailing delimiter.
- while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
- pTokenEnd--;
- }
- if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
- {
- pToken++;
- pTokenEnd--;
- }
- return true;
- }
- }
- else if(*pTokenEnd == '"')
- nQuoteLevel++;
- }
- if(ppNewText)
- *ppNewText = pTokenEnd;
- return false;
- }
- //////////////////////////////////////////////////////////////////////////
- // ParseDelimitedText
- //
- // This function takes a line text that has fields separated by delimiters
- // and parses the line into the component fields. It is common to read
- // command lines like this or to parse ini file settings like this.
- //
- EASTDC_API bool ParseDelimitedText(const char16_t* pText, const char16_t* pTextEnd, char16_t cDelimiter,
- const char16_t*& pToken, const char16_t*& pTokenEnd, const char16_t** ppNewText)
- {
- int nQuoteLevel = 0;
- bool bDelimiterFound = false;
- // We remove leading spaces.
- for(pToken = pText; pToken < pTextEnd; ++pToken)
- {
- if((*pToken != ' ') && (*pToken != '\t'))
- break;
- }
- for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
- {
- const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
- if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
- bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
- else
- bDelimiterFound = (*pTokenEnd == cDelimiter);
- if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
- {
- if(!bDelimiterFound)
- ++pTokenEnd;
- const bool bInQuotes = ((nQuoteLevel & 1) != 0);
- if(!bInQuotes || bLastCharacter) // If not within a quoted section...
- {
- if(ppNewText)
- *ppNewText = pTokenEnd;
- if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
- {
- // Eliminate spaces before the trailing delimiter.
- while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
- pTokenEnd--;
- }
- if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
- {
- pToken++;
- pTokenEnd--;
- }
- return true;
- }
- }
- else if(*pTokenEnd == '"')
- nQuoteLevel++;
- }
- if(ppNewText)
- *ppNewText = pTokenEnd;
- return false;
- }
- //////////////////////////////////////////////////////////////////////////
- // ParseDelimitedText
- //
- // This function takes a line text that has fields separated by delimiters
- // and parses the line into the component fields. It is common to read
- // command lines like this or to parse ini file settings like this.
- //
- EASTDC_API bool ParseDelimitedText(const char32_t* pText, const char32_t* pTextEnd, char32_t cDelimiter,
- const char32_t*& pToken, const char32_t*& pTokenEnd, const char32_t** ppNewText)
- {
- int nQuoteLevel = 0;
- bool bDelimiterFound = false;
- // We remove leading spaces.
- for(pToken = pText; pToken < pTextEnd; ++pToken)
- {
- if((*pToken != ' ') && (*pToken != '\t'))
- break;
- }
- for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
- {
- const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
- if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
- bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
- else
- bDelimiterFound = (*pTokenEnd == cDelimiter);
- if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
- {
- if(!bDelimiterFound)
- ++pTokenEnd;
- const bool bInQuotes = ((nQuoteLevel & 1) != 0);
- if(!bInQuotes || bLastCharacter) // If not within a quoted section...
- {
- if(ppNewText)
- *ppNewText = pTokenEnd;
- if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
- {
- // Eliminate spaces before the trailing delimiter.
- while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
- pTokenEnd--;
- }
- if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
- {
- pToken++;
- pTokenEnd--;
- }
- return true;
- }
- }
- else if(*pTokenEnd == '"')
- nQuoteLevel++;
- }
- if(ppNewText)
- *ppNewText = pTokenEnd;
- return false;
- }
- ///////////////////////////////////////////////////////////////////////////////
- // ConvertBinaryDataToASCIIArray
- //
- // Since every binary byte converts to exactly 2 ascii bytes, the ASCII
- // array must have space for at least twice the amount of bytes
- // as 'nBinaryDataLength' + 1.
- //
- EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char8_t* pASCIIArray)
- {
- const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
- const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
- while(pBinaryData < pEnd)
- {
- *pASCIIArray = (char8_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
- if(*pASCIIArray > '9')
- *pASCIIArray += 7; // Convert the ':' to 'A', for example.
- pASCIIArray++;
- *pASCIIArray = (char8_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
- if(*pASCIIArray > '9')
- *pASCIIArray += 7; // Convert the ':' to 'A', for example.
- pASCIIArray++;
- pBinaryData++;
- }
- *pASCIIArray = '\0';
- }
- EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char16_t* pASCIIArray)
- {
- const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
- const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
- while(pBinaryData < pEnd)
- {
- *pASCIIArray = (char16_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
- if(*pASCIIArray > '9')
- *pASCIIArray += 7; // Convert the ':' to 'A', for example.
- pASCIIArray++;
- *pASCIIArray = (char16_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
- if(*pASCIIArray > '9')
- *pASCIIArray += 7; // Convert the ':' to 'A', for example.
- pASCIIArray++;
- pBinaryData++;
- }
- *pASCIIArray = '\0';
- }
- EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char32_t* pASCIIArray)
- {
- const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
- const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
- while(pBinaryData < pEnd)
- {
- *pASCIIArray = (char32_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
- if(*pASCIIArray > '9')
- *pASCIIArray += 7; // Convert the ':' to 'A', for example.
- pASCIIArray++;
- *pASCIIArray = (char32_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
- if(*pASCIIArray > '9')
- *pASCIIArray += 7; // Convert the ':' to 'A', for example.
- pASCIIArray++;
- pBinaryData++;
- }
- *pASCIIArray = '\0';
- }
- //////////////////////////////////////////////////////////////////////////////
- // ConvertASCIIArrayToBinaryData (8 bit version)
- //
- // We have a boolean return value because it is possible that the ascii data is
- // corrupt. We check for this corruption and return false if so, while converting
- // all corrupt bytes to valid ones.
- //
- EASTDC_API bool ConvertASCIIArrayToBinaryData(const char8_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
- {
- uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
- const char8_t* pEnd = pASCIIArray + nASCIIArrayLength;
- char8_t cTemp;
- bool bReturnValue(true);
- while(pASCIIArray < pEnd)
- {
- *pBinaryData8 = 0;
- for(int j = 4; j >= 0; j -= 4)
- {
- cTemp = *pASCIIArray;
- if(cTemp < '0') // Do some bounds checking.
- {
- cTemp = '0';
- bReturnValue = false;
- }
- else if(cTemp > 'F') // Do some bounds checking.
- {
- if(cTemp >= 'a' && cTemp <= 'f')
- cTemp -= 39; // Convert 'a' to ':'.
- else
- {
- cTemp = '0';
- bReturnValue = false;
- }
- }
- else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
- {
- cTemp = '0';
- bReturnValue = false;
- }
- else if(cTemp >= 'A')
- cTemp -= 7;
- *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
- pASCIIArray++;
- }
- pBinaryData8++;
- }
- return bReturnValue;
- }
- //////////////////////////////////////////////////////////////////////////////
- // ConvertASCIIArrayToBinaryData (16 bit version)
- //
- // We have a boolean return value because it is possible that the ascii data is
- // corrupt. We check for this corruption and return false if so, while converting
- // all corrupt bytes to valid ones.
- //
- EASTDC_API bool ConvertASCIIArrayToBinaryData(const char16_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
- {
- uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
- const char16_t* pEnd = pASCIIArray + nASCIIArrayLength;
- char16_t cTemp;
- bool bReturnValue(true);
- while(pASCIIArray < pEnd)
- {
- *pBinaryData8 = 0;
- for(int j = 4; j >= 0; j -= 4)
- {
- cTemp = *pASCIIArray;
- if(cTemp < '0') // Do some bounds checking.
- {
- cTemp = '0';
- bReturnValue = false;
- }
- else if(cTemp > 'F') // Do some bounds checking.
- {
- if(cTemp >= 'a' && cTemp <= 'f')
- cTemp -= 39; // Convert 'a' to ':'.
- else
- {
- cTemp = '0';
- bReturnValue = false;
- }
- }
- else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
- {
- cTemp = '0';
- bReturnValue = false;
- }
- else if(cTemp >= 'A')
- cTemp -= 7;
- *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
- pASCIIArray++;
- }
- pBinaryData8++;
- }
- return bReturnValue;
- }
- //////////////////////////////////////////////////////////////////////////////
- // ConvertASCIIArrayToBinaryData (32 bit version)
- //
- // We have a boolean return value because it is possible that the ascii data is
- // corrupt. We check for this corruption and return false if so, while converting
- // all corrupt bytes to valid ones.
- //
- EASTDC_API bool ConvertASCIIArrayToBinaryData(const char32_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
- {
- uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
- const char32_t* pEnd = pASCIIArray + nASCIIArrayLength;
- char32_t cTemp;
- bool bReturnValue(true);
- while(pASCIIArray < pEnd)
- {
- *pBinaryData8 = 0;
- for(int j = 4; j >= 0; j -= 4)
- {
- cTemp = *pASCIIArray;
- if(cTemp < '0') // Do some bounds checking.
- {
- cTemp = '0';
- bReturnValue = false;
- }
- else if(cTemp > 'F') // Do some bounds checking.
- {
- if(cTemp >= 'a' && cTemp <= 'f')
- cTemp -= 39; // Convert 'a' to ':'.
- else
- {
- cTemp = '0';
- bReturnValue = false;
- }
- }
- else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
- {
- cTemp = '0';
- bReturnValue = false;
- }
- else if(cTemp >= 'A')
- cTemp -= 7;
- *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
- pASCIIArray++;
- }
- pBinaryData8++;
- }
- return bReturnValue;
- }
- //////////////////////////////////////////////////////////////////////////////
- // SplitTokenDelimited (8 bit version)
- //
- EASTDC_API bool SplitTokenDelimited(const char8_t* pSource, size_t nSourceLength, char8_t cDelimiter,
- char8_t* pToken, size_t nTokenLength, const char8_t** ppNewSource)
- {
- // terminate the token (so it appears empty if we don't find anything)
- if(pToken && nTokenLength)
- *pToken = 0;
- if(pSource && nSourceLength && *pSource)
- {
- // look for the delimiter
- for(size_t i = 0; i < nSourceLength && *pSource; i++)
- {
- const char8_t cTemp(*pSource);
- // update new source pointer if present
- if(ppNewSource)
- (*ppNewSource)++;
- if(cTemp == cDelimiter) // If there is a delimiter match...
- break; // We are done.
- else
- {
- // keep moving characters into the token until we find the delimiter or reached the end of the token string
- if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
- {
- *pToken = cTemp; // add the character
- pToken++; // increment the token pointer
- *pToken = 0; // insert terminating null character
- }
- pSource++; // increment source pointer
- }
- }
- return true;
- }
- return false;
- }
- //////////////////////////////////////////////////////////////////////////////
- // SplitTokenDelimited (16 bit version)
- //
- // Implemented by Blazej Stompel and Paul Pedriana
- //
- EASTDC_API bool SplitTokenDelimited(const char16_t* pSource, size_t nSourceLength, char16_t cDelimiter,
- char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource)
- {
- // terminate the token (so it appears empty if we don't find anything)
- if(pToken && nTokenLength)
- *pToken = 0;
- if(pSource && nSourceLength && *pSource)
- {
- // look for the delimiter
- for(size_t i = 0; i < nSourceLength && *pSource; i++)
- {
- const char16_t cTemp(*pSource);
- // update new source pointer if present
- if(ppNewSource)
- (*ppNewSource)++;
- if(cTemp == cDelimiter) // If there is a delimiter match...
- break; // We are done.
- else
- {
- // keep moving characters into the token until we find the delimiter or reached the end of the token string
- if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
- {
- *pToken = cTemp; // add the character
- pToken++; // increment the token pointer
- *pToken = 0; // insert terminating null character
- }
- pSource++; // increment source pointer
- }
- }
- return true;
- }
- return false;
- }
- //////////////////////////////////////////////////////////////////////////////
- // SplitTokenDelimited (32 bit version)
- //
- // Implemented by Blazej Stompel and Paul Pedriana
- //
- EASTDC_API bool SplitTokenDelimited(const char32_t* pSource, size_t nSourceLength, char32_t cDelimiter,
- char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource)
- {
- // terminate the token (so it appears empty if we don't find anything)
- if(pToken && nTokenLength)
- *pToken = 0;
- if(pSource && nSourceLength && *pSource)
- {
- // look for the delimiter
- for(size_t i = 0; i < nSourceLength && *pSource; i++)
- {
- const char32_t cTemp(*pSource);
- // update new source pointer if present
- if(ppNewSource)
- (*ppNewSource)++;
- if(cTemp == cDelimiter) // If there is a delimiter match...
- break; // We are done.
- else
- {
- // keep moving characters into the token until we find the delimiter or reached the end of the token string
- if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
- {
- *pToken = cTemp; // add the character
- pToken++; // increment the token pointer
- *pToken = 0; // insert terminating null character
- }
- pSource++; // increment source pointer
- }
- }
- return true;
- }
- return false;
- }
- //////////////////////////////////////////////////////////////////////////////
- // SplitTokenSeparated (8 bit version)
- //
- EASTDC_API bool SplitTokenSeparated(const char8_t* pSource, size_t nSourceLength, char8_t c,
- char8_t* pToken, size_t nTokenLength, const char8_t** ppNewSource)
- {
- // terminate the token (so it appears empty if we don't find anything)
- if(pToken && nTokenLength)
- *pToken = '\0';
- if(pSource)
- {
- // keep track of how many characters we have written to the token buffer
- size_t nTokenIndex = 0;
- // keep track whether we found the token and if we are done reading it
- bool bFoundToken = false;
- bool bReadToken = false;
- // look for the separators
- for(size_t i = 0; i < nSourceLength; i++)
- {
- // get the character
- const char8_t cTemp(*pSource);
- // quit if we found the terminating null character
- if(cTemp != '\0')
- {
- // is the character not a separator ?
- if(cTemp != c)
- {
- // we have a token
- bFoundToken = true;
- // were we done reading the token ?
- if(bReadToken)
- return true;
- else
- {
- // add the character to the token
- if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
- {
- // add the character
- *pToken = cTemp;
- // increment the token pointer
- pToken++;
- // and index
- nTokenIndex++;
- // insert terminating null character
- *pToken = '\0';
- }
- }
- }
- else
- {
- // the character is a separator - if we found our token then we are done reading it
- if(bFoundToken)
- bReadToken = true;
- }
- // update new source pointer if present
- if(ppNewSource)
- (*ppNewSource)++;
- // increment source pointer
- pSource++;
- }
- else
- {
- // we have reached the end of the string
- break;
- }
- }
- return bFoundToken;
- }
- return false;
- }
- //////////////////////////////////////////////////////////////////////////////
- // SplitTokenSeparated (16 bit version)
- //
- // Implemented by Blazej Stompel
- //
- // Unit test can be found in Foundation\Test\UnitTests
- //
- EASTDC_API bool SplitTokenSeparated(const char16_t* pSource, size_t nSourceLength, char16_t c,
- char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource)
- {
- // terminate the token (so it appears empty if we don't find anything)
- if(pToken && nTokenLength)
- *pToken = '\0';
- if(pSource)
- {
- // keep track of how many characters we have written to the token buffer
- size_t nTokenIndex = 0;
- // keep track whether we found the token and if we are done reading it
- bool bFoundToken = false;
- bool bReadToken = false;
- // look for the separators
- for(size_t i = 0; i < nSourceLength; i++)
- {
- // get the character
- const char16_t cTemp(*pSource);
- // quit if we found the terminating null character
- if(cTemp != '\0')
- {
- // is the character not a separator ?
- if(cTemp != c)
- {
- // we have a token
- bFoundToken = true;
- // were we done reading the token ?
- if(bReadToken)
- return true;
- else
- {
- // add the character to the token
- if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
- {
- // add the character
- *pToken = cTemp;
- // increment the token pointer
- pToken++;
- // and index
- nTokenIndex++;
- // insert terminating null character
- *pToken = '\0';
- }
- }
- }
- else
- {
- // the character is a separator - if we found our token then we are done reading it
- if(bFoundToken)
- bReadToken = true;
- }
- // update new source pointer if present
- if(ppNewSource)
- (*ppNewSource)++;
- // increment source pointer
- pSource++;
- }
- else
- {
- // we have reached the end of the string
- break;
- }
- }
- return bFoundToken;
- }
- return false;
- }
- //////////////////////////////////////////////////////////////////////////////
- // SplitTokenSeparated (32 bit version)
- //
- // Implemented by Blazej Stompel
- //
- // Unit test can be found in Foundation\Test\UnitTests
- //
- EASTDC_API bool SplitTokenSeparated(const char32_t* pSource, size_t nSourceLength, char32_t c,
- char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource)
- {
- // terminate the token (so it appears empty if we don't find anything)
- if(pToken && nTokenLength)
- *pToken = '\0';
- if(pSource)
- {
- // keep track of how many characters we have written to the token buffer
- size_t nTokenIndex = 0;
- // keep track whether we found the token and if we are done reading it
- bool bFoundToken = false;
- bool bReadToken = false;
- // look for the separators
- for(size_t i = 0; i < nSourceLength; i++)
- {
- // get the character
- const char32_t cTemp(*pSource);
- // quit if we found the terminating null character
- if(cTemp != '\0')
- {
- // is the character not a separator ?
- if(cTemp != c)
- {
- // we have a token
- bFoundToken = true;
- // were we done reading the token ?
- if(bReadToken)
- return true;
- else
- {
- // add the character to the token
- if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
- {
- // add the character
- *pToken = cTemp;
- // increment the token pointer
- pToken++;
- // and index
- nTokenIndex++;
- // insert terminating null character
- *pToken = '\0';
- }
- }
- }
- else
- {
- // the character is a separator - if we found our token then we are done reading it
- if(bFoundToken)
- bReadToken = true;
- }
- // update new source pointer if present
- if(ppNewSource)
- (*ppNewSource)++;
- // increment source pointer
- pSource++;
- }
- else
- {
- // we have reached the end of the string
- break;
- }
- }
- return bFoundToken;
- }
- return false;
- }
///////////////////////////////////////////////////////////////////////////////
// Boyer-Moore string search
//
// This is the "turbo" implementation defined at http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140.
// Boyer-Moore is a very fast string search compared to most others, including
// those in the STL. However, you need to be searching a string of at least 100
// chars and have a search pattern of at least 3 characters for the speed to show,
// as Boyer-Moore has a startup precalculation that costs some cycles.
// This startup precalculation is proportional to the size of your search pattern
// and the size of the alphabet in use. Thus, doing Boyer-Moore searches on the
// entire Unicode alphabet is going to incur a fairly expensive precalculation cost.
//
// BoyerMooreBadCharacterCalc
//
// This is a private function used by BoyerMooreSearch.
//
// Fills the bad-character table: every entry of pAlphabetBuffer is set to
// nPatternLength, then for each pattern character except the last, the entry
// indexed by that character's value is set to its distance from the end of
// the pattern (so the rightmost occurrence wins).
//
// Characters whose value falls outside [0, nAlphabetBufferSize) are skipped
// rather than written. This covers both values beyond the table and negative
// values, which occur for high-bit characters where plain char is signed.
//
static void BoyerMooreBadCharacterCalc(const char* pPattern, int nPatternLength,
                                       int* pAlphabetBuffer, int nAlphabetBufferSize)
{
    int i;

    for(i = 0; i < nAlphabetBufferSize; ++i)
        pAlphabetBuffer[i] = nPatternLength;

    for(i = 0; i < (nPatternLength - 1); ++i)
    {
        const int nIndex = (int)pPattern[i];

        // Never write outside the caller's table.
        if((nIndex >= 0) && (nIndex < nAlphabetBufferSize))
            pAlphabetBuffer[nIndex] = (nPatternLength - i) - 1;
    }
}
// BoyerMooreGoodSuffixCalc
//
// This is a private function used by BoyerMooreSearch.
//
// Computes the good-suffix shift table for pPattern.
//     pPatternBuffer2 (scratch/output) receives, for each index i, the length
//                     of the longest common suffix of pPattern[0..i] and the
//                     whole pattern.
//     pPatternBuffer1 (output) receives the good-suffix shift for a mismatch
//                     at each pattern index.
// Both buffers must hold at least nPatternLength ints. A pattern length of
// zero or less leaves both buffers untouched (there is nothing to compute,
// and index nPatternLength - 1 would lie before the start of the buffers).
//
static void BoyerMooreGoodSuffixCalc(const char* pPattern, int nPatternLength,
                                     int* pPatternBuffer1, int* pPatternBuffer2)
{
    int i;
    int j = 0;
    int f = 0;
    int g = nPatternLength - 1;

    if(nPatternLength <= 0)
        return;

    // Pass 1: suffix lengths. [g, f] tracks the leftmost window already known
    // to match a suffix of the pattern, so earlier results can be reused.
    pPatternBuffer2[nPatternLength - 1] = nPatternLength;

    for(i = nPatternLength - 2; i >= 0; --i)
    {
        if((i > g) && pPatternBuffer2[((i + nPatternLength) - 1) - f] < (i - g))
            pPatternBuffer2[i] = pPatternBuffer2[((i + nPatternLength) - 1) - f];
        else
        {
            if(i < g)
                g = i;
            f = i;

            while((g >= 0) && (pPattern[g] == pPattern[((g + nPatternLength) - 1) - f]))
                --g;

            pPatternBuffer2[i] = f - g;
        }
    }

    // Pass 2: shifts. Default every shift to the full pattern length.
    for(i = 0; i < nPatternLength; ++i)
        pPatternBuffer1[i] = nPatternLength;

    // Where a prefix of the pattern is also a suffix (suffix length == i + 1),
    // positions that still hold the default may shift to align with that prefix.
    for(i = nPatternLength - 1; i >= -1; --i)
    {
        if((i == -1) || (pPatternBuffer2[i] == (i + 1)))
        {
            for(; j < (nPatternLength - 1) - i; ++j)
            {
                if(pPatternBuffer1[j] == nPatternLength)
                    pPatternBuffer1[j] = (nPatternLength - 1) - i;
            }
        }
    }

    // An inner reoccurrence of a suffix allows a shorter shift; later (larger) i
    // overwrite earlier ones, so the smallest shift is kept.
    for(i = 0; i <= nPatternLength - 2; ++i)
        pPatternBuffer1[(nPatternLength - 1) - pPatternBuffer2[i]] = (nPatternLength - 1) - i;
}
- // Argument specification.
- //
- // patternBuffer1 is a user-supplied buffer and must be at least as long as the search pattern.
- // patternBuffer2 is a user-supplied buffer and must be at least as long as the search pattern.
- // alphabetBuffer is a user-supplied buffer and must be at least as long as the highest character value used in the searched string and search pattern.
- //
- EASTDC_API int BoyerMooreSearch(const char* pPattern, int nPatternLength, const char* pSearchString, int nSearchStringLength,
- int* pPatternBuffer1, int* pPatternBuffer2, int* pAlphabetBuffer, int nAlphabetBufferSize)
- {
- // Do precalculations
- BoyerMooreGoodSuffixCalc(pPattern, nPatternLength, pPatternBuffer1, pPatternBuffer2);
- BoyerMooreBadCharacterCalc(pPattern, nPatternLength, pAlphabetBuffer, nAlphabetBufferSize);
- // Do search
- for(int j = 0, shift = nPatternLength, u = 0; j <= (nSearchStringLength - nPatternLength); j += shift)
- {
- int i = nPatternLength - 1;
- while((i >= 0) && (pPattern[i] == pSearchString[i + j]))
- {
- --i;
- if((u != 0) && (i == (nPatternLength - 1) - shift))
- i -= u;
- }
- if(i < 0)
- {
- return j;
- // Only used if we were iterating multiple found items:
- //shift = pPatternBuffer1[0];
- //u = nPatternLength - shift;
- }
- else
- {
- const int v = nPatternLength - 1 - i;
- const int turboShift = u - v;
- const int bcShift = pAlphabetBuffer[(int)pSearchString[i + j]] - nPatternLength + 1 + i;
- shift = EATEXTUTIL_MAX(turboShift, bcShift);
- shift = EATEXTUTIL_MAX(shift, pPatternBuffer1[i]);
- if(shift == pPatternBuffer1[i])
- u = EATEXTUTIL_MIN(nPatternLength - shift, v);
- else
- {
- if(turboShift < bcShift)
- shift = EATEXTUTIL_MAX(shift, u + 1);
- u = 0;
- }
- }
- }
- return nPatternLength;
- }
- #undef EATEXTUTIL_MIN
- #undef EATEXTUTIL_MAX
- } // namespace StdC
- } // namespace EA
|