EATextUtil.cpp 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840
  1. ///////////////////////////////////////////////////////////////////////////////
  2. // Copyright (c) Electronic Arts Inc. All rights reserved.
  3. ///////////////////////////////////////////////////////////////////////////////
  4. #include <EAStdC/internal/Config.h>
  5. #include <EAStdC/EATextUtil.h>
  6. #include <EAStdC/EAString.h>
  7. /////////////////////////////////////////////////////////////////////////////
  8. // EATEXTUTIL_MIN / EATEXTUTIL_MAX
  9. //
  10. #define EATEXTUTIL_MIN(a, b) ((a) < (b) ? (a) : (b))
  11. #define EATEXTUTIL_MAX(a, b) ((a) > (b) ? (a) : (b))
  12. namespace EA
  13. {
  14. namespace StdC
  15. {
  16. extern uint8_t utf8lengthTable[256];
  17. ///////////////////////////////////////////////////////////////////////////////
  18. // UTF8Validate
  19. //
  20. // There are multiple definitions of what a valid UTF8 string is. UTF8 allows
  21. // the ability to encode the same UTF16 character in multiple ways. This in
  22. // one sense is a legal UTF8 array. However, for some security reasons it is
  23. // sometimes considered that a UTF8 array is illegal (or at least 'unsafe')
  24. // if it encodes some character with more bytes than needed. Actually the
  25. // Unicode standard v3.0 says that these 'insecure' UTF8 sequences are
  26. // formally illegal to generate but not illegal to interpret.
  27. // See "http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html"
  28. //
  29. // We take the high-security approach here, though it is slower. We could write
  30. // a simpler function that does a non-security check with the simple table
  31. // of info here:
  32. // 0x00-0x7f are single standalone bytes.
  33. // 0xc2-0xFD are first byte of a multi-byte sequence.
  34. // 0xc2-0xdf are first byte of a pair.
  35. // 0xe0-0xef are first byte of a triplet.
  36. // 0x00-0xf7 are first byte of a quadruplet.
  37. // 0xf8-0xfb are first byte of a 5-tuplet.
  38. // 0xfc-0xfd are first byte of a 6-tuplet.
  39. // 0xfe-0xff are invalid bytes anywhere in a UTF8 string.
  40. // 0x80-0xbf are the second-sixth byte of a multi-byte sequence, though not all values are valid for all such bytes.
  41. //
  42. // See 'http://www.cl.cam.ac.uk/~mgk25/unicode.html' or search for "UTF8 FAQ"
  43. // on the Internet for more details on UTF8 and Unicode.
  44. //
  45. EASTDC_API bool UTF8Validate(const char8_t* pText, size_t nLength)
  46. {
  47. const uint8_t* pSource8 = (const uint8_t*)pText;
  48. const uint8_t* const pSource8End = pSource8 + nLength;
  49. while(pSource8 < pSource8End)
  50. {
  51. if(pSource8[0] < 0x80)
  52. ++pSource8;
  53. else if(pSource8[0] < 0xC2)
  54. break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
  55. else if(pSource8[0] < 0xE0) // If 2 input chars result in 1 output char...
  56. {
  57. if(pSource8End - pSource8 >= 2)
  58. {
  59. if(!((pSource8[1] ^ 0x80) < 0x40))
  60. break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
  61. pSource8 += 2;
  62. }
  63. else
  64. break; //The input string is not long enough to finish reading the current character.
  65. }
  66. else if(pSource8[0] < 0xF0) // If 3 input chars result in 1 output char...
  67. {
  68. if((pSource8End - pSource8) >= 3)
  69. {
  70. if(!(((pSource8[1] ^ 0x80) < 0x40) &&
  71. ((pSource8[2] ^ 0x80) < 0x40) &&
  72. (pSource8[0] >= 0xE1 || pSource8[1] >= 0xA0)))
  73. break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
  74. pSource8 += 3;
  75. }
  76. else
  77. break; //The input string is not long enough to finish reading the current character.
  78. }
  79. else if(pSource8[0] < 0xF8) // If 4 input chars result in 1 output char...
  80. {
  81. if((pSource8End - pSource8) >= 4)
  82. {
  83. if(!(((pSource8[1] ^ 0x80) < 0x40) &&
  84. ((pSource8[2] ^ 0x80) < 0x40) &&
  85. ((pSource8[3] ^ 0x80) < 0x40) &&
  86. (pSource8[0] >= 0xF1 || pSource8[1] >= 0x90)))
  87. break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
  88. pSource8 += 4;
  89. }
  90. else
  91. break; //The input string is not long enough to finish reading the current character.
  92. }
  93. else if(pSource8[0] < 0xFC) // If 5 input chars result in 1 output char...
  94. {
  95. if((pSource8End - pSource8) >= 5)
  96. {
  97. if(!(((pSource8[1] ^ 0x80) < 0x40) &&
  98. ((pSource8[2] ^ 0x80) < 0x40) &&
  99. ((pSource8[3] ^ 0x80) < 0x40) &&
  100. ((pSource8[4] ^ 0x80) < 0x40) &&
  101. (pSource8[0] >= 0xf9 || pSource8[1] >= 0x88)))
  102. break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
  103. pSource8 += 5;
  104. }
  105. else
  106. break; //The input string is not long enough to finish reading the current character.
  107. }
  108. else if(pSource8[0] < 0xFE) // If 6 input chars result in 1 output char...
  109. {
  110. if((pSource8End - pSource8) >= 6)
  111. {
  112. if(!(((pSource8[1] ^ 0x80) < 0x40) &&
  113. ((pSource8[2] ^ 0x80) < 0x40) &&
  114. ((pSource8[3] ^ 0x80) < 0x40) &&
  115. ((pSource8[4] ^ 0x80) < 0x40) &&
  116. ((pSource8[5] ^ 0x80) < 0x40) &&
  117. (pSource8[0] >= 0xfd || pSource8[1] >= 0x84)))
  118. break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
  119. pSource8 += 6;
  120. }
  121. else
  122. break; //The input string is not long enough to finish reading the current character.
  123. }
  124. else //Else the current input char is invalid.
  125. break;
  126. }
  127. return (pSource8 == pSource8End); // The return value is OK if we successfully processed all characters.
  128. }
  129. // Returns the pointer p incremented by n multibyte characters.
  130. // The string must be a valid UTF8 string or else the behavior is undefined.
  131. // If the string is not known to be valid, then it should be first validated independently
  132. // or a validating version of this function should be used instead.
  133. EASTDC_API char8_t* UTF8Increment(const char8_t* p, size_t n)
  134. {
  135. while(n--)
  136. {
  137. // To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp
  138. const int c = (uint8_t)*p;
  139. if (c <= 0xc1) // Actually, any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char.
  140. p += 1;
  141. else if(c <= 0xdf)
  142. p += 2;
  143. else if(c <= 0xef)
  144. p += 3;
  145. else if(c <= 0xf7)
  146. p += 4;
  147. else if(c <= 0xfb)
  148. p += 5;
  149. else if(c <= 0xfd)
  150. p += 6;
  151. else
  152. p += 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer.
  153. }
  154. return (char8_t*)p;
  155. }
  156. // Returns the pointer p decremented by n multibyte characters.
  157. // The string must be decrementable by the given number of characters or else
  158. // the behavior becomes undefined.
  159. // The string must be a valid UTF8 string or else the behavior is undefined.
  160. // If the string is not known to be valid, then it should be first validated independently
  161. // or a validating version of this function should be used instead.
  162. EASTDC_API char8_t* UTF8Decrement(const char8_t* p, size_t n)
  163. {
  164. while(n)
  165. {
  166. if(!UTF8IsFollowByte(*--p))
  167. --n;
  168. }
  169. return (char8_t*)p;
  170. }
  171. // Returns number of Unicode characters are in the UTF8-encoded string.
  172. // Return value will be <= Strlen(pString).
  173. // The string p must be 0-terminated or the behavior of this function is undefined.
  174. // The string must be a valid UTF8 string or else the behavior is undefined.
  175. // If the string is not known to be valid, then it should be first validated independently
  176. // or a validating version of this function should be used instead.
  177. EASTDC_API size_t UTF8Length(const char8_t* p)
  178. {
  179. size_t n = 0;
  180. while(*p)
  181. {
  182. if((*p & 0xc0) != 0x80) // If this is a leading char...
  183. ++n;
  184. ++p;
  185. }
  186. return n;
  187. }
  188. // Returns number of characters that would be in a UTF8-encoded string.
  189. // Return value will be >= Strlen(pString).
  190. // The string p must be 0-terminated or the behavior of this function is undefined.
  191. EASTDC_API size_t UTF8Length(const char16_t* p)
  192. {
  193. size_t n = 0;
  194. uint32_t c;
  195. while((c = *p++) != 0)
  196. {
  197. if(c < 0x00000080)
  198. n += 1;
  199. else if(c < 0x00000800)
  200. n += 2;
  201. else // if(c < 0x00010000)
  202. n += 3;
  203. }
  204. return n;
  205. }
  206. // Returns number of characters that would be in a UTF8-encoded string.
  207. // Return value will be >= Strlen(pString).
  208. // The string p must be 0-terminated or the behavior of this function is undefined.
  209. // Assumes the input values are valid, else the return value will be wrong.
  210. EASTDC_API size_t UTF8Length(const char32_t* p)
  211. {
  212. size_t n = 0;
  213. uint32_t c;
  214. while((c = (uint32_t)*p++) != 0)
  215. {
  216. if(c < 0x00000080)
  217. n += 1;
  218. else if(c < 0x00000800)
  219. n += 2;
  220. else if(c < 0x00010000)
  221. n += 3;
  222. else if(c < 0x00200000)
  223. n += 4;
  224. else if(c < 0x04000000)
  225. n += 5;
  226. else if(c <= 0x7fffffff)
  227. n += 6;
  228. else
  229. n += 1; // Error
  230. }
  231. return n;
  232. }
  233. ///////////////////////////////////////////////////////////////////////////////
  234. // UTF8CharSize
  235. //
  236. // Returns the byte length of the UTF8 multibyte char pointed to by p.
  237. // The input p must point to the beginning of a UTF8 multibyte sequence,
  238. // else the return value is 1.
  239. //
  240. // 0x00-0x80 are single bytes.
  241. // 0x81-0xc1 are invalid values for a leading UTF8 char.
  242. // 0xc2-0xdf are first byte of a pair.
  243. // 0xe0-0xef are first byte of a triplet.
  244. // 0xf0-0xf7 are first byte of a quadruplet.
  245. // 0xf8-0xfb are first byte of a 5-tuplet.
  246. // 0xfc-0xfd are first byte of a 6-tuplet.
  247. // 0xfe-0xff are invalid values for a leading UTF8 char.
  248. //
  249. EASTDC_API size_t UTF8CharSize(const char8_t* p)
  250. {
  251. // To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp
  252. const int c = (uint8_t)*p;
  253. if (c <= 0xc1) // Any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char.
  254. return 1;
  255. else if(c <= 0xdf)
  256. return 2;
  257. else if(c <= 0xef)
  258. return 3;
  259. else if(c <= 0xf7) // This refers to a unicode point > char16_t
  260. return 4;
  261. else if(c <= 0xfb) // This refers to a unicode point > char16_t
  262. return 5;
  263. else if(c <= 0xfd) // This refers to a unicode point > char16_t
  264. return 6;
  265. return 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer.
  266. }
  267. EASTDC_API size_t UTF8CharSize(char16_t c)
  268. {
  269. if(c < 0x00000080)
  270. return 1;
  271. else if(c < 0x00000800)
  272. return 2;
  273. else // if(c < 0x00010000)
  274. return 3;
  275. // The following would be used if the input was 32 bit instead of 16 bit.
  276. //else if(c < 0x00010000)
  277. // return 3;
  278. //else if(c < 0x00200000)
  279. // return 4;
  280. //else if(c < 0x04000000)
  281. // return 5;
  282. //else if(c <= 0x7fffffff)
  283. // return 6;
  284. //
  285. //return 1; // Error
  286. }
  287. EASTDC_API size_t UTF8CharSize(char32_t c)
  288. {
  289. if((uint32_t)c < 0x00000080)
  290. return 1;
  291. else if((uint32_t)c < 0x00000800)
  292. return 2;
  293. else if((uint32_t)c < 0x00010000)
  294. return 3;
  295. else if((uint32_t)c < 0x00200000)
  296. return 4;
  297. else if((uint32_t)c < 0x04000000)
  298. return 5;
  299. else if((uint32_t)c < 0x80000000)
  300. return 6;
  301. return 1; // Error
  302. }
  303. EASTDC_API char16_t UTF8ReadChar(const char8_t* p, const char8_t** ppEnd)
  304. {
  305. char16_t c = 0;
  306. const char8_t* pCurrent;
  307. uint8_t cChar0((uint8_t)*p), cChar1, cChar2, cChar3;
  308. //assert((cChar0 != 0xFE) && (cChar0 != 0xFF)); // No byte can contain 0xFE or 0xFF
  309. if(cChar0 < 0x80)
  310. {
  311. c = cChar0;
  312. pCurrent = p + 1;
  313. }
  314. else
  315. {
  316. //assert((cChar0 & 0xC0) == 0xC0); // The top two bits need to be equal to 1
  317. if((cChar0 & 0xE0) == 0xC0)
  318. {
  319. c = (char16_t)((cChar0 & 0x1F) << 6);
  320. cChar1 = static_cast<uint8_t>(p[1]);
  321. //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
  322. c |= cChar1 & 0x3F;
  323. //assert(c >= 0x0080 && c < 0x0800); // Check that we have the smallest coding
  324. pCurrent = p + 2;
  325. }
  326. else if((cChar0 & 0xF0) == 0xE0)
  327. {
  328. c = (char16_t)((cChar0 & 0xF) << 12);
  329. cChar1 = static_cast<uint8_t>(p[1]);
  330. //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
  331. c |= (cChar1 & 0x3F) << 6;
  332. cChar2 = static_cast<uint8_t>(p[2]);
  333. //assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
  334. c |= cChar2 & 0x3F;
  335. //assert(c >= 0x00000800 && c < 0x00010000); // Check that we have the smallest coding
  336. pCurrent = p + 3;
  337. }
  338. else
  339. {
  340. //assert((cChar0 & 0xf8) == 0xf0); // We handle the unicode but not UCS-4
  341. c = (char16_t)((cChar0 & 0x7) << 18);
  342. cChar1 = static_cast<uint8_t>(p[1]);
  343. //assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
  344. c |= (char16_t)((cChar1 & 0x3F) << 12);
  345. cChar2 = static_cast<uint8_t>(p[2]);
  346. //assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
  347. c |= (cChar2 & 0x3F) << 6;
  348. cChar3 = static_cast<uint8_t>(p[3]);
  349. //assert((cChar3 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
  350. c |= cChar3 & 0x3F;
  351. //assert(c >= 0x00010000 && c <= 0x0010FFFF); // Check that we have the smallest coding, Unicode and not ucs-4
  352. pCurrent = p + 4;
  353. }
  354. }
  355. if(ppEnd)
  356. *ppEnd = pCurrent;
  357. return c;
  358. }
  359. // This function assumes that there is enough space at p to write the char.
  360. // At most three bytes are needed to write a char16_t value and 6 bytes are
  361. // needed to write a char32_t value.
  362. EASTDC_API char8_t* UTF8WriteChar(char8_t* p, char16_t c)
  363. {
  364. if(c < 0x80)
  365. {
  366. *p++ = (char8_t)(uint8_t)c;
  367. }
  368. else if(c < 0x0800)
  369. {
  370. *p++ = (char8_t)(uint8_t)((c >> 6) | 0xC0);
  371. *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
  372. }
  373. else // if(c < 0x00010000)
  374. {
  375. *p++ = (char8_t)(uint8_t)((c >> 12) | 0xE0);
  376. *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
  377. *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
  378. }
  379. //else
  380. //{
  381. // *p++ = (char8_t)(uint8_t)((c >> 18) | 0xF0);
  382. // *p++ = (char8_t)(uint8_t)(((c >> 12) & 0x3F) | 0x80);
  383. // *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
  384. // *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
  385. //}
  386. return p;
  387. }
  388. // This function assumes that there is enough space at p to write the char.
  389. // At most three bytes are needed to write a char32_t value and 6 bytes are
  390. // needed to write a char32_t value.
  391. EASTDC_API char8_t* UTF8WriteChar(char8_t* p, char32_t c)
  392. {
  393. if((uint32_t)c < 0x80)
  394. {
  395. *p++ = (char8_t)(uint8_t)c;
  396. }
  397. else if((uint32_t)c < 0x0800)
  398. {
  399. *p++ = (char8_t)(uint8_t)((c >> 6) | 0xC0);
  400. *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
  401. }
  402. else if((uint32_t)c < 0x00010000)
  403. {
  404. *p++ = (char8_t)(uint8_t)((c >> 12) | 0xE0);
  405. *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
  406. *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
  407. }
  408. else
  409. {
  410. *p++ = (char8_t)(uint8_t)((c >> 18) | 0xF0);
  411. *p++ = (char8_t)(uint8_t)(((c >> 12) & 0x3F) | 0x80);
  412. *p++ = (char8_t)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
  413. *p++ = (char8_t)(uint8_t)((c & 0x3F) | 0x80);
  414. }
  415. return p;
  416. }
  417. /// UTF8TrimPartialChar
  418. ///
  419. /// Trim the string to the last valid UTF8 character. This function has no effect on a UTF8 string that has
  420. /// entirely valid UTF8 content. It only trims the string if there is an incomplete UTF8 sequence at the
  421. /// end. The resulting string will always be a valid UTF8 string, whereas the input string may not be.
  422. /// Returns the strlen of the trimmed string.
  423. size_t UTF8TrimPartialChar(char8_t* pString, size_t nLength)
  424. {
  425. size_t validPos = 0;
  426. while(validPos < nLength)
  427. {
  428. uint8_t ch = (uint8_t)pString[validPos];
  429. size_t length = utf8lengthTable[ch];
  430. // length = 0 means invalid UTF8 marker
  431. if((length == 0) || ((validPos + length) > nLength))
  432. break;
  433. else
  434. validPos += length;
  435. }
  436. pString[validPos] = 0;
  437. return validPos;
  438. }
  439. ///////////////////////////////////////////////////////////////////////////////
  440. // UTF8ReplaceInvalidChar
  441. //
  442. // This function replaces all invalidate UTF8 characters with the user provided
  443. // 8-bit replacement. The returned character array is guaranteed null-terminated.
  444. //
  445. EASTDC_API char8_t* UTF8ReplaceInvalidChar(const char8_t* pIn, size_t nLength, char8_t* pOut, char8_t replaceWith)
  446. {
  447. size_t validPos = 0;
  448. while(validPos < nLength)
  449. {
  450. uint8_t ch = (uint8_t)pIn[validPos];
  451. size_t length = utf8lengthTable[ch];
  452. // length = 0 means invalid UTF8 marker
  453. if((length == 0) || ((validPos + length) > nLength))
  454. {
  455. pOut[validPos++] = replaceWith;
  456. }
  457. else
  458. {
  459. for(auto i = validPos; i < validPos + length; i++)
  460. pOut[i] = pIn[i];
  461. validPos += length;
  462. }
  463. }
  464. pOut[validPos] = 0;
  465. return pOut + validPos;
  466. }
  ///////////////////////////////////////////////////////////////////////////////
  // MatchPattern
  //
  // Returns true if the 0-terminated string pElement matches the 0-terminated
  // wildcard pattern pPattern in its entirety.
  //     '*'  matches any run of characters, including an empty run.
  //     '?'  matches exactly one character.
  //     Any other pattern character matches only itself (case-sensitively).
  // Used by the WildcardMatch function.
  //
  // The match is done iteratively: when a mismatch occurs we return to the most
  // recent '*' and let it absorb one more element character. Only the latest
  // '*' ever needs to be revisited, so no recursion is required and the stack
  // use does not depend on the string length. Consecutive and trailing '*'
  // characters are handled uniformly (e.g. "abc" matches "abc**").
  //
  template <class CharT>
  bool MatchPattern(const CharT* pElement, const CharT* pPattern)
  {
      const CharT* pResumePattern = NULL; // Pattern position just after the most recent '*'.
      const CharT* pResumeElement = NULL; // Element position that '*' is currently matched up to.

      while(*pElement)
      {
          if(*pPattern == (CharT)'*')
          {
              // Start by letting the '*' match nothing; remember where to come back to.
              pResumePattern = ++pPattern;
              pResumeElement = pElement;
          }
          else if((*pPattern == (CharT)'?') || (*pPattern == *pElement))
          {
              // Single character match. (*pElement is non-zero here, so an exhausted
              // pattern cannot satisfy the equality test.)
              ++pElement;
              ++pPattern;
          }
          else if(pResumePattern)
          {
              // Mismatch: extend the most recent '*' by one element character and retry.
              pPattern = pResumePattern;
              pElement = ++pResumeElement;
          }
          else
              return false; // Mismatch with no '*' to fall back on.
      }

      // The element is exhausted; the rest of the pattern may only consist of '*'.
      while(*pPattern == (CharT)'*')
          ++pPattern;

      return !*pPattern;
  }
  503. ///////////////////////////////////////////////////////////////////////////////
  504. // WildcardMatch
  505. //
  506. // We go through extra effort below to avoid doing memory allocation in most cases.
  507. //
  508. EASTDC_API bool WildcardMatch(const char8_t* pString, const char8_t* pPattern, bool bCaseSensitive)
  509. {
  510. if(bCaseSensitive)
  511. return MatchPattern(pString, pPattern);
  512. else
  513. {
  514. // Do efficient string conversion to lower case...
  515. char8_t pStringLBuffer[384];
  516. char8_t* pStringL;
  517. char8_t* pStringLAllocated;
  518. size_t nStringLLength = Strlen(pString);
  519. if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
  520. {
  521. pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char[]") char[nStringLLength + 1];
  522. pStringL = pStringLAllocated;
  523. }
  524. else
  525. {
  526. pStringLAllocated = NULL;
  527. pStringL = pStringLBuffer;
  528. }
  529. Strcpy(pStringL, pString);
  530. Strlwr(pStringL);
  531. // Do efficient pattern conversion to lower case...
  532. char8_t pPatternLBuffer[32];
  533. char8_t* pPatternL;
  534. char8_t* pPatternLAllocated;
  535. size_t nPatternLLength = Strlen(pPattern);
  536. if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
  537. {
  538. pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char[]") char[nPatternLLength + 1];
  539. pPatternL = pPatternLAllocated;
  540. }
  541. else
  542. {
  543. pPatternLAllocated = NULL;
  544. pPatternL = pPatternLBuffer;
  545. }
  546. Strcpy(pPatternL, pPattern);
  547. Strlwr(pPatternL);
  548. const bool bResult = MatchPattern(pStringL, pPatternL);
  549. delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
  550. delete[] pPatternLAllocated;
  551. return bResult;
  552. }
  553. }
  554. ///////////////////////////////////////////////////////////////////////////////
  555. // WildcardMatch
  556. //
  557. // We go through extra effort below to avoid doing memory allocation in most cases.
  558. //
  559. EASTDC_API bool WildcardMatch(const char16_t* pString, const char16_t* pPattern, bool bCaseSensitive)
  560. {
  561. if(bCaseSensitive)
  562. return MatchPattern(pString, pPattern);
  563. else
  564. {
  565. // Do efficient string conversion to lower case...
  566. char16_t pStringLBuffer[384];
  567. char16_t* pStringL;
  568. char16_t* pStringLAllocated;
  569. size_t nStringLLength = Strlen(pString);
  570. if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
  571. {
  572. pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char16[]") char16_t[nStringLLength + 1];
  573. pStringL = pStringLAllocated;
  574. }
  575. else
  576. {
  577. pStringLAllocated = NULL;
  578. pStringL = pStringLBuffer;
  579. }
  580. Strcpy(pStringL, pString);
  581. Strlwr(pStringL);
  582. // Do efficient pattern conversion to lower case...
  583. char16_t pPatternLBuffer[32];
  584. char16_t* pPatternL;
  585. char16_t* pPatternLAllocated;
  586. size_t nPatternLLength = Strlen(pPattern);
  587. if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
  588. {
  589. pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char16[]") char16_t[nPatternLLength + 1];
  590. pPatternL = pPatternLAllocated;
  591. }
  592. else
  593. {
  594. pPatternLAllocated = NULL;
  595. pPatternL = pPatternLBuffer;
  596. }
  597. Strcpy(pPatternL, pPattern);
  598. Strlwr(pPatternL);
  599. const bool bResult = MatchPattern(pStringL, pPatternL);
  600. delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
  601. delete[] pPatternLAllocated;
  602. return bResult;
  603. }
  604. }
  605. ///////////////////////////////////////////////////////////////////////////////
  606. // WildcardMatch
  607. //
  608. // We go through extra effort below to avoid doing memory allocation in most cases.
  609. //
  610. EASTDC_API bool WildcardMatch(const char32_t* pString, const char32_t* pPattern, bool bCaseSensitive)
  611. {
  612. if(bCaseSensitive)
  613. return MatchPattern(pString, pPattern);
  614. else
  615. {
  616. // Do efficient string conversion to lower case...
  617. char32_t pStringLBuffer[384];
  618. char32_t* pStringL;
  619. char32_t* pStringLAllocated;
  620. size_t nStringLLength = Strlen(pString);
  621. if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
  622. {
  623. pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char32[]") char32_t[nStringLLength + 1];
  624. pStringL = pStringLAllocated;
  625. }
  626. else
  627. {
  628. pStringLAllocated = NULL;
  629. pStringL = pStringLBuffer;
  630. }
  631. Strcpy(pStringL, pString);
  632. Strlwr(pStringL);
  633. // Do efficient pattern conversion to lower case...
  634. char32_t pPatternLBuffer[32];
  635. char32_t* pPatternL;
  636. char32_t* pPatternLAllocated;
  637. size_t nPatternLLength = Strlen(pPattern);
  638. if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
  639. {
  640. pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char32[]") char32_t[nPatternLLength + 1];
  641. pPatternL = pPatternLAllocated;
  642. }
  643. else
  644. {
  645. pPatternLAllocated = NULL;
  646. pPatternL = pPatternLBuffer;
  647. }
  648. Strcpy(pPatternL, pPattern);
  649. Strlwr(pPatternL);
  650. const bool bResult = MatchPattern(pStringL, pPatternL);
  651. delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
  652. delete[] pPatternLAllocated;
  653. return bResult;
  654. }
  655. }
  656. //////////////////////////////////////////////////////////////////////////
  657. // GetTextLine
  658. //
  659. EASTDC_API const char8_t* GetTextLine(const char8_t* pText, const char8_t* pTextEnd, const char8_t** ppNewText)
  660. {
  661. if(pText < pTextEnd)
  662. {
  663. while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
  664. ++pText;
  665. if(ppNewText)
  666. {
  667. *ppNewText = pText;
  668. if(*ppNewText < pTextEnd)
  669. {
  670. if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
  671. ++*ppNewText;
  672. }
  673. }
  674. }
  675. else if(ppNewText)
  676. *ppNewText = pTextEnd;
  677. return pText;
  678. }
  679. //////////////////////////////////////////////////////////////////////////
  680. // GetTextLine
  681. //
  682. EASTDC_API const char16_t* GetTextLine(const char16_t* pText, const char16_t* pTextEnd, const char16_t** ppNewText)
  683. {
  684. if(pText < pTextEnd)
  685. {
  686. while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
  687. ++pText;
  688. if(ppNewText)
  689. {
  690. *ppNewText = pText;
  691. if(*ppNewText < pTextEnd)
  692. {
  693. if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
  694. ++*ppNewText;
  695. }
  696. }
  697. }
  698. else if(ppNewText)
  699. *ppNewText = pTextEnd;
  700. return pText;
  701. }
  702. //////////////////////////////////////////////////////////////////////////
  703. // GetTextLine
  704. //
  705. EASTDC_API const char32_t* GetTextLine(const char32_t* pText, const char32_t* pTextEnd, const char32_t** ppNewText)
  706. {
  707. if(pText < pTextEnd)
  708. {
  709. while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
  710. ++pText;
  711. if(ppNewText)
  712. {
  713. *ppNewText = pText;
  714. if(*ppNewText < pTextEnd)
  715. {
  716. if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
  717. ++*ppNewText;
  718. }
  719. }
  720. }
  721. else if(ppNewText)
  722. *ppNewText = pTextEnd;
  723. return pText;
  724. }
  725. EASTDC_API bool ParseDelimitedText(const char8_t* pText, const char8_t* pTextEnd, char8_t cDelimiter,
  726. const char8_t*& pToken, const char8_t*& pTokenEnd, const char8_t** ppNewText)
  727. {
  728. int nQuoteLevel = 0;
  729. bool bDelimiterFound = false;
  730. // We remove leading spaces.
  731. for(pToken = pText; pToken < pTextEnd; ++pToken)
  732. {
  733. if((*pToken != ' ') && (*pToken != '\t'))
  734. break;
  735. }
  736. for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
  737. {
  738. const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
  739. if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
  740. bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
  741. else
  742. bDelimiterFound = (*pTokenEnd == cDelimiter);
  743. if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
  744. {
  745. if(!bDelimiterFound)
  746. ++pTokenEnd;
  747. const bool bInQuotes = ((nQuoteLevel & 1) != 0);
  748. if(!bInQuotes || bLastCharacter) // If not within a quoted section...
  749. {
  750. if(ppNewText)
  751. *ppNewText = pTokenEnd;
  752. if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
  753. {
  754. // Eliminate spaces before the trailing delimiter.
  755. while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
  756. pTokenEnd--;
  757. }
  758. if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
  759. {
  760. pToken++;
  761. pTokenEnd--;
  762. }
  763. return true;
  764. }
  765. }
  766. else if(*pTokenEnd == '"')
  767. nQuoteLevel++;
  768. }
  769. if(ppNewText)
  770. *ppNewText = pTokenEnd;
  771. return false;
  772. }
  773. //////////////////////////////////////////////////////////////////////////
  774. // ParseDelimitedText
  775. //
  776. // This function takes a line text that has fields separated by delimiters
  777. // and parses the line into the component fields. It is common to read
  778. // command lines like this or to parse ini file settings like this.
  779. //
  780. EASTDC_API bool ParseDelimitedText(const char16_t* pText, const char16_t* pTextEnd, char16_t cDelimiter,
  781. const char16_t*& pToken, const char16_t*& pTokenEnd, const char16_t** ppNewText)
  782. {
  783. int nQuoteLevel = 0;
  784. bool bDelimiterFound = false;
  785. // We remove leading spaces.
  786. for(pToken = pText; pToken < pTextEnd; ++pToken)
  787. {
  788. if((*pToken != ' ') && (*pToken != '\t'))
  789. break;
  790. }
  791. for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
  792. {
  793. const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
  794. if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
  795. bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
  796. else
  797. bDelimiterFound = (*pTokenEnd == cDelimiter);
  798. if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
  799. {
  800. if(!bDelimiterFound)
  801. ++pTokenEnd;
  802. const bool bInQuotes = ((nQuoteLevel & 1) != 0);
  803. if(!bInQuotes || bLastCharacter) // If not within a quoted section...
  804. {
  805. if(ppNewText)
  806. *ppNewText = pTokenEnd;
  807. if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
  808. {
  809. // Eliminate spaces before the trailing delimiter.
  810. while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
  811. pTokenEnd--;
  812. }
  813. if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
  814. {
  815. pToken++;
  816. pTokenEnd--;
  817. }
  818. return true;
  819. }
  820. }
  821. else if(*pTokenEnd == '"')
  822. nQuoteLevel++;
  823. }
  824. if(ppNewText)
  825. *ppNewText = pTokenEnd;
  826. return false;
  827. }
  828. //////////////////////////////////////////////////////////////////////////
  829. // ParseDelimitedText
  830. //
  831. // This function takes a line text that has fields separated by delimiters
  832. // and parses the line into the component fields. It is common to read
  833. // command lines like this or to parse ini file settings like this.
  834. //
  835. EASTDC_API bool ParseDelimitedText(const char32_t* pText, const char32_t* pTextEnd, char32_t cDelimiter,
  836. const char32_t*& pToken, const char32_t*& pTokenEnd, const char32_t** ppNewText)
  837. {
  838. int nQuoteLevel = 0;
  839. bool bDelimiterFound = false;
  840. // We remove leading spaces.
  841. for(pToken = pText; pToken < pTextEnd; ++pToken)
  842. {
  843. if((*pToken != ' ') && (*pToken != '\t'))
  844. break;
  845. }
  846. for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
  847. {
  848. const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
  849. if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
  850. bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
  851. else
  852. bDelimiterFound = (*pTokenEnd == cDelimiter);
  853. if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
  854. {
  855. if(!bDelimiterFound)
  856. ++pTokenEnd;
  857. const bool bInQuotes = ((nQuoteLevel & 1) != 0);
  858. if(!bInQuotes || bLastCharacter) // If not within a quoted section...
  859. {
  860. if(ppNewText)
  861. *ppNewText = pTokenEnd;
  862. if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
  863. {
  864. // Eliminate spaces before the trailing delimiter.
  865. while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
  866. pTokenEnd--;
  867. }
  868. if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
  869. {
  870. pToken++;
  871. pTokenEnd--;
  872. }
  873. return true;
  874. }
  875. }
  876. else if(*pTokenEnd == '"')
  877. nQuoteLevel++;
  878. }
  879. if(ppNewText)
  880. *ppNewText = pTokenEnd;
  881. return false;
  882. }
  883. ///////////////////////////////////////////////////////////////////////////////
  884. // ConvertBinaryDataToASCIIArray
  885. //
  886. // Since every binary byte converts to exactly 2 ascii bytes, the ASCII
  887. // array must have space for at least twice the amount of bytes
  888. // as 'nBinaryDataLength' + 1.
  889. //
  890. EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char8_t* pASCIIArray)
  891. {
  892. const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
  893. const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
  894. while(pBinaryData < pEnd)
  895. {
  896. *pASCIIArray = (char8_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
  897. if(*pASCIIArray > '9')
  898. *pASCIIArray += 7; // Convert the ':' to 'A', for example.
  899. pASCIIArray++;
  900. *pASCIIArray = (char8_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
  901. if(*pASCIIArray > '9')
  902. *pASCIIArray += 7; // Convert the ':' to 'A', for example.
  903. pASCIIArray++;
  904. pBinaryData++;
  905. }
  906. *pASCIIArray = '\0';
  907. }
  908. EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char16_t* pASCIIArray)
  909. {
  910. const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
  911. const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
  912. while(pBinaryData < pEnd)
  913. {
  914. *pASCIIArray = (char16_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
  915. if(*pASCIIArray > '9')
  916. *pASCIIArray += 7; // Convert the ':' to 'A', for example.
  917. pASCIIArray++;
  918. *pASCIIArray = (char16_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
  919. if(*pASCIIArray > '9')
  920. *pASCIIArray += 7; // Convert the ':' to 'A', for example.
  921. pASCIIArray++;
  922. pBinaryData++;
  923. }
  924. *pASCIIArray = '\0';
  925. }
  926. EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char32_t* pASCIIArray)
  927. {
  928. const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
  929. const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
  930. while(pBinaryData < pEnd)
  931. {
  932. *pASCIIArray = (char32_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
  933. if(*pASCIIArray > '9')
  934. *pASCIIArray += 7; // Convert the ':' to 'A', for example.
  935. pASCIIArray++;
  936. *pASCIIArray = (char32_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
  937. if(*pASCIIArray > '9')
  938. *pASCIIArray += 7; // Convert the ':' to 'A', for example.
  939. pASCIIArray++;
  940. pBinaryData++;
  941. }
  942. *pASCIIArray = '\0';
  943. }
  944. //////////////////////////////////////////////////////////////////////////////
  945. // ConvertASCIIArrayToBinaryData (8 bit version)
  946. //
  947. // We have a boolean return value because it is possible that the ascii data is
  948. // corrupt. We check for this corruption and return false if so, while converting
  949. // all corrupt bytes to valid ones.
  950. //
  951. EASTDC_API bool ConvertASCIIArrayToBinaryData(const char8_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
  952. {
  953. uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
  954. const char8_t* pEnd = pASCIIArray + nASCIIArrayLength;
  955. char8_t cTemp;
  956. bool bReturnValue(true);
  957. while(pASCIIArray < pEnd)
  958. {
  959. *pBinaryData8 = 0;
  960. for(int j = 4; j >= 0; j -= 4)
  961. {
  962. cTemp = *pASCIIArray;
  963. if(cTemp < '0') // Do some bounds checking.
  964. {
  965. cTemp = '0';
  966. bReturnValue = false;
  967. }
  968. else if(cTemp > 'F') // Do some bounds checking.
  969. {
  970. if(cTemp >= 'a' && cTemp <= 'f')
  971. cTemp -= 39; // Convert 'a' to ':'.
  972. else
  973. {
  974. cTemp = '0';
  975. bReturnValue = false;
  976. }
  977. }
  978. else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
  979. {
  980. cTemp = '0';
  981. bReturnValue = false;
  982. }
  983. else if(cTemp >= 'A')
  984. cTemp -= 7;
  985. *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
  986. pASCIIArray++;
  987. }
  988. pBinaryData8++;
  989. }
  990. return bReturnValue;
  991. }
  992. //////////////////////////////////////////////////////////////////////////////
  993. // ConvertASCIIArrayToBinaryData (16 bit version)
  994. //
  995. // We have a boolean return value because it is possible that the ascii data is
  996. // corrupt. We check for this corruption and return false if so, while converting
  997. // all corrupt bytes to valid ones.
  998. //
  999. EASTDC_API bool ConvertASCIIArrayToBinaryData(const char16_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
  1000. {
  1001. uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
  1002. const char16_t* pEnd = pASCIIArray + nASCIIArrayLength;
  1003. char16_t cTemp;
  1004. bool bReturnValue(true);
  1005. while(pASCIIArray < pEnd)
  1006. {
  1007. *pBinaryData8 = 0;
  1008. for(int j = 4; j >= 0; j -= 4)
  1009. {
  1010. cTemp = *pASCIIArray;
  1011. if(cTemp < '0') // Do some bounds checking.
  1012. {
  1013. cTemp = '0';
  1014. bReturnValue = false;
  1015. }
  1016. else if(cTemp > 'F') // Do some bounds checking.
  1017. {
  1018. if(cTemp >= 'a' && cTemp <= 'f')
  1019. cTemp -= 39; // Convert 'a' to ':'.
  1020. else
  1021. {
  1022. cTemp = '0';
  1023. bReturnValue = false;
  1024. }
  1025. }
  1026. else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
  1027. {
  1028. cTemp = '0';
  1029. bReturnValue = false;
  1030. }
  1031. else if(cTemp >= 'A')
  1032. cTemp -= 7;
  1033. *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
  1034. pASCIIArray++;
  1035. }
  1036. pBinaryData8++;
  1037. }
  1038. return bReturnValue;
  1039. }
  1040. //////////////////////////////////////////////////////////////////////////////
  1041. // ConvertASCIIArrayToBinaryData (32 bit version)
  1042. //
  1043. // We have a boolean return value because it is possible that the ascii data is
  1044. // corrupt. We check for this corruption and return false if so, while converting
  1045. // all corrupt bytes to valid ones.
  1046. //
  1047. EASTDC_API bool ConvertASCIIArrayToBinaryData(const char32_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
  1048. {
  1049. uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
  1050. const char32_t* pEnd = pASCIIArray + nASCIIArrayLength;
  1051. char32_t cTemp;
  1052. bool bReturnValue(true);
  1053. while(pASCIIArray < pEnd)
  1054. {
  1055. *pBinaryData8 = 0;
  1056. for(int j = 4; j >= 0; j -= 4)
  1057. {
  1058. cTemp = *pASCIIArray;
  1059. if(cTemp < '0') // Do some bounds checking.
  1060. {
  1061. cTemp = '0';
  1062. bReturnValue = false;
  1063. }
  1064. else if(cTemp > 'F') // Do some bounds checking.
  1065. {
  1066. if(cTemp >= 'a' && cTemp <= 'f')
  1067. cTemp -= 39; // Convert 'a' to ':'.
  1068. else
  1069. {
  1070. cTemp = '0';
  1071. bReturnValue = false;
  1072. }
  1073. }
  1074. else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
  1075. {
  1076. cTemp = '0';
  1077. bReturnValue = false;
  1078. }
  1079. else if(cTemp >= 'A')
  1080. cTemp -= 7;
  1081. *pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
  1082. pASCIIArray++;
  1083. }
  1084. pBinaryData8++;
  1085. }
  1086. return bReturnValue;
  1087. }
  1088. //////////////////////////////////////////////////////////////////////////////
  1089. // SplitTokenDelimited (8 bit version)
  1090. //
  1091. EASTDC_API bool SplitTokenDelimited(const char8_t* pSource, size_t nSourceLength, char8_t cDelimiter,
  1092. char8_t* pToken, size_t nTokenLength, const char8_t** ppNewSource)
  1093. {
  1094. // terminate the token (so it appears empty if we don't find anything)
  1095. if(pToken && nTokenLength)
  1096. *pToken = 0;
  1097. if(pSource && nSourceLength && *pSource)
  1098. {
  1099. // look for the delimiter
  1100. for(size_t i = 0; i < nSourceLength && *pSource; i++)
  1101. {
  1102. const char8_t cTemp(*pSource);
  1103. // update new source pointer if present
  1104. if(ppNewSource)
  1105. (*ppNewSource)++;
  1106. if(cTemp == cDelimiter) // If there is a delimiter match...
  1107. break; // We are done.
  1108. else
  1109. {
  1110. // keep moving characters into the token until we find the delimiter or reached the end of the token string
  1111. if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
  1112. {
  1113. *pToken = cTemp; // add the character
  1114. pToken++; // increment the token pointer
  1115. *pToken = 0; // insert terminating null character
  1116. }
  1117. pSource++; // increment source pointer
  1118. }
  1119. }
  1120. return true;
  1121. }
  1122. return false;
  1123. }
  1124. //////////////////////////////////////////////////////////////////////////////
  1125. // SplitTokenDelimited (16 bit version)
  1126. //
  1127. // Implemented by Blazej Stompel and Paul Pedriana
  1128. //
  1129. EASTDC_API bool SplitTokenDelimited(const char16_t* pSource, size_t nSourceLength, char16_t cDelimiter,
  1130. char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource)
  1131. {
  1132. // terminate the token (so it appears empty if we don't find anything)
  1133. if(pToken && nTokenLength)
  1134. *pToken = 0;
  1135. if(pSource && nSourceLength && *pSource)
  1136. {
  1137. // look for the delimiter
  1138. for(size_t i = 0; i < nSourceLength && *pSource; i++)
  1139. {
  1140. const char16_t cTemp(*pSource);
  1141. // update new source pointer if present
  1142. if(ppNewSource)
  1143. (*ppNewSource)++;
  1144. if(cTemp == cDelimiter) // If there is a delimiter match...
  1145. break; // We are done.
  1146. else
  1147. {
  1148. // keep moving characters into the token until we find the delimiter or reached the end of the token string
  1149. if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
  1150. {
  1151. *pToken = cTemp; // add the character
  1152. pToken++; // increment the token pointer
  1153. *pToken = 0; // insert terminating null character
  1154. }
  1155. pSource++; // increment source pointer
  1156. }
  1157. }
  1158. return true;
  1159. }
  1160. return false;
  1161. }
  1162. //////////////////////////////////////////////////////////////////////////////
  1163. // SplitTokenDelimited (32 bit version)
  1164. //
  1165. // Implemented by Blazej Stompel and Paul Pedriana
  1166. //
  1167. EASTDC_API bool SplitTokenDelimited(const char32_t* pSource, size_t nSourceLength, char32_t cDelimiter,
  1168. char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource)
  1169. {
  1170. // terminate the token (so it appears empty if we don't find anything)
  1171. if(pToken && nTokenLength)
  1172. *pToken = 0;
  1173. if(pSource && nSourceLength && *pSource)
  1174. {
  1175. // look for the delimiter
  1176. for(size_t i = 0; i < nSourceLength && *pSource; i++)
  1177. {
  1178. const char32_t cTemp(*pSource);
  1179. // update new source pointer if present
  1180. if(ppNewSource)
  1181. (*ppNewSource)++;
  1182. if(cTemp == cDelimiter) // If there is a delimiter match...
  1183. break; // We are done.
  1184. else
  1185. {
  1186. // keep moving characters into the token until we find the delimiter or reached the end of the token string
  1187. if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
  1188. {
  1189. *pToken = cTemp; // add the character
  1190. pToken++; // increment the token pointer
  1191. *pToken = 0; // insert terminating null character
  1192. }
  1193. pSource++; // increment source pointer
  1194. }
  1195. }
  1196. return true;
  1197. }
  1198. return false;
  1199. }
  1200. //////////////////////////////////////////////////////////////////////////////
  1201. // SplitTokenSeparated (8 bit version)
  1202. //
  1203. EASTDC_API bool SplitTokenSeparated(const char8_t* pSource, size_t nSourceLength, char8_t c,
  1204. char8_t* pToken, size_t nTokenLength, const char8_t** ppNewSource)
  1205. {
  1206. // terminate the token (so it appears empty if we don't find anything)
  1207. if(pToken && nTokenLength)
  1208. *pToken = '\0';
  1209. if(pSource)
  1210. {
  1211. // keep track of how many characters we have written to the token buffer
  1212. size_t nTokenIndex = 0;
  1213. // keep track whether we found the token and if we are done reading it
  1214. bool bFoundToken = false;
  1215. bool bReadToken = false;
  1216. // look for the separators
  1217. for(size_t i = 0; i < nSourceLength; i++)
  1218. {
  1219. // get the character
  1220. const char8_t cTemp(*pSource);
  1221. // quit if we found the terminating null character
  1222. if(cTemp != '\0')
  1223. {
  1224. // is the character not a separator ?
  1225. if(cTemp != c)
  1226. {
  1227. // we have a token
  1228. bFoundToken = true;
  1229. // were we done reading the token ?
  1230. if(bReadToken)
  1231. return true;
  1232. else
  1233. {
  1234. // add the character to the token
  1235. if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
  1236. {
  1237. // add the character
  1238. *pToken = cTemp;
  1239. // increment the token pointer
  1240. pToken++;
  1241. // and index
  1242. nTokenIndex++;
  1243. // insert terminating null character
  1244. *pToken = '\0';
  1245. }
  1246. }
  1247. }
  1248. else
  1249. {
  1250. // the character is a separator - if we found our token then we are done reading it
  1251. if(bFoundToken)
  1252. bReadToken = true;
  1253. }
  1254. // update new source pointer if present
  1255. if(ppNewSource)
  1256. (*ppNewSource)++;
  1257. // increment source pointer
  1258. pSource++;
  1259. }
  1260. else
  1261. {
  1262. // we have reached the end of the string
  1263. break;
  1264. }
  1265. }
  1266. return bFoundToken;
  1267. }
  1268. return false;
  1269. }
  1270. //////////////////////////////////////////////////////////////////////////////
  1271. // SplitTokenSeparated (16 bit version)
  1272. //
  1273. // Implemented by Blazej Stompel
  1274. //
  1275. // Unit test can be found in Foundation\Test\UnitTests
  1276. //
  1277. EASTDC_API bool SplitTokenSeparated(const char16_t* pSource, size_t nSourceLength, char16_t c,
  1278. char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource)
  1279. {
  1280. // terminate the token (so it appears empty if we don't find anything)
  1281. if(pToken && nTokenLength)
  1282. *pToken = '\0';
  1283. if(pSource)
  1284. {
  1285. // keep track of how many characters we have written to the token buffer
  1286. size_t nTokenIndex = 0;
  1287. // keep track whether we found the token and if we are done reading it
  1288. bool bFoundToken = false;
  1289. bool bReadToken = false;
  1290. // look for the separators
  1291. for(size_t i = 0; i < nSourceLength; i++)
  1292. {
  1293. // get the character
  1294. const char16_t cTemp(*pSource);
  1295. // quit if we found the terminating null character
  1296. if(cTemp != '\0')
  1297. {
  1298. // is the character not a separator ?
  1299. if(cTemp != c)
  1300. {
  1301. // we have a token
  1302. bFoundToken = true;
  1303. // were we done reading the token ?
  1304. if(bReadToken)
  1305. return true;
  1306. else
  1307. {
  1308. // add the character to the token
  1309. if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
  1310. {
  1311. // add the character
  1312. *pToken = cTemp;
  1313. // increment the token pointer
  1314. pToken++;
  1315. // and index
  1316. nTokenIndex++;
  1317. // insert terminating null character
  1318. *pToken = '\0';
  1319. }
  1320. }
  1321. }
  1322. else
  1323. {
  1324. // the character is a separator - if we found our token then we are done reading it
  1325. if(bFoundToken)
  1326. bReadToken = true;
  1327. }
  1328. // update new source pointer if present
  1329. if(ppNewSource)
  1330. (*ppNewSource)++;
  1331. // increment source pointer
  1332. pSource++;
  1333. }
  1334. else
  1335. {
  1336. // we have reached the end of the string
  1337. break;
  1338. }
  1339. }
  1340. return bFoundToken;
  1341. }
  1342. return false;
  1343. }
  1344. //////////////////////////////////////////////////////////////////////////////
  1345. // SplitTokenSeparated (32 bit version)
  1346. //
  1347. // Implemented by Blazej Stompel
  1348. //
  1349. // Unit test can be found in Foundation\Test\UnitTests
  1350. //
  1351. EASTDC_API bool SplitTokenSeparated(const char32_t* pSource, size_t nSourceLength, char32_t c,
  1352. char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource)
  1353. {
  1354. // terminate the token (so it appears empty if we don't find anything)
  1355. if(pToken && nTokenLength)
  1356. *pToken = '\0';
  1357. if(pSource)
  1358. {
  1359. // keep track of how many characters we have written to the token buffer
  1360. size_t nTokenIndex = 0;
  1361. // keep track whether we found the token and if we are done reading it
  1362. bool bFoundToken = false;
  1363. bool bReadToken = false;
  1364. // look for the separators
  1365. for(size_t i = 0; i < nSourceLength; i++)
  1366. {
  1367. // get the character
  1368. const char32_t cTemp(*pSource);
  1369. // quit if we found the terminating null character
  1370. if(cTemp != '\0')
  1371. {
  1372. // is the character not a separator ?
  1373. if(cTemp != c)
  1374. {
  1375. // we have a token
  1376. bFoundToken = true;
  1377. // were we done reading the token ?
  1378. if(bReadToken)
  1379. return true;
  1380. else
  1381. {
  1382. // add the character to the token
  1383. if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
  1384. {
  1385. // add the character
  1386. *pToken = cTemp;
  1387. // increment the token pointer
  1388. pToken++;
  1389. // and index
  1390. nTokenIndex++;
  1391. // insert terminating null character
  1392. *pToken = '\0';
  1393. }
  1394. }
  1395. }
  1396. else
  1397. {
  1398. // the character is a separator - if we found our token then we are done reading it
  1399. if(bFoundToken)
  1400. bReadToken = true;
  1401. }
  1402. // update new source pointer if present
  1403. if(ppNewSource)
  1404. (*ppNewSource)++;
  1405. // increment source pointer
  1406. pSource++;
  1407. }
  1408. else
  1409. {
  1410. // we have reached the end of the string
  1411. break;
  1412. }
  1413. }
  1414. return bFoundToken;
  1415. }
  1416. return false;
  1417. }
  1418. ///////////////////////////////////////////////////////////////////////////////
  1419. // Boyer-Moore string search
  1420. //
  1421. // This is the "turbo" implementation defined at http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140.
  1422. // Boyer-Moore is a very fast string search compared to most others, including
  1423. // those in the STL. However, you need to be searching a string of at least 100
  1424. // chars and have a search pattern of at least 3 characters for the speed to show,
  1425. // as Boyer-Moore has a startup precalculation that costs some cycles.
  1426. // This startup precalculation is proportional to the size of your search pattern
  1427. // and the size of the alphabet in use. Thus, doing Boyer-Moore searches on the
  1428. // entire Unicode alphabet is going to incur a fairly expensive precalculation cost.
  1429. //
  1430. // This is a private function used by BoyerMooreSearch.
  1431. //
  1432. static void BoyerMooreBadCharacterCalc(const char* pPattern, int nPatternLength,
  1433. int* pAlphabetBuffer, int nAlphabetBufferSize)
  1434. {
  1435. int i;
  1436. for(i = 0; i < nAlphabetBufferSize; ++i)
  1437. pAlphabetBuffer[i] = nPatternLength;
  1438. for(i = 0; i < (nPatternLength - 1); ++i)
  1439. pAlphabetBuffer[(int)pPattern[i]] = (nPatternLength - i) - 1;
  1440. }
  1441. // This is a private function used by BoyerMooreSearch.
  1442. //
  1443. static void BoyerMooreGoodSuffixCalc(const char* pPattern, int nPatternLength,
  1444. int* pPatternBuffer1, int* pPatternBuffer2)
  1445. {
  1446. int i;
  1447. int j = 0;
  1448. int f = 0;
  1449. int g = nPatternLength - 1;
  1450. pPatternBuffer2[nPatternLength - 1] = nPatternLength;
  1451. for(i = nPatternLength - 2; i >= 0; --i)
  1452. {
  1453. if((i > g) && pPatternBuffer2[((i + nPatternLength) - 1) - f] < (i - g))
  1454. pPatternBuffer2[i] = pPatternBuffer2[((i + nPatternLength) - 1) - f];
  1455. else
  1456. {
  1457. if(i < g)
  1458. g = i;
  1459. f = i;
  1460. while((g >= 0) && (pPattern[g] == pPattern[((g + nPatternLength) - 1) - f]))
  1461. --g;
  1462. pPatternBuffer2[i] = f - g;
  1463. }
  1464. }
  1465. for(i = 0; i < nPatternLength; ++i)
  1466. pPatternBuffer1[i] = nPatternLength;
  1467. for(i = nPatternLength - 1; i >= -1; --i)
  1468. {
  1469. if((i == -1) || (pPatternBuffer2[i] == (i + 1)))
  1470. {
  1471. for(; j < (nPatternLength - 1) - i; ++j)
  1472. {
  1473. if(pPatternBuffer1[j] == nPatternLength)
  1474. pPatternBuffer1[j] = (nPatternLength - 1) - i;
  1475. }
  1476. }
  1477. }
  1478. for(i = 0; i <= nPatternLength - 2; ++i)
  1479. pPatternBuffer1[(nPatternLength - 1) - pPatternBuffer2[i]] = (nPatternLength - 1) - i;
  1480. }
  1481. // Argument specification.
  1482. //
  1483. // patternBuffer1 is a user-supplied buffer and must be at least as long as the search pattern.
  1484. // patternBuffer2 is a user-supplied buffer and must be at least as long as the search pattern.
  1485. // alphabetBuffer is a user-supplied buffer and must be at least as long as the highest character value used in the searched string and search pattern.
  1486. //
  1487. EASTDC_API int BoyerMooreSearch(const char* pPattern, int nPatternLength, const char* pSearchString, int nSearchStringLength,
  1488. int* pPatternBuffer1, int* pPatternBuffer2, int* pAlphabetBuffer, int nAlphabetBufferSize)
  1489. {
  1490. // Do precalculations
  1491. BoyerMooreGoodSuffixCalc(pPattern, nPatternLength, pPatternBuffer1, pPatternBuffer2);
  1492. BoyerMooreBadCharacterCalc(pPattern, nPatternLength, pAlphabetBuffer, nAlphabetBufferSize);
  1493. // Do search
  1494. for(int j = 0, shift = nPatternLength, u = 0; j <= (nSearchStringLength - nPatternLength); j += shift)
  1495. {
  1496. int i = nPatternLength - 1;
  1497. while((i >= 0) && (pPattern[i] == pSearchString[i + j]))
  1498. {
  1499. --i;
  1500. if((u != 0) && (i == (nPatternLength - 1) - shift))
  1501. i -= u;
  1502. }
  1503. if(i < 0)
  1504. {
  1505. return j;
  1506. // Only used if we were iterating multiple found items:
  1507. //shift = pPatternBuffer1[0];
  1508. //u = nPatternLength - shift;
  1509. }
  1510. else
  1511. {
  1512. const int v = nPatternLength - 1 - i;
  1513. const int turboShift = u - v;
  1514. const int bcShift = pAlphabetBuffer[(int)pSearchString[i + j]] - nPatternLength + 1 + i;
  1515. shift = EATEXTUTIL_MAX(turboShift, bcShift);
  1516. shift = EATEXTUTIL_MAX(shift, pPatternBuffer1[i]);
  1517. if(shift == pPatternBuffer1[i])
  1518. u = EATEXTUTIL_MIN(nPatternLength - shift, v);
  1519. else
  1520. {
  1521. if(turboShift < bcShift)
  1522. shift = EATEXTUTIL_MAX(shift, u + 1);
  1523. u = 0;
  1524. }
  1525. }
  1526. }
  1527. return nPatternLength;
  1528. }
  1529. #undef EATEXTUTIL_MIN
  1530. #undef EATEXTUTIL_MAX
  1531. } // namespace StdC
  1532. } // namespace EA