StringUtilities.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. /*
  2. * This source file is part of RmlUi, the HTML/CSS Interface Middleware
  3. *
  4. * For the latest information, see http://github.com/mikke89/RmlUi
  5. *
  6. * Copyright (c) 2008-2010 CodePoint Ltd, Shift Technology Ltd
  7. * Copyright (c) 2019 The RmlUi Team, and contributors
  8. *
  9. * Permission is hereby granted, free of charge, to any person obtaining a copy
  10. * of this software and associated documentation files (the "Software"), to deal
  11. * in the Software without restriction, including without limitation the rights
  12. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13. * copies of the Software, and to permit persons to whom the Software is
  14. * furnished to do so, subject to the following conditions:
  15. *
  16. * The above copyright notice and this permission notice shall be included in
  17. * all copies or substantial portions of the Software.
  18. *
  19. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  25. * THE SOFTWARE.
  26. *
  27. */
  28. #include "precompiled.h"
  29. #include "../../Include/RmlUi/Core/StringUtilities.h"
  30. #include <ctype.h>
  31. #include <stdio.h>
  32. namespace Rml {
  33. namespace Core {
  34. // Expands character-delimited list of values in a single string to a whitespace-trimmed list of values.
  35. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter)
  36. {
  37. char quote = 0;
  38. bool last_char_delimiter = true;
  39. const char* ptr = string.CString();
  40. const char* start_ptr = NULL;
  41. const char* end_ptr = ptr;
  42. while (*ptr)
  43. {
  44. // Switch into quote mode if the last char was a delimeter ( excluding whitespace )
  45. // and we're not already in quote mode
  46. if (last_char_delimiter && !quote && (*ptr == '"' || *ptr == '\''))
  47. {
  48. quote = *ptr;
  49. }
  50. // Switch out of quote mode if we encounter a quote that hasn't been escaped
  51. else if (*ptr == quote && *(ptr-1) != '\\')
  52. {
  53. quote = 0;
  54. }
  55. // If we encouter a delimiter while not in quote mode, add the item to the list
  56. else if (*ptr == delimiter && !quote)
  57. {
  58. if (start_ptr)
  59. string_list.push_back(String(start_ptr, end_ptr + 1));
  60. else
  61. string_list.push_back("");
  62. last_char_delimiter = true;
  63. start_ptr = NULL;
  64. }
  65. // Otherwise if its not white space or we're in quote mode, advance the pointers
  66. else if (!isspace(*ptr) || quote)
  67. {
  68. if (!start_ptr)
  69. start_ptr = ptr;
  70. end_ptr = ptr;
  71. last_char_delimiter = false;
  72. }
  73. ptr++;
  74. }
  75. // If there's data pending, add it.
  76. if (start_ptr)
  77. string_list.push_back(String(start_ptr, end_ptr + 1));
  78. }
  79. // Joins a list of string values into a single string separated by a character delimiter.
  80. void StringUtilities::JoinString(String& string, const StringList& string_list, const char delimiter)
  81. {
  82. for (size_t i = 0; i < string_list.size(); i++)
  83. {
  84. string += string_list[i];
  85. if (delimiter != '\0' && i < string_list.size() - 1)
  86. string.Append(delimiter);
  87. }
  88. }
  89. // Hashes a string of data to an integer value using the FNV algorithm.
  90. Hash StringUtilities::FNVHash(const char *string, int length)
  91. {
  92. // FNV-1 hash algorithm
  93. Hash hval = 0;
  94. unsigned char* bp = (unsigned char *)string; // start of buffer
  95. unsigned char* be = (unsigned char *)string + length;
  96. // FNV-1 hash each octet in the buffer
  97. while (*bp || (length >= 0 && bp < be))
  98. {
  99. // xor the bottom with the current octet
  100. hval ^= *bp++;
  101. /* multiply by the 32 bit FNV magic prime mod 2^32 */
  102. #if !defined(__GNUC__)
  103. const unsigned int FNV_32_PRIME = ((unsigned int)16777619);
  104. hval *= FNV_32_PRIME;
  105. #else
  106. hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
  107. #endif
  108. }
  109. return hval;
  110. }
// Defines, helper functions for the UTF8 / UCS2 conversion functions.
// The constants below are the fixed high-bit patterns of UTF-8 octets. The conversion functions in this
// file compare masked input octets against them and OR them into output octets.

// 10xxxxxx: marker of a continuation (second or later) octet.
#define _NXT 0x80
// 110xxxxx: lead octet of a 2-octet sequence.
#define _SEQ2 0xc0
// 1110xxxx: lead octet of a 3-octet sequence.
#define _SEQ3 0xe0
// 11110xxx: lead octet of a 4-octet sequence.
#define _SEQ4 0xf0
// 111110xx: lead octet of a 5-octet sequence.
#define _SEQ5 0xf8
// 1111110x: lead octet of a 6-octet sequence.
#define _SEQ6 0xfc
// Byte order mark code unit; UCS2toUTF8 skips input words equal to this value.
#define _BOM 0xfeff
  119. static int __wchar_forbidden(unsigned int sym)
  120. {
  121. // Surrogate pairs
  122. if (sym >= 0xd800 && sym <= 0xdfff)
  123. return -1;
  124. return 0;
  125. }
  126. static int __utf8_forbidden(unsigned char octet)
  127. {
  128. switch (octet)
  129. {
  130. case 0xc0:
  131. case 0xc1:
  132. case 0xf5:
  133. case 0xff:
  134. return -1;
  135. default:
  136. return 0;
  137. }
  138. }
  139. // Converts a character array in UTF-8 encoding to a vector of words.
  140. bool StringUtilities::UTF8toUCS2(const String& input, std::vector< word >& output)
  141. {
  142. if (input.Empty())
  143. return true;
  144. unsigned char* p = (unsigned char*) input.CString();
  145. unsigned char* lim = p + input.Length();
  146. // Skip the UTF-8 byte order marker if it exists.
  147. if (input.Substring(0, 3) == "\xEF\xBB\xBF")
  148. p += 3;
  149. int num_bytes;
  150. for (; p < lim; p += num_bytes)
  151. {
  152. if (__utf8_forbidden(*p) != 0)
  153. return false;
  154. // Get number of bytes for one wide character.
  155. word high;
  156. num_bytes = 1;
  157. if ((*p & 0x80) == 0)
  158. {
  159. high = (wchar_t)*p;
  160. }
  161. else if ((*p & 0xe0) == _SEQ2)
  162. {
  163. num_bytes = 2;
  164. high = (wchar_t)(*p & 0x1f);
  165. }
  166. else if ((*p & 0xf0) == _SEQ3)
  167. {
  168. num_bytes = 3;
  169. high = (wchar_t)(*p & 0x0f);
  170. }
  171. else if ((*p & 0xf8) == _SEQ4)
  172. {
  173. num_bytes = 4;
  174. high = (wchar_t)(*p & 0x07);
  175. }
  176. else if ((*p & 0xfc) == _SEQ5)
  177. {
  178. num_bytes = 5;
  179. high = (wchar_t)(*p & 0x03);
  180. }
  181. else if ((*p & 0xfe) == _SEQ6)
  182. {
  183. num_bytes = 6;
  184. high = (wchar_t)(*p & 0x01);
  185. }
  186. else
  187. {
  188. return false;
  189. }
  190. // Does the sequence header tell us the truth about length?
  191. if (lim - p <= num_bytes - 1)
  192. {
  193. return false;
  194. }
  195. // Validate the sequence. All symbols must have higher bits set to 10xxxxxx.
  196. if (num_bytes > 1)
  197. {
  198. int i;
  199. for (i = 1; i < num_bytes; i++)
  200. {
  201. if ((p[i] & 0xc0) != _NXT)
  202. break;
  203. }
  204. if (i != num_bytes)
  205. {
  206. return false;
  207. }
  208. }
  209. // Make up a single UCS-4 (32-bit) character from the required number of UTF-8 tokens. The first byte has
  210. // been determined earlier, the second and subsequent bytes contribute the first six of their bits into the
  211. // final character code.
  212. unsigned int ucs4_char = 0;
  213. int num_bits = 0;
  214. for (int i = 1; i < num_bytes; i++)
  215. {
  216. ucs4_char |= (word)(p[num_bytes - i] & 0x3f) << num_bits;
  217. num_bits += 6;
  218. }
  219. ucs4_char |= high << num_bits;
  220. // Check for surrogate pairs.
  221. if (__wchar_forbidden(ucs4_char) != 0)
  222. {
  223. return false;
  224. }
  225. // Only add the character to the output if it exists in the Basic Multilingual Plane (ie, fits in a single
  226. // word).
  227. if (ucs4_char <= 0xffff)
  228. output.push_back((word) ucs4_char);
  229. }
  230. output.push_back(0);
  231. return true;
  232. }
  233. // Converts a vector of words in UCS-2 encoding a character array in UTF-8 encoding.
  234. bool StringUtilities::UCS2toUTF8(const std::vector< word >& input, String& output)
  235. {
  236. return UCS2toUTF8(&input[0], input.size(), output);
  237. }
  238. // Converts an array of words in UCS-2 encoding into a character array in UTF-8 encoding.
  239. bool StringUtilities::UCS2toUTF8(const word* input, size_t input_size, String& output)
  240. {
  241. unsigned char *oc;
  242. size_t n;
  243. word* w = (word*) input;
  244. word* wlim = w + input_size;
  245. //Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
  246. for (; w < wlim; w++)
  247. {
  248. if (__wchar_forbidden(*w) != 0)
  249. return false;
  250. if (*w == _BOM)
  251. continue;
  252. //if (*w < 0)
  253. // return false;
  254. if (*w <= 0x007f)
  255. n = 1;
  256. else if (*w <= 0x07ff)
  257. n = 2;
  258. else //if (*w <= 0x0000ffff)
  259. n = 3;
  260. /*else if (*w <= 0x001fffff)
  261. n = 4;
  262. else if (*w <= 0x03ffffff)
  263. n = 5;
  264. else // if (*w <= 0x7fffffff)
  265. n = 6;*/
  266. // Convert to little endian.
  267. word ch = (*w >> 8) & 0x00FF;
  268. ch |= (*w << 8) & 0xFF00;
  269. // word ch = EMPConvertEndian(*w, RMLUI_ENDIAN_BIG);
  270. oc = (unsigned char *)&ch;
  271. switch (n)
  272. {
  273. case 1:
  274. output += oc[1];
  275. break;
  276. case 2:
  277. output += (_SEQ2 | (oc[1] >> 6) | ((oc[0] & 0x07) << 2));
  278. output += (_NXT | (oc[1] & 0x3f));
  279. break;
  280. case 3:
  281. output += (_SEQ3 | ((oc[0] & 0xf0) >> 4));
  282. output += (_NXT | (oc[1] >> 6) | ((oc[0] & 0x0f) << 2));
  283. output += (_NXT | (oc[1] & 0x3f));
  284. break;
  285. case 4:
  286. break;
  287. case 5:
  288. break;
  289. case 6:
  290. break;
  291. }
  292. //Log::Message(LC_CORE, Log::LT_ALWAYS, "Converting...%c(%d) %d -> %d", *w, *w, w - input, output.Length());
  293. }
  294. return true;
  295. }
  296. // Strip whitespace characters from the beginning and end of a string.
  297. String StringUtilities::StripWhitespace(const String& string)
  298. {
  299. const char* start = string.CString();
  300. const char* end = start + string.Length();
  301. while (start < end && IsWhitespace(*start))
  302. start++;
  303. while (end > start && IsWhitespace(*(end - 1)))
  304. end--;
  305. if (start < end)
  306. return String(start, end);
  307. return String();
  308. }
  309. // Operators for STL containers using strings.
  310. bool StringUtilities::StringComparei::operator()(const String& lhs, const String& rhs) const
  311. {
  312. return strcasecmp(lhs.CString(), rhs.CString()) < 0;
  313. }
  314. }
  315. }