StringUtilities.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. /*
  2. * This source file is part of libRocket, the HTML/CSS Interface Middleware
  3. *
  4. * For the latest information, see http://www.librocket.com
  5. *
  6. * Copyright (c) 2008-2010 CodePoint Ltd, Shift Technology Ltd
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. *
  26. */
  27. #include "precompiled.h"
  28. #include "../../Include/Rocket/Core/StringUtilities.h"
  29. #include <ctype.h>
  30. #include <stdio.h>
  31. namespace Rocket {
  32. namespace Core {
  33. // Expands character-delimited list of values in a single string to a whitespace-trimmed list of values.
  34. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter)
  35. {
  36. char quote = 0;
  37. bool last_char_delimiter = true;
  38. const char* ptr = string.c_str();
  39. const char* start_ptr = NULL;
  40. const char* end_ptr = ptr;
  41. size_t num_delimiter_values = std::count(string.begin(), string.end(), delimiter);
  42. if (num_delimiter_values == 0)
  43. {
  44. string_list.push_back(StripWhitespace(string));
  45. return;
  46. }
  47. string_list.reserve(num_delimiter_values + 1);
  48. while (*ptr)
  49. {
  50. // Switch into quote mode if the last char was a delimeter ( excluding whitespace )
  51. // and we're not already in quote mode
  52. if (last_char_delimiter && !quote && (*ptr == '"' || *ptr == '\''))
  53. {
  54. quote = *ptr;
  55. }
  56. // Switch out of quote mode if we encounter a quote that hasn't been escaped
  57. else if (*ptr == quote && *(ptr-1) != '\\')
  58. {
  59. quote = 0;
  60. }
  61. // If we encouter a delimiter while not in quote mode, add the item to the list
  62. else if (*ptr == delimiter && !quote)
  63. {
  64. if (start_ptr)
  65. string_list.emplace_back(start_ptr, end_ptr + 1);
  66. else
  67. string_list.emplace_back();
  68. last_char_delimiter = true;
  69. start_ptr = NULL;
  70. }
  71. // Otherwise if its not white space or we're in quote mode, advance the pointers
  72. else if (!isspace(*ptr) || quote)
  73. {
  74. if (!start_ptr)
  75. start_ptr = ptr;
  76. end_ptr = ptr;
  77. last_char_delimiter = false;
  78. }
  79. ptr++;
  80. }
  81. // If there's data pending, add it.
  82. if (start_ptr)
  83. string_list.emplace_back(start_ptr, end_ptr + 1);
  84. }
  85. // Joins a list of string values into a single string separated by a character delimiter.
  86. void StringUtilities::JoinString(String& string, const StringList& string_list, const char delimiter)
  87. {
  88. for (size_t i = 0; i < string_list.size(); i++)
  89. {
  90. string += string_list[i];
  91. if (delimiter != '\0' && i < string_list.size() - 1)
  92. string += delimiter;
  93. }
  94. }
  95. // Hashes a string of data to an integer value using the FNV algorithm.
  96. Hash StringUtilities::FNVHash(const char *string, int length)
  97. {
  98. // FNV-1 hash algorithm
  99. Hash hval = 0;
  100. unsigned char* bp = (unsigned char *)string; // start of buffer
  101. unsigned char* be = (unsigned char *)string + length;
  102. // FNV-1 hash each octet in the buffer
  103. while (*bp || (length >= 0 && bp < be))
  104. {
  105. // xor the bottom with the current octet
  106. hval ^= *bp++;
  107. /* multiply by the 32 bit FNV magic prime mod 2^32 */
  108. #if !defined(__GNUC__)
  109. const unsigned int FNV_32_PRIME = ((unsigned int)16777619);
  110. hval *= FNV_32_PRIME;
  111. #else
  112. hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
  113. #endif
  114. }
  115. return hval;
  116. }
  // Defines and helper functions for the UTF-8 / UCS-2 conversion functions.

  // Bit pattern of a UTF-8 continuation byte (10xxxxxx).
  #define _NXT 0x80
  // Lead-byte bit patterns announcing a sequence of 2, 3, 4, 5 or 6 bytes respectively.
  #define _SEQ2 0xc0
  #define _SEQ3 0xe0
  #define _SEQ4 0xf0
  #define _SEQ5 0xf8
  #define _SEQ6 0xfc
  // Byte order mark code point; skipped when converting UCS-2 to UTF-8.
  #define _BOM 0xfeff
  // Reports whether a character code may not appear as a stand-alone character.
  // Returns -1 for codes in the surrogate range 0xD800-0xDFFF (inclusive), 0 for everything else.
  static int __wchar_forbidden(unsigned int sym)
  {
      const bool outside_surrogate_range = (sym < 0xd800) || (sym > 0xdfff);
      return outside_surrogate_range ? 0 : -1;
  }
  // Reports whether an octet is one of the values this converter refuses to accept in
  // UTF-8 input. Returns -1 for 0xC0, 0xC1, 0xF5 and 0xFF, and 0 for every other octet.
  static int __utf8_forbidden(unsigned char octet)
  {
      const bool rejected = (octet == 0xc0) || (octet == 0xc1) || (octet == 0xf5) || (octet == 0xff);
      return rejected ? -1 : 0;
  }
  145. // Converts a character array in UTF-8 encoding to a vector of words.
  146. bool StringUtilities::UTF8toUCS2(const String& input, WString& output)
  147. {
  148. if (input.empty())
  149. return true;
  150. output.reserve(input.size());
  151. unsigned char* p = (unsigned char*) input.c_str();
  152. unsigned char* lim = p + input.size();
  153. // Skip the UTF-8 byte order marker if it exists.
  154. if (input.substr(0, 3) == "\xEF\xBB\xBF")
  155. p += 3;
  156. int num_bytes;
  157. for (; p < lim; p += num_bytes)
  158. {
  159. if (__utf8_forbidden(*p) != 0)
  160. return false;
  161. // Get number of bytes for one wide character.
  162. word high;
  163. num_bytes = 1;
  164. if ((*p & 0x80) == 0)
  165. {
  166. high = (wchar_t)*p;
  167. }
  168. else if ((*p & 0xe0) == _SEQ2)
  169. {
  170. num_bytes = 2;
  171. high = (wchar_t)(*p & 0x1f);
  172. }
  173. else if ((*p & 0xf0) == _SEQ3)
  174. {
  175. num_bytes = 3;
  176. high = (wchar_t)(*p & 0x0f);
  177. }
  178. else if ((*p & 0xf8) == _SEQ4)
  179. {
  180. num_bytes = 4;
  181. high = (wchar_t)(*p & 0x07);
  182. }
  183. else if ((*p & 0xfc) == _SEQ5)
  184. {
  185. num_bytes = 5;
  186. high = (wchar_t)(*p & 0x03);
  187. }
  188. else if ((*p & 0xfe) == _SEQ6)
  189. {
  190. num_bytes = 6;
  191. high = (wchar_t)(*p & 0x01);
  192. }
  193. else
  194. {
  195. return false;
  196. }
  197. // Does the sequence header tell us the truth about length?
  198. if (lim - p <= num_bytes - 1)
  199. {
  200. return false;
  201. }
  202. // Validate the sequence. All symbols must have higher bits set to 10xxxxxx.
  203. if (num_bytes > 1)
  204. {
  205. int i;
  206. for (i = 1; i < num_bytes; i++)
  207. {
  208. if ((p[i] & 0xc0) != _NXT)
  209. break;
  210. }
  211. if (i != num_bytes)
  212. {
  213. return false;
  214. }
  215. }
  216. // Make up a single UCS-4 (32-bit) character from the required number of UTF-8 tokens. The first byte has
  217. // been determined earlier, the second and subsequent bytes contribute the first six of their bits into the
  218. // final character code.
  219. unsigned int ucs4_char = 0;
  220. int num_bits = 0;
  221. for (int i = 1; i < num_bytes; i++)
  222. {
  223. ucs4_char |= (word)(p[num_bytes - i] & 0x3f) << num_bits;
  224. num_bits += 6;
  225. }
  226. ucs4_char |= high << num_bits;
  227. // Check for surrogate pairs.
  228. if (__wchar_forbidden(ucs4_char) != 0)
  229. {
  230. return false;
  231. }
  232. // Only add the character to the output if it exists in the Basic Multilingual Plane (ie, fits in a single
  233. // word).
  234. if (ucs4_char <= 0xffff)
  235. output.push_back((word) ucs4_char);
  236. }
  237. return true;
  238. }
  239. // Converts an array of words in UCS-2 encoding into a character array in UTF-8 encoding.
  240. bool StringUtilities::UCS2toUTF8(const WString& input, String& output)
  241. {
  242. unsigned char *oc;
  243. size_t n;
  244. output.reserve(input.size());
  245. const word* w = input.data();
  246. const word* wlim = w + input.size();
  247. //Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
  248. for (; w < wlim; w++)
  249. {
  250. if (__wchar_forbidden(*w) != 0)
  251. return false;
  252. if (*w == _BOM)
  253. continue;
  254. //if (*w < 0)
  255. // return false;
  256. if (*w <= 0x007f)
  257. n = 1;
  258. else if (*w <= 0x07ff)
  259. n = 2;
  260. else //if (*w <= 0x0000ffff)
  261. n = 3;
  262. /*else if (*w <= 0x001fffff)
  263. n = 4;
  264. else if (*w <= 0x03ffffff)
  265. n = 5;
  266. else // if (*w <= 0x7fffffff)
  267. n = 6;*/
  268. // Convert to little endian.
  269. word ch = (*w >> 8) & 0x00FF;
  270. ch |= (*w << 8) & 0xFF00;
  271. // word ch = EMPConvertEndian(*w, ROCKET_ENDIAN_BIG);
  272. oc = (unsigned char *)&ch;
  273. switch (n)
  274. {
  275. case 1:
  276. output += oc[1];
  277. break;
  278. case 2:
  279. output += (_SEQ2 | (oc[1] >> 6) | ((oc[0] & 0x07) << 2));
  280. output += (_NXT | (oc[1] & 0x3f));
  281. break;
  282. case 3:
  283. output += (_SEQ3 | ((oc[0] & 0xf0) >> 4));
  284. output += (_NXT | (oc[1] >> 6) | ((oc[0] & 0x0f) << 2));
  285. output += (_NXT | (oc[1] & 0x3f));
  286. break;
  287. case 4:
  288. break;
  289. case 5:
  290. break;
  291. case 6:
  292. break;
  293. }
  294. //Log::Message(LC_CORE, Log::LT_ALWAYS, "Converting...%c(%d) %d -> %d", *w, *w, w - input, output.size());
  295. }
  296. return true;
  297. }
  298. // Strip whitespace characters from the beginning and end of a string.
  299. String StringUtilities::StripWhitespace(const String& string)
  300. {
  301. const char* start = string.c_str();
  302. const char* end = start + string.size();
  303. while (start < end && IsWhitespace(*start))
  304. start++;
  305. while (end > start && IsWhitespace(*(end - 1)))
  306. end--;
  307. if (start < end)
  308. return String(start, end);
  309. return String();
  310. }
  311. // Operators for STL containers using strings.
  312. bool StringUtilities::StringComparei::operator()(const String& lhs, const String& rhs) const
  313. {
  314. return strcasecmp(lhs.c_str(), rhs.c_str()) < 0;
  315. }
  316. }
  317. }