StringUtilities.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. * This source file is part of libRocket, the HTML/CSS Interface Middleware
  3. *
  4. * For the latest information, see http://www.librocket.com
  5. *
  6. * Copyright (c) 2008-2010 CodePoint Ltd, Shift Technology Ltd
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. *
  26. */
  27. #include "precompiled.h"
  28. #include "../../Include/Rocket/Core/StringUtilities.h"
  29. #include <ctype.h>
  30. #include <stdio.h>
  31. namespace Rocket {
  32. namespace Core {
  33. // Expands character-delimited list of values in a single string to a whitespace-trimmed list of values.
  34. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter)
  35. {
  36. char quote = 0;
  37. bool last_char_delimiter = true;
  38. const char* ptr = string.CString();
  39. const char* start_ptr = NULL;
  40. const char* end_ptr = ptr;
  41. while (*ptr)
  42. {
  43. // Switch into quote mode if the last char was a delimeter ( excluding whitespace )
  44. // and we're not already in quote mode
  45. if (last_char_delimiter && !quote && (*ptr == '"' || *ptr == '\''))
  46. {
  47. quote = *ptr;
  48. }
  49. // Switch out of quote mode if we encounter a quote that hasn't been escaped
  50. else if (*ptr == quote && *(ptr-1) != '\\')
  51. {
  52. quote = 0;
  53. }
  54. // If we encouter a delimiter while not in quote mode, add the item to the list
  55. else if (*ptr == delimiter && !quote)
  56. {
  57. if (start_ptr)
  58. string_list.push_back(String(start_ptr, end_ptr + 1));
  59. else
  60. string_list.push_back("");
  61. last_char_delimiter = true;
  62. start_ptr = NULL;
  63. }
  64. // Otherwise if its not white space or we're in quote mode, advance the pointers
  65. else if (!isspace(*ptr) || quote)
  66. {
  67. if (!start_ptr)
  68. start_ptr = ptr;
  69. end_ptr = ptr;
  70. last_char_delimiter = false;
  71. }
  72. ptr++;
  73. }
  74. // If there's data pending, add it.
  75. if (start_ptr)
  76. string_list.push_back(String(start_ptr, end_ptr + 1));
  77. }
  78. // Joins a list of string values into a single string separated by a character delimiter.
  79. void StringUtilities::JoinString(String& string, const StringList& string_list, const char delimiter)
  80. {
  81. for (size_t i = 0; i < string_list.size(); i++)
  82. {
  83. string += string_list[i];
  84. if (delimiter != '\0' && i < string_list.size() - 1)
  85. string.Append(delimiter);
  86. }
  87. }
  88. // Hashes a string of data to an integer value using the FNV algorithm.
  89. Hash StringUtilities::FNVHash(const char *string, int length)
  90. {
  91. // FNV-1 hash algorithm
  92. Hash hval = 0;
  93. unsigned char* bp = (unsigned char *)string; // start of buffer
  94. unsigned char* be = (unsigned char *)string + length;
  95. // FNV-1 hash each octet in the buffer
  96. while (*bp || (length >= 0 && bp < be))
  97. {
  98. // xor the bottom with the current octet
  99. hval ^= *bp++;
  100. /* multiply by the 32 bit FNV magic prime mod 2^32 */
  101. #if !defined(__GNUC__)
  102. const unsigned int FNV_32_PRIME = ((unsigned int)16777619);
  103. hval *= FNV_32_PRIME;
  104. #else
  105. hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
  106. #endif
  107. }
  108. return hval;
  109. }
  // Defines, helper functions for the UTF8 / UCS2 conversion functions.

  // Top two bits (10xxxxxx) that every continuation byte of a multi-byte sequence must carry.
  #define _NXT 0x80
  // Lead-byte prefixes announcing a sequence of two to six bytes respectively.
  #define _SEQ2 0xC0
  #define _SEQ3 0xE0
  #define _SEQ4 0xF0
  #define _SEQ5 0xF8
  #define _SEQ6 0xFC
  // Byte order mark; skipped rather than encoded when converting UCS-2 to UTF-8.
  #define _BOM 0xFEFF
  // Tests whether a character code falls in the surrogate range (0xd800 to 0xdfff inclusive).
  // Returns -1 if it does, 0 for every other value.
  static int __wchar_forbidden(unsigned int sym)
  {
      const bool outside_surrogates = (sym < 0xd800 || sym > 0xdfff);
      return outside_surrogates ? 0 : -1;
  }
  // Tests whether an octet is one of the values this converter refuses as the first byte of a
  // UTF-8 sequence: 0xc0, 0xc1, 0xf5 or 0xff.
  // Returns -1 for those four values, 0 for anything else.
  static int __utf8_forbidden(unsigned char octet)
  {
      const bool rejected = (octet == 0xc0) || (octet == 0xc1) || (octet == 0xf5) || (octet == 0xff);
      return rejected ? -1 : 0;
  }
  138. // Converts a character array in UTF-8 encoding to a vector of words.
  139. bool StringUtilities::UTF8toUCS2(const String& input, std::vector< word >& output)
  140. {
  141. if (input.Empty())
  142. return true;
  143. unsigned char* p = (unsigned char*) input.CString();
  144. unsigned char* lim = p + input.Length();
  145. // Skip the UTF-8 byte order marker if it exists.
  146. if (input.Substring(0, 3) == "\xEF\xBB\xBF")
  147. p += 3;
  148. int num_bytes;
  149. for (; p < lim; p += num_bytes)
  150. {
  151. if (__utf8_forbidden(*p) != 0)
  152. return false;
  153. // Get number of bytes for one wide character.
  154. word high;
  155. num_bytes = 1;
  156. if ((*p & 0x80) == 0)
  157. {
  158. high = (wchar_t)*p;
  159. }
  160. else if ((*p & 0xe0) == _SEQ2)
  161. {
  162. num_bytes = 2;
  163. high = (wchar_t)(*p & 0x1f);
  164. }
  165. else if ((*p & 0xf0) == _SEQ3)
  166. {
  167. num_bytes = 3;
  168. high = (wchar_t)(*p & 0x0f);
  169. }
  170. else if ((*p & 0xf8) == _SEQ4)
  171. {
  172. num_bytes = 4;
  173. high = (wchar_t)(*p & 0x07);
  174. }
  175. else if ((*p & 0xfc) == _SEQ5)
  176. {
  177. num_bytes = 5;
  178. high = (wchar_t)(*p & 0x03);
  179. }
  180. else if ((*p & 0xfe) == _SEQ6)
  181. {
  182. num_bytes = 6;
  183. high = (wchar_t)(*p & 0x01);
  184. }
  185. else
  186. {
  187. return false;
  188. }
  189. // Does the sequence header tell us the truth about length?
  190. if (lim - p <= num_bytes - 1)
  191. {
  192. return false;
  193. }
  194. // Validate the sequence. All symbols must have higher bits set to 10xxxxxx.
  195. if (num_bytes > 1)
  196. {
  197. int i;
  198. for (i = 1; i < num_bytes; i++)
  199. {
  200. if ((p[i] & 0xc0) != _NXT)
  201. break;
  202. }
  203. if (i != num_bytes)
  204. {
  205. return false;
  206. }
  207. }
  208. // Make up a single UCS-4 (32-bit) character from the required number of UTF-8 tokens. The first byte has
  209. // been determined earlier, the second and subsequent bytes contribute the first six of their bits into the
  210. // final character code.
  211. unsigned int ucs4_char = 0;
  212. int num_bits = 0;
  213. for (int i = 1; i < num_bytes; i++)
  214. {
  215. ucs4_char |= (word)(p[num_bytes - i] & 0x3f) << num_bits;
  216. num_bits += 6;
  217. }
  218. ucs4_char |= high << num_bits;
  219. // Check for surrogate pairs.
  220. if (__wchar_forbidden(ucs4_char) != 0)
  221. {
  222. return false;
  223. }
  224. // Only add the character to the output if it exists in the Basic Multilingual Plane (ie, fits in a single
  225. // word).
  226. if (ucs4_char <= 0xffff)
  227. output.push_back((word) ucs4_char);
  228. }
  229. output.push_back(0);
  230. return true;
  231. }
  232. // Converts a vector of words in UCS-2 encoding a character array in UTF-8 encoding.
  233. bool StringUtilities::UCS2toUTF8(const std::vector< word >& input, String& output)
  234. {
  235. return UCS2toUTF8(&input[0], input.size(), output);
  236. }
  237. // Converts an array of words in UCS-2 encoding into a character array in UTF-8 encoding.
  238. bool StringUtilities::UCS2toUTF8(const word* input, size_t input_size, String& output)
  239. {
  240. unsigned char *oc;
  241. size_t n;
  242. word* w = (word*) input;
  243. word* wlim = w + input_size;
  244. //Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
  245. for (; w < wlim; w++)
  246. {
  247. if (__wchar_forbidden(*w) != 0)
  248. return false;
  249. if (*w == _BOM)
  250. continue;
  251. //if (*w < 0)
  252. // return false;
  253. if (*w <= 0x007f)
  254. n = 1;
  255. else if (*w <= 0x07ff)
  256. n = 2;
  257. else //if (*w <= 0x0000ffff)
  258. n = 3;
  259. /*else if (*w <= 0x001fffff)
  260. n = 4;
  261. else if (*w <= 0x03ffffff)
  262. n = 5;
  263. else // if (*w <= 0x7fffffff)
  264. n = 6;*/
  265. // Convert to little endian.
  266. word ch = (*w >> 8) & 0x00FF;
  267. ch |= (*w << 8) & 0xFF00;
  268. // word ch = EMPConvertEndian(*w, ROCKET_ENDIAN_BIG);
  269. oc = (unsigned char *)&ch;
  270. switch (n)
  271. {
  272. case 1:
  273. output += oc[1];
  274. break;
  275. case 2:
  276. output += (_SEQ2 | (oc[1] >> 6) | ((oc[0] & 0x07) << 2));
  277. output += (_NXT | (oc[1] & 0x3f));
  278. break;
  279. case 3:
  280. output += (_SEQ3 | ((oc[0] & 0xf0) >> 4));
  281. output += (_NXT | (oc[1] >> 6) | ((oc[0] & 0x0f) << 2));
  282. output += (_NXT | (oc[1] & 0x3f));
  283. break;
  284. case 4:
  285. break;
  286. case 5:
  287. break;
  288. case 6:
  289. break;
  290. }
  291. //Log::Message(LC_CORE, Log::LT_ALWAYS, "Converting...%c(%d) %d -> %d", *w, *w, w - input, output.Length());
  292. }
  293. return true;
  294. }
  295. // Strip whitespace characters from the beginning and end of a string.
  296. String StringUtilities::StripWhitespace(const String& string)
  297. {
  298. const char* start = string.CString();
  299. const char* end = start + string.Length();
  300. while (start < end && IsWhitespace(*start))
  301. start++;
  302. while (end > start && IsWhitespace(*(end - 1)))
  303. end--;
  304. if (start < end)
  305. return String(start, end);
  306. return String();
  307. }
  308. // Operators for STL containers using strings.
  309. bool StringUtilities::StringComparei::operator()(const String& lhs, const String& rhs) const
  310. {
  311. return strcasecmp(lhs.CString(), rhs.CString()) < 0;
  312. }
  313. }
  314. }