StringUtilities.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. #include "..\..\Include\RmlUi\Core\StringUtilities.h"
  2. #include "..\..\Include\RmlUi\Core\StringUtilities.h"
  3. /*
  4. * This source file is part of RmlUi, the HTML/CSS Interface Middleware
  5. *
  6. * For the latest information, see http://github.com/mikke89/RmlUi
  7. *
  8. * Copyright (c) 2008-2010 CodePoint Ltd, Shift Technology Ltd
  9. * Copyright (c) 2019 The RmlUi Team, and contributors
  10. *
  11. * Permission is hereby granted, free of charge, to any person obtaining a copy
  12. * of this software and associated documentation files (the "Software"), to deal
  13. * in the Software without restriction, including without limitation the rights
  14. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  15. * copies of the Software, and to permit persons to whom the Software is
  16. * furnished to do so, subject to the following conditions:
  17. *
  18. * The above copyright notice and this permission notice shall be included in
  19. * all copies or substantial portions of the Software.
  20. *
  21. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  22. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  23. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  24. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  25. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  26. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  27. * THE SOFTWARE.
  28. *
  29. */
  30. #include "precompiled.h"
  31. #include "../../Include/RmlUi/Core/StringUtilities.h"
  32. #include <ctype.h>
  33. #include <stdio.h>
  34. #include <stdarg.h>
  35. namespace Rml {
  36. namespace Core {
  37. static bool UTF8toUCS2(const String& input, WString& output);
  38. static bool UCS2toUTF8(const WString& input, String& output);
  39. static int FormatString(String& string, size_t max_size, const char* format, va_list argument_list)
  40. {
  41. const int INTERNAL_BUFFER_SIZE = 1024;
  42. static char buffer[INTERNAL_BUFFER_SIZE];
  43. char* buffer_ptr = buffer;
  44. if (max_size + 1 > INTERNAL_BUFFER_SIZE)
  45. buffer_ptr = new char[max_size + 1];
  46. int length = vsnprintf(buffer_ptr, max_size, format, argument_list);
  47. buffer_ptr[length >= 0 ? length : max_size] = '\0';
  48. #ifdef RMLUI_DEBUG
  49. if (length == -1)
  50. {
  51. Log::Message(Log::LT_WARNING, "FormatString: String truncated to %d bytes when processing %s", max_size, format);
  52. }
  53. #endif
  54. string = buffer_ptr;
  55. if (buffer_ptr != buffer)
  56. delete[] buffer_ptr;
  57. return length;
  58. }
  59. int FormatString(String& string, size_t max_size, const char* format, ...)
  60. {
  61. va_list argument_list;
  62. va_start(argument_list, format);
  63. int result = FormatString(string, (int)max_size, format, argument_list);
  64. va_end(argument_list);
  65. return result;
  66. }
  67. String CreateString(size_t max_size, const char* format, ...)
  68. {
  69. String result;
  70. result.reserve(max_size);
  71. va_list argument_list;
  72. va_start(argument_list, format);
  73. FormatString(result, max_size, format, argument_list);
  74. va_end(argument_list);
  75. return result;
  76. }
  77. String StringUtilities::ToLower(const String& string) {
  78. String str_lower = string;
  79. std::transform(str_lower.begin(), str_lower.end(), str_lower.begin(), ::tolower);
  80. return str_lower;
  81. }
  82. WString StringUtilities::ToUCS2(const String& str)
  83. {
  84. WString result;
  85. if (!UTF8toUCS2(str, result))
  86. Log::Message(Log::LT_WARNING, "Failed to convert UTF8 string to UCS2.");
  87. return result;
  88. }
  89. WString StringUtilities::ToUTF16(const String& str)
  90. {
  91. // TODO: Convert to UTF16 instead of UCS2
  92. return ToUCS2(str);
  93. }
  94. String StringUtilities::ToUTF8(const WString& wstr)
  95. {
  96. String result;
  97. if(!UCS2toUTF8(wstr, result))
  98. Log::Message(Log::LT_WARNING, "Failed to convert UCS2 string to UTF8.");
  99. return result;
  100. }
  101. int StringUtilities::LengthUTF8(const String& str)
  102. {
  103. // TODO: Actually consider multibyte characters
  104. return (int)str.size();
  105. }
  106. String StringUtilities::Replace(String subject, const String& search, const String& replace)
  107. {
  108. size_t pos = 0;
  109. while ((pos = subject.find(search, pos)) != String::npos) {
  110. subject.replace(pos, search.length(), replace);
  111. pos += replace.length();
  112. }
  113. return subject;
  114. }
  115. String StringUtilities::Replace(String subject, char search, char replace)
  116. {
  117. const size_t size = subject.size();
  118. for (size_t i = 0; i < size; i++)
  119. {
  120. if (subject[i] == search)
  121. subject[i] = replace;
  122. }
  123. return subject;
  124. }
  125. // Expands character-delimited list of values in a single string to a whitespace-trimmed list of values.
  126. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter)
  127. {
  128. char quote = 0;
  129. bool last_char_delimiter = true;
  130. const char* ptr = string.c_str();
  131. const char* start_ptr = nullptr;
  132. const char* end_ptr = ptr;
  133. size_t num_delimiter_values = std::count(string.begin(), string.end(), delimiter);
  134. if (num_delimiter_values == 0)
  135. {
  136. string_list.push_back(StripWhitespace(string));
  137. return;
  138. }
  139. string_list.reserve(string_list.size() + num_delimiter_values + 1);
  140. while (*ptr)
  141. {
  142. // Switch into quote mode if the last char was a delimeter ( excluding whitespace )
  143. // and we're not already in quote mode
  144. if (last_char_delimiter && !quote && (*ptr == '"' || *ptr == '\''))
  145. {
  146. quote = *ptr;
  147. }
  148. // Switch out of quote mode if we encounter a quote that hasn't been escaped
  149. else if (*ptr == quote && *(ptr-1) != '\\')
  150. {
  151. quote = 0;
  152. }
  153. // If we encounter a delimiter while not in quote mode, add the item to the list
  154. else if (*ptr == delimiter && !quote)
  155. {
  156. if (start_ptr)
  157. string_list.emplace_back(start_ptr, end_ptr + 1);
  158. else
  159. string_list.emplace_back();
  160. last_char_delimiter = true;
  161. start_ptr = nullptr;
  162. }
  163. // Otherwise if its not white space or we're in quote mode, advance the pointers
  164. else if (!isspace(*ptr) || quote)
  165. {
  166. if (!start_ptr)
  167. start_ptr = ptr;
  168. end_ptr = ptr;
  169. last_char_delimiter = false;
  170. }
  171. ptr++;
  172. }
  173. // If there's data pending, add it.
  174. if (start_ptr)
  175. string_list.emplace_back(start_ptr, end_ptr + 1);
  176. }
  177. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter, char quote_character, char unquote_character, bool ignore_repeated_delimiters)
  178. {
  179. int quote_mode_depth = 0;
  180. const char* ptr = string.c_str();
  181. const char* start_ptr = nullptr;
  182. const char* end_ptr = ptr;
  183. while (*ptr)
  184. {
  185. // Increment the quote depth for each quote character encountered
  186. if (*ptr == quote_character)
  187. {
  188. ++quote_mode_depth;
  189. }
  190. // And decrement it for every unquote character
  191. else if (*ptr == unquote_character)
  192. {
  193. --quote_mode_depth;
  194. }
  195. // If we encounter a delimiter while not in quote mode, add the item to the list
  196. if (*ptr == delimiter && quote_mode_depth == 0)
  197. {
  198. if (start_ptr)
  199. string_list.emplace_back(start_ptr, end_ptr + 1);
  200. else if(!ignore_repeated_delimiters)
  201. string_list.emplace_back();
  202. start_ptr = nullptr;
  203. }
  204. // Otherwise if its not white space or we're in quote mode, advance the pointers
  205. else if (!isspace(*ptr) || quote_mode_depth > 0)
  206. {
  207. if (!start_ptr)
  208. start_ptr = ptr;
  209. end_ptr = ptr;
  210. }
  211. ptr++;
  212. }
  213. // If there's data pending, add it.
  214. if (start_ptr)
  215. string_list.emplace_back(start_ptr, end_ptr + 1);
  216. }
  217. // Joins a list of string values into a single string separated by a character delimiter.
  218. void StringUtilities::JoinString(String& string, const StringList& string_list, const char delimiter)
  219. {
  220. for (size_t i = 0; i < string_list.size(); i++)
  221. {
  222. string += string_list[i];
  223. if (delimiter != '\0' && i < string_list.size() - 1)
  224. string += delimiter;
  225. }
  226. }
  227. // Strip whitespace characters from the beginning and end of a string.
  228. String StringUtilities::StripWhitespace(const String& string)
  229. {
  230. const char* start = string.c_str();
  231. const char* end = start + string.size();
  232. while (start < end && IsWhitespace(*start))
  233. start++;
  234. while (end > start && IsWhitespace(*(end - 1)))
  235. end--;
  236. if (start < end)
  237. return String(start, end);
  238. return String();
  239. }
  240. // Operators for STL containers using strings.
  241. bool StringUtilities::StringComparei::operator()(const String& lhs, const String& rhs) const
  242. {
  243. return strcasecmp(lhs.c_str(), rhs.c_str()) < 0;
  244. }
  245. // Defines, helper functions for the UTF8 / UCS2 conversion functions.
  246. #define _NXT 0x80
  247. #define _SEQ2 0xc0
  248. #define _SEQ3 0xe0
  249. #define _SEQ4 0xf0
  250. #define _SEQ5 0xf8
  251. #define _SEQ6 0xfc
  252. #define _BOM 0xfeff
  253. static int __wchar_forbidden(unsigned int sym)
  254. {
  255. // Surrogate pairs
  256. if (sym >= 0xd800 && sym <= 0xdfff)
  257. return -1;
  258. return 0;
  259. }
  260. static int __utf8_forbidden(unsigned char octet)
  261. {
  262. switch (octet)
  263. {
  264. case 0xc0:
  265. case 0xc1:
  266. case 0xf5:
  267. case 0xff:
  268. return -1;
  269. default:
  270. return 0;
  271. }
  272. }
  273. // Converts a character array in UTF-8 encoding to a vector of words.
  274. static bool UTF8toUCS2(const String& input, WString& output)
  275. {
  276. if (input.empty())
  277. return true;
  278. output.reserve(input.size());
  279. unsigned char* p = (unsigned char*) input.c_str();
  280. unsigned char* lim = p + input.size();
  281. // Skip the UTF-8 byte order marker if it exists.
  282. if (input.substr(0, 3) == "\xEF\xBB\xBF")
  283. p += 3;
  284. int num_bytes;
  285. for (; p < lim; p += num_bytes)
  286. {
  287. if (__utf8_forbidden(*p) != 0)
  288. return false;
  289. // Get number of bytes for one wide character.
  290. word high;
  291. num_bytes = 1;
  292. if ((*p & 0x80) == 0)
  293. {
  294. high = (wchar_t)*p;
  295. }
  296. else if ((*p & 0xe0) == _SEQ2)
  297. {
  298. num_bytes = 2;
  299. high = (wchar_t)(*p & 0x1f);
  300. }
  301. else if ((*p & 0xf0) == _SEQ3)
  302. {
  303. num_bytes = 3;
  304. high = (wchar_t)(*p & 0x0f);
  305. }
  306. else if ((*p & 0xf8) == _SEQ4)
  307. {
  308. num_bytes = 4;
  309. high = (wchar_t)(*p & 0x07);
  310. }
  311. else if ((*p & 0xfc) == _SEQ5)
  312. {
  313. num_bytes = 5;
  314. high = (wchar_t)(*p & 0x03);
  315. }
  316. else if ((*p & 0xfe) == _SEQ6)
  317. {
  318. num_bytes = 6;
  319. high = (wchar_t)(*p & 0x01);
  320. }
  321. else
  322. {
  323. return false;
  324. }
  325. // Does the sequence header tell us the truth about length?
  326. if (lim - p <= num_bytes - 1)
  327. {
  328. return false;
  329. }
  330. // Validate the sequence. All symbols must have higher bits set to 10xxxxxx.
  331. if (num_bytes > 1)
  332. {
  333. int i;
  334. for (i = 1; i < num_bytes; i++)
  335. {
  336. if ((p[i] & 0xc0) != _NXT)
  337. break;
  338. }
  339. if (i != num_bytes)
  340. {
  341. return false;
  342. }
  343. }
  344. // Make up a single UCS-4 (32-bit) character from the required number of UTF-8 tokens. The first byte has
  345. // been determined earlier, the second and subsequent bytes contribute the first six of their bits into the
  346. // final character code.
  347. unsigned int ucs4_char = 0;
  348. int num_bits = 0;
  349. for (int i = 1; i < num_bytes; i++)
  350. {
  351. ucs4_char |= (word)(p[num_bytes - i] & 0x3f) << num_bits;
  352. num_bits += 6;
  353. }
  354. ucs4_char |= high << num_bits;
  355. // Check for surrogate pairs.
  356. if (__wchar_forbidden(ucs4_char) != 0)
  357. {
  358. return false;
  359. }
  360. // Only add the character to the output if it exists in the Basic Multilingual Plane (ie, fits in a single
  361. // word).
  362. if (ucs4_char <= 0xffff)
  363. output.push_back((word) ucs4_char);
  364. }
  365. return true;
  366. }
  367. // Converts an array of words in UCS-2 encoding into a character array in UTF-8 encoding.
  368. static bool UCS2toUTF8(const WString& input, String& output)
  369. {
  370. unsigned char *oc;
  371. size_t n;
  372. output.reserve(input.size());
  373. const word* w = input.data();
  374. const word* wlim = w + input.size();
  375. //Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
  376. for (; w < wlim; w++)
  377. {
  378. if (__wchar_forbidden(*w) != 0)
  379. return false;
  380. if (*w == _BOM)
  381. continue;
  382. //if (*w < 0)
  383. // return false;
  384. if (*w <= 0x007f)
  385. n = 1;
  386. else if (*w <= 0x07ff)
  387. n = 2;
  388. else //if (*w <= 0x0000ffff)
  389. n = 3;
  390. /*else if (*w <= 0x001fffff)
  391. n = 4;
  392. else if (*w <= 0x03ffffff)
  393. n = 5;
  394. else // if (*w <= 0x7fffffff)
  395. n = 6;*/
  396. // Convert to little endian.
  397. word ch = (*w >> 8) & 0x00FF;
  398. ch |= (*w << 8) & 0xFF00;
  399. // word ch = EMPConvertEndian(*w, RMLUI_ENDIAN_BIG);
  400. oc = (unsigned char *)&ch;
  401. switch (n)
  402. {
  403. case 1:
  404. output += oc[1];
  405. break;
  406. case 2:
  407. output += (_SEQ2 | (oc[1] >> 6) | ((oc[0] & 0x07) << 2));
  408. output += (_NXT | (oc[1] & 0x3f));
  409. break;
  410. case 3:
  411. output += (_SEQ3 | ((oc[0] & 0xf0) >> 4));
  412. output += (_NXT | (oc[1] >> 6) | ((oc[0] & 0x0f) << 2));
  413. output += (_NXT | (oc[1] & 0x3f));
  414. break;
  415. case 4:
  416. break;
  417. case 5:
  418. break;
  419. case 6:
  420. break;
  421. }
  422. //Log::Message(LC_CORE, Log::LT_ALWAYS, "Converting...%c(%d) %d -> %d", *w, *w, w - input, output.size());
  423. }
  424. return true;
  425. }
  426. }
  427. }