// StringUtilities.cpp
  1. #include "..\..\Include\RmlUi\Core\StringUtilities.h"
  2. #include "..\..\Include\RmlUi\Core\StringUtilities.h"
  3. #include "..\..\Include\RmlUi\Core\StringUtilities.h"
  4. /*
  5. * This source file is part of RmlUi, the HTML/CSS Interface Middleware
  6. *
  7. * For the latest information, see http://github.com/mikke89/RmlUi
  8. *
  9. * Copyright (c) 2008-2010 CodePoint Ltd, Shift Technology Ltd
  10. * Copyright (c) 2019 The RmlUi Team, and contributors
  11. *
  12. * Permission is hereby granted, free of charge, to any person obtaining a copy
  13. * of this software and associated documentation files (the "Software"), to deal
  14. * in the Software without restriction, including without limitation the rights
  15. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  16. * copies of the Software, and to permit persons to whom the Software is
  17. * furnished to do so, subject to the following conditions:
  18. *
  19. * The above copyright notice and this permission notice shall be included in
  20. * all copies or substantial portions of the Software.
  21. *
  22. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  23. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  24. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  25. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  26. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  27. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  28. * THE SOFTWARE.
  29. *
  30. */
  31. #include "precompiled.h"
  32. #include "../../Include/RmlUi/Core/StringUtilities.h"
  33. #include <ctype.h>
  34. #include <stdio.h>
  35. #include <stdarg.h>
  36. namespace Rml {
  37. namespace Core {
  38. static bool UTF8toUCS2(const String& input, WString& output);
  39. static bool UCS2toUTF8(const WString& input, String& output);
  40. static int FormatString(String& string, size_t max_size, const char* format, va_list argument_list)
  41. {
  42. const int INTERNAL_BUFFER_SIZE = 1024;
  43. static char buffer[INTERNAL_BUFFER_SIZE];
  44. char* buffer_ptr = buffer;
  45. if (max_size + 1 > INTERNAL_BUFFER_SIZE)
  46. buffer_ptr = new char[max_size + 1];
  47. int length = vsnprintf(buffer_ptr, max_size, format, argument_list);
  48. buffer_ptr[length >= 0 ? length : max_size] = '\0';
  49. #ifdef RMLUI_DEBUG
  50. if (length == -1)
  51. {
  52. Log::Message(Log::LT_WARNING, "FormatString: String truncated to %d bytes when processing %s", max_size, format);
  53. }
  54. #endif
  55. string = buffer_ptr;
  56. if (buffer_ptr != buffer)
  57. delete[] buffer_ptr;
  58. return length;
  59. }
  60. int FormatString(String& string, size_t max_size, const char* format, ...)
  61. {
  62. va_list argument_list;
  63. va_start(argument_list, format);
  64. int result = FormatString(string, (int)max_size, format, argument_list);
  65. va_end(argument_list);
  66. return result;
  67. }
  68. String CreateString(size_t max_size, const char* format, ...)
  69. {
  70. String result;
  71. result.reserve(max_size);
  72. va_list argument_list;
  73. va_start(argument_list, format);
  74. FormatString(result, max_size, format, argument_list);
  75. va_end(argument_list);
  76. return result;
  77. }
  78. String StringUtilities::ToLower(const String& string) {
  79. String str_lower = string;
  80. std::transform(str_lower.begin(), str_lower.end(), str_lower.begin(), ::tolower);
  81. return str_lower;
  82. }
  83. WString StringUtilities::ToUCS2(const String& str)
  84. {
  85. WString result;
  86. if (!UTF8toUCS2(str, result))
  87. Log::Message(Log::LT_WARNING, "Failed to convert UTF8 string to UCS2.");
  88. return result;
  89. }
  90. WString StringUtilities::ToUTF16(const String& str)
  91. {
  92. // TODO: Convert to UTF16 instead of UCS2
  93. return ToUCS2(str);
  94. }
  95. String StringUtilities::ToUTF8(const WString& wstr)
  96. {
  97. /// TODO: Convert from UTF-16 instead.
  98. String result;
  99. if(!UCS2toUTF8(wstr, result))
  100. Log::Message(Log::LT_WARNING, "Failed to convert UCS2 string to UTF8.");
  101. return result;
  102. }
  103. size_t StringUtilities::LengthU8(const String& str)
  104. {
  105. const char* p = str.data();
  106. const char* p_end = str.data() + str.size();
  107. size_t num_continuation_bytes = 0;
  108. while (p != p_end)
  109. {
  110. if ((*p & 0b1100'0000) == 0b1000'0000)
  111. ++num_continuation_bytes;
  112. ++p;
  113. }
  114. return str.size() - num_continuation_bytes;
  115. }
  116. String StringUtilities::Replace(String subject, const String& search, const String& replace)
  117. {
  118. size_t pos = 0;
  119. while ((pos = subject.find(search, pos)) != String::npos) {
  120. subject.replace(pos, search.length(), replace);
  121. pos += replace.length();
  122. }
  123. return subject;
  124. }
  125. String StringUtilities::Replace(String subject, char search, char replace)
  126. {
  127. const size_t size = subject.size();
  128. for (size_t i = 0; i < size; i++)
  129. {
  130. if (subject[i] == search)
  131. subject[i] = replace;
  132. }
  133. return subject;
  134. }
  135. // Expands character-delimited list of values in a single string to a whitespace-trimmed list of values.
  136. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter)
  137. {
  138. char quote = 0;
  139. bool last_char_delimiter = true;
  140. const char* ptr = string.c_str();
  141. const char* start_ptr = nullptr;
  142. const char* end_ptr = ptr;
  143. size_t num_delimiter_values = std::count(string.begin(), string.end(), delimiter);
  144. if (num_delimiter_values == 0)
  145. {
  146. string_list.push_back(StripWhitespace(string));
  147. return;
  148. }
  149. string_list.reserve(string_list.size() + num_delimiter_values + 1);
  150. while (*ptr)
  151. {
  152. // Switch into quote mode if the last char was a delimeter ( excluding whitespace )
  153. // and we're not already in quote mode
  154. if (last_char_delimiter && !quote && (*ptr == '"' || *ptr == '\''))
  155. {
  156. quote = *ptr;
  157. }
  158. // Switch out of quote mode if we encounter a quote that hasn't been escaped
  159. else if (*ptr == quote && *(ptr-1) != '\\')
  160. {
  161. quote = 0;
  162. }
  163. // If we encounter a delimiter while not in quote mode, add the item to the list
  164. else if (*ptr == delimiter && !quote)
  165. {
  166. if (start_ptr)
  167. string_list.emplace_back(start_ptr, end_ptr + 1);
  168. else
  169. string_list.emplace_back();
  170. last_char_delimiter = true;
  171. start_ptr = nullptr;
  172. }
  173. // Otherwise if its not white space or we're in quote mode, advance the pointers
  174. else if (!isspace(*ptr) || quote)
  175. {
  176. if (!start_ptr)
  177. start_ptr = ptr;
  178. end_ptr = ptr;
  179. last_char_delimiter = false;
  180. }
  181. ptr++;
  182. }
  183. // If there's data pending, add it.
  184. if (start_ptr)
  185. string_list.emplace_back(start_ptr, end_ptr + 1);
  186. }
  187. void StringUtilities::ExpandString(StringList& string_list, const String& string, const char delimiter, char quote_character, char unquote_character, bool ignore_repeated_delimiters)
  188. {
  189. int quote_mode_depth = 0;
  190. const char* ptr = string.c_str();
  191. const char* start_ptr = nullptr;
  192. const char* end_ptr = ptr;
  193. while (*ptr)
  194. {
  195. // Increment the quote depth for each quote character encountered
  196. if (*ptr == quote_character)
  197. {
  198. ++quote_mode_depth;
  199. }
  200. // And decrement it for every unquote character
  201. else if (*ptr == unquote_character)
  202. {
  203. --quote_mode_depth;
  204. }
  205. // If we encounter a delimiter while not in quote mode, add the item to the list
  206. if (*ptr == delimiter && quote_mode_depth == 0)
  207. {
  208. if (start_ptr)
  209. string_list.emplace_back(start_ptr, end_ptr + 1);
  210. else if(!ignore_repeated_delimiters)
  211. string_list.emplace_back();
  212. start_ptr = nullptr;
  213. }
  214. // Otherwise if its not white space or we're in quote mode, advance the pointers
  215. else if (!isspace(*ptr) || quote_mode_depth > 0)
  216. {
  217. if (!start_ptr)
  218. start_ptr = ptr;
  219. end_ptr = ptr;
  220. }
  221. ptr++;
  222. }
  223. // If there's data pending, add it.
  224. if (start_ptr)
  225. string_list.emplace_back(start_ptr, end_ptr + 1);
  226. }
  227. // Joins a list of string values into a single string separated by a character delimiter.
  228. void StringUtilities::JoinString(String& string, const StringList& string_list, const char delimiter)
  229. {
  230. for (size_t i = 0; i < string_list.size(); i++)
  231. {
  232. string += string_list[i];
  233. if (delimiter != '\0' && i < string_list.size() - 1)
  234. string += delimiter;
  235. }
  236. }
  237. // Strip whitespace characters from the beginning and end of a string.
  238. String StringUtilities::StripWhitespace(const String& string)
  239. {
  240. const char* start = string.c_str();
  241. const char* end = start + string.size();
  242. while (start < end && IsWhitespace(*start))
  243. start++;
  244. while (end > start && IsWhitespace(*(end - 1)))
  245. end--;
  246. if (start < end)
  247. return String(start, end);
  248. return String();
  249. }
  250. CodePoint StringUtilities::ToCodePoint(const char* p)
  251. {
  252. if ((*p & (1 << 7)) == 0)
  253. return static_cast<CodePoint>(*p);
  254. int num_bytes = 0;
  255. int code = 0;
  256. if ((*p & 0b1110'0000) == 0b1100'0000)
  257. {
  258. num_bytes = 2;
  259. code = (*p & 0b0001'1111);
  260. }
  261. else if ((*p & 0b1111'0000) == 0b1110'0000)
  262. {
  263. num_bytes = 3;
  264. code = (*p & 0b0000'1111);
  265. }
  266. else if ((*p & 0b1111'1000) == 0b1111'0000)
  267. {
  268. num_bytes = 4;
  269. code = (*p & 0b0000'0111);
  270. }
  271. else
  272. {
  273. // Invalid begin byte
  274. return CodePoint::Null;
  275. }
  276. for (int i = 1; i < num_bytes; i++)
  277. {
  278. const char byte = *(p + i);
  279. if ((byte & 0b1100'0000) != 0b1000'0000)
  280. {
  281. // Invalid continuation byte
  282. ++p;
  283. return CodePoint::Null;
  284. }
  285. code |= ((byte & 0b0011'1111) << 8 * i);
  286. }
  287. return static_cast<CodePoint>(code);
  288. }
  289. String StringUtilities::ToUTF8(CodePoint code_point)
  290. {
  291. unsigned int c = (unsigned int)code_point;
  292. constexpr int l3 = 0b0000'0111;
  293. constexpr int l4 = 0b0000'1111;
  294. constexpr int l5 = 0b0001'1111;
  295. constexpr int l6 = 0b0011'1111;
  296. constexpr int h1 = 0b1000'0000;
  297. constexpr int h2 = 0b1100'0000;
  298. constexpr int h3 = 0b1110'0000;
  299. constexpr int h4 = 0b1111'0000;
  300. if (c < 0x80)
  301. return String(1, (char)c);
  302. else if(c < 0x800)
  303. return { char(((c >> 6) & l5) | h2), char((c & l6) | h1) };
  304. else if (c < 0x10000)
  305. return { char(((c >> 12) & l4) | h3), char(((c >> 6) & l6) | h1), char((c & l6) | h1) };
  306. else if (c < 0x10000)
  307. return { char(((c >> 18) & l3) | h4), char(((c >> 12) & l6) | h1), char(((c >> 6) & l6) | h1), char((c & l6) | h1) };
  308. // Invalid code point
  309. return String();
  310. }
  311. // Operators for STL containers using strings.
  312. bool StringUtilities::StringComparei::operator()(const String& lhs, const String& rhs) const
  313. {
  314. return strcasecmp(lhs.c_str(), rhs.c_str()) < 0;
  315. }
  // Defines, helper functions for the UTF8 / UCS2 conversion functions.
  constexpr int _NXT = 0b1000'0000;   // Marker bits of a continuation byte: 10xxxxxx
  constexpr int _SEQ2 = 0b1100'0000;  // Lead byte pattern of a 2-byte sequence: 110xxxxx
  constexpr int _SEQ3 = 0b1110'0000;  // Lead byte pattern of a 3-byte sequence: 1110xxxx
  constexpr int _SEQ4 = 0b1111'0000;  // Lead byte pattern of a 4-byte sequence: 11110xxx
  constexpr int _SEQ5 = 0b1111'1000;  // Lead byte pattern of a 5-byte sequence: 111110xx
  constexpr int _SEQ6 = 0b1111'1100;  // Lead byte pattern of a 6-byte sequence: 1111110x
  constexpr int _BOM = 0xFEFF;        // Byte order mark, skipped when converting from wide characters
  // Returns -1 if 'sym' lies in the surrogate range 0xD800 to 0xDFFF, otherwise 0.
  static int __wchar_forbidden(unsigned int sym)
  {
      // Surrogate pairs
      const bool is_surrogate = (sym >= 0xd800 && sym <= 0xdfff);
      return is_surrogate ? -1 : 0;
  }
  // Returns -1 if 'octet' is one of the byte values rejected outright by the UTF-8 decoder
  // (0xC0, 0xC1, 0xF5 and 0xFF), otherwise 0.
  static int __utf8_forbidden(unsigned char octet)
  {
      const bool is_forbidden = (octet == 0xc0 || octet == 0xc1 || octet == 0xf5 || octet == 0xff);
      return is_forbidden ? -1 : 0;
  }
  344. // Converts a character array in UTF-8 encoding to a vector of words.
  345. static bool UTF8toUCS2(const String& input, WString& output)
  346. {
  347. if (input.empty())
  348. return true;
  349. output.reserve(input.size());
  350. unsigned char* p = (unsigned char*) input.c_str();
  351. unsigned char* end = p + input.size();
  352. // Skip the UTF-8 byte order marker if it exists.
  353. if (input.substr(0, 3) == "\xEF\xBB\xBF")
  354. p += 3;
  355. int num_bytes;
  356. for (; p < end; p += num_bytes)
  357. {
  358. if (__utf8_forbidden(*p) != 0)
  359. return false;
  360. // Get number of bytes for one wide character.
  361. wchar_t high;
  362. num_bytes = 1;
  363. if ((*p & 0x80) == 0)
  364. {
  365. high = (wchar_t)*p;
  366. }
  367. else if ((*p & 0xe0) == _SEQ2)
  368. {
  369. num_bytes = 2;
  370. high = (wchar_t)(*p & 0x1f);
  371. }
  372. else if ((*p & 0xf0) == _SEQ3)
  373. {
  374. num_bytes = 3;
  375. high = (wchar_t)(*p & 0x0f);
  376. }
  377. else if ((*p & 0xf8) == _SEQ4)
  378. {
  379. num_bytes = 4;
  380. high = (wchar_t)(*p & 0x07);
  381. }
  382. else if ((*p & 0xfc) == _SEQ5)
  383. {
  384. num_bytes = 5;
  385. high = (wchar_t)(*p & 0x03);
  386. }
  387. else if ((*p & 0xfe) == _SEQ6)
  388. {
  389. num_bytes = 6;
  390. high = (wchar_t)(*p & 0x01);
  391. }
  392. else
  393. {
  394. return false;
  395. }
  396. // Does the sequence header tell us the truth about length?
  397. if (end - p <= num_bytes - 1)
  398. {
  399. return false;
  400. }
  401. // Validate the sequence. All symbols must have higher bits set to 10xxxxxx.
  402. if (num_bytes > 1)
  403. {
  404. int i;
  405. for (i = 1; i < num_bytes; i++)
  406. {
  407. if ((p[i] & 0b1100'0000) != _NXT)
  408. break;
  409. }
  410. if (i != num_bytes)
  411. {
  412. return false;
  413. }
  414. }
  415. // Make up a single UCS-4 (32-bit) character from the required number of UTF-8 tokens. The first byte has
  416. // been determined earlier, the second and subsequent bytes contribute the first six of their bits into the
  417. // final character code.
  418. unsigned int ucs4_char = 0;
  419. int num_bits = 0;
  420. for (int i = 1; i < num_bytes; i++)
  421. {
  422. ucs4_char |= (wchar_t)(p[num_bytes - i] & 0x3f) << num_bits;
  423. num_bits += 6;
  424. }
  425. ucs4_char |= high << num_bits;
  426. // Check for surrogate pairs.
  427. if (__wchar_forbidden(ucs4_char) != 0)
  428. {
  429. return false;
  430. }
  431. // Only add the character to the output if it exists in the Basic Multilingual Plane (ie, fits in a single
  432. // word).
  433. if (ucs4_char <= 0xffff)
  434. output.push_back((wchar_t) ucs4_char);
  435. }
  436. return true;
  437. }
  438. // Converts an array of words in UCS-2 encoding into a character array in UTF-8 encoding.
  439. static bool UCS2toUTF8(const WString& input, String& output)
  440. {
  441. unsigned char *oc;
  442. size_t n;
  443. output.reserve(input.size());
  444. const wchar_t* w = input.data();
  445. const wchar_t* wlim = w + input.size();
  446. //Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
  447. for (; w < wlim; w++)
  448. {
  449. if (__wchar_forbidden(*w) != 0)
  450. return false;
  451. if (*w == _BOM)
  452. continue;
  453. //if (*w < 0)
  454. // return false;
  455. if (*w <= 0x007f)
  456. n = 1;
  457. else if (*w <= 0x07ff)
  458. n = 2;
  459. else //if (*w <= 0x0000ffff)
  460. n = 3;
  461. /*else if (*w <= 0x001fffff)
  462. n = 4;
  463. else if (*w <= 0x03ffffff)
  464. n = 5;
  465. else // if (*w <= 0x7fffffff)
  466. n = 6;*/
  467. // Convert to little endian.
  468. wchar_t ch = (*w >> 8) & 0x00FF;
  469. ch |= (*w << 8) & 0xFF00;
  470. // word ch = EMPConvertEndian(*w, RMLUI_ENDIAN_BIG);
  471. oc = (unsigned char *)&ch;
  472. switch (n)
  473. {
  474. case 1:
  475. output += oc[1];
  476. break;
  477. case 2:
  478. output += (_SEQ2 | (oc[1] >> 6) | ((oc[0] & 0x07) << 2));
  479. output += (_NXT | (oc[1] & 0x3f));
  480. break;
  481. case 3:
  482. output += (_SEQ3 | ((oc[0] & 0xf0) >> 4));
  483. output += (_NXT | (oc[1] >> 6) | ((oc[0] & 0x0f) << 2));
  484. output += (_NXT | (oc[1] & 0x3f));
  485. break;
  486. case 4:
  487. break;
  488. case 5:
  489. break;
  490. case 6:
  491. break;
  492. }
  493. //Log::Message(LC_CORE, Log::LT_ALWAYS, "Converting...%c(%d) %d -> %d", *w, *w, w - input, output.size());
  494. }
  495. return true;
  496. }
  497. }
  498. }