stringAPI.cpp 72 KB


  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2025 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Gets access to private members by making them public for the whole module
  24. #define DSR_INTERNAL_ACCESS
  25. #include <iostream>
  26. #include <sstream>
  27. #include <fstream>
  28. #include <streambuf>
  29. #include <thread>
  30. #include <mutex>
  31. #include <stdexcept>
  32. #include <cmath>
  33. #include "stringAPI.h"
  34. #include "../api/fileAPI.h"
  35. #include "../settings.h"
  36. using namespace dsr;
  37. // The print buffer keeps its buffer size from previous printing to avoid reallocating memory every time something is printed.
  38. // It is stored separatelly for each calling thread to avoid conflicts.
  39. static thread_local String printBuffer;
  40. String &dsr::string_getPrintBuffer() {
  41. return printBuffer;
  42. }
  43. static void atomic_append_ascii(String &target, const char* source);
  44. static void atomic_append_readable(String &target, const ReadableString& source);
  45. static void atomic_append_utf32(String &target, const DsrChar* source);
  46. static intptr_t strlen_utf32(const DsrChar *content) {
  47. intptr_t length = 0;
  48. while (content[length] != 0) {
  49. length++;
  50. }
  51. return length;
  52. }
  53. static char toAscii(DsrChar c) {
  54. if (c > 127) {
  55. return '?';
  56. } else {
  57. return c;
  58. }
  59. }
  60. ReadableString::ReadableString(const DsrChar *content)
  61. : view(content, strlen_utf32(content)) {}
  62. String::String() {}
  63. #ifndef BAN_IMPLICIT_ASCII_CONVERSION
  64. String::String(const char* source) { atomic_append_ascii(*this, source); }
  65. #endif
  66. String::String(const DsrChar* source) { atomic_append_utf32(*this, source); }
  67. String dsr::string_fromAscii(const char *text) {
  68. String result;
  69. atomic_append_ascii(result, text);
  70. return result;
  71. }
  72. String& Printable::toStream(String& target) const {
  73. return this->toStreamIndented(target, U"");
  74. }
  75. String Printable::toStringIndented(const ReadableString& indentation) const {
  76. String result;
  77. this->toStreamIndented(result, indentation);
  78. return result;
  79. }
  80. String Printable::toString() const {
  81. return this->toStringIndented(U"");
  82. }
  83. Printable::~Printable() {}
  84. /*
  85. Code generator used to create character transforming functions from arbitrary reference functions.
  86. Paste the result into functions that provide character, odd and even before the returning if statement begins.
  87. static void generateCharacterRange(String &result, DsrChar firstIn, DsrChar lastIn, int64_t stride, int64_t offset) {
  88. DsrChar firstOut = firstIn + offset;
  89. DsrChar lastOut = lastIn + offset;
  90. if (string_length(result) == 0) {
  91. string_append(result, U" ");
  92. } else {
  93. string_append(result, U" } else ");
  94. }
  95. if (firstIn == lastIn) {
  96. string_append(result, U"if (character == U'", firstIn, "') { // ", firstIn, U" (", (uint32_t)firstIn, U")\n");
  97. string_append(result, U" return U'", firstOut, "'; // ", firstOut, U" (", (uint32_t)firstOut, U")\n");
  98. } else {
  99. string_append(result, U"if (U'", firstIn, "' <= character && character <= U'", lastIn, U"'");
  100. if (stride == 2) {
  101. if (firstIn & DsrChar(1)) {
  102. // Odd interval
  103. string_append(result, U" && odd");
  104. } else {
  105. // Even interval
  106. string_append(result, U" && even");
  107. }
  108. } else if (stride != 1) {
  109. throwError(U"Unsupported stride ", stride, U"!\n");
  110. }
  111. string_append(result, U") { // ", firstIn, U" (", (uint32_t)firstIn, U") to ", lastIn, U" (", (uint32_t)lastIn, U")\n");
  112. if (firstOut > firstIn) {
  113. string_append(result, U" return character + ", offset, ";");
  114. } else if (firstOut < firstIn) {
  115. string_append(result, U" return character - ", -offset, ";");
  116. }
  117. string_append(result, U"// ", firstOut, U" (", (uint32_t)firstOut, U") to ", lastOut, U" (", (uint32_t)lastOut, U")\n");
  118. }
  119. }
  120. // Pre-condition: The transform function must change at least one character.
  121. static String generateCharacterMapping(std::function<DsrChar(const DsrChar character)> transform, DsrChar first, DsrChar last) {
  122. String result;
  123. int64_t rangeStart = -1;
  124. int64_t rangeEnd = -1;
  125. int64_t lastOffset = -1;
  126. int64_t currentStride = -1;
  127. for (int64_t c = first; c <= last; c++) {
  128. int64_t t = transform(c);
  129. if (c != t) {
  130. int64_t offset = int64_t(t) - int64_t(c);
  131. int64_t step = c - rangeEnd;
  132. // Check if we should break apart the previous range.
  133. if ((currentStride != -1 && step != currentStride)
  134. || step > 2
  135. || (lastOffset != -1 && offset != lastOffset)) {
  136. if (rangeStart != -1) {
  137. generateCharacterRange(result, rangeStart, rangeEnd, currentStride, lastOffset);
  138. }
  139. rangeStart = c;
  140. rangeEnd = c;
  141. lastOffset = offset;
  142. currentStride = -1;
  143. } else {
  144. rangeEnd = c;
  145. lastOffset = offset;
  146. currentStride = step;
  147. }
  148. }
  149. }
  150. // Generate the last range, while assuming that we have at least one character to modify.
  151. if (rangeStart != -1) {
  152. generateCharacterRange(result, rangeStart, rangeEnd, currentStride, lastOffset);
  153. }
  154. string_append(result, U" } else {\n");
  155. string_append(result, U" return character;\n");
  156. string_append(result, U" }\n");
  157. return result;
  158. }
  159. */
  160. DsrChar dsr::character_upperCase(DsrChar character) {
  161. if (character < 256) {
  162. if (U'a' <= character && character <= U'z') { // a (97) to z (122)
  163. return character - 32; // A (65) to Z (90)
  164. } else if (character == U'ß') { // ß (223)
  165. return U'ẞ'; // ẞ (7838)
  166. } else if (U'à' <= character && character <= U'ö') { // à (224) to ö (246)
  167. return character - 32; // À (192) to Ö (214)
  168. } else if (U'ø' <= character && character <= U'þ') { // ø (248) to þ (254)
  169. return character - 32; // Ø (216) to Þ (222)
  170. } else if (character == U'ÿ') { // ÿ (255)
  171. return U'Ÿ'; // Ÿ (376)
  172. } else {
  173. return character;
  174. }
  175. } else {
  176. bool odd = character & DsrChar(1);
  177. bool even = !odd;
  178. if (U'ā' <= character && character <= U'ķ' && odd) { // ā (257) to ķ (311)
  179. return character - 1; // Ā (256) to Ķ (310)
  180. } else if (U'ĺ' <= character && character <= U'ň' && even) { // ĺ (314) to ň (328)
  181. return character - 1; // Ĺ (313) to Ň (327)
  182. } else if (U'ŋ' <= character && character <= U'ŷ' && odd) { // ŋ (331) to ŷ (375)
  183. return character - 1; // Ŋ (330) toŶ (374)
  184. } else if (U'ź' <= character && character <= U'ž' && even) { // ź (378) to ž (382)
  185. return character - 1; // Ź (377) to Ž (381)
  186. } else if (character == U'ƀ') { // ƀ (384)
  187. return U'Ƀ'; // Ƀ (579)
  188. } else if (character == U'ƃ') { // ƃ (387)
  189. return U'Ƃ'; // Ƃ (386)
  190. } else if (character == U'ƅ') { // ƅ (389)
  191. return U'Ƅ'; // Ƅ (388)
  192. } else if (character == U'ƈ') { // ƈ (392)
  193. return U'Ƈ'; // Ƈ (391)
  194. } else if (character == U'ƌ') { // ƌ (396)
  195. return U'Ƌ'; // Ƌ (395)
  196. } else if (character == U'ƒ') { // ƒ (402)
  197. return U'Ƒ'; // Ƒ (401)
  198. } else if (character == U'ƙ') { // ƙ (409)
  199. return U'Ƙ'; // Ƙ (408)
  200. } else if (character == U'ƚ') { // ƚ (410)
  201. return U'Ƚ'; // Ƚ (573)
  202. } else if (character == U'ƞ') { // ƞ (414)
  203. return U'Ƞ'; // Ƞ (544)
  204. } else if (character == U'ơ') { // ơ (417)
  205. return U'Ơ'; // Ơ (416)
  206. } else if (character == U'ƣ') { // ƣ (419)
  207. return U'Ƣ'; // Ƣ (418)
  208. } else if (character == U'ƥ') { // ƥ (421)
  209. return U'Ƥ'; // Ƥ (420)
  210. } else if (character == U'ƨ') { // ƨ (424)
  211. return U'Ƨ'; // Ƨ (423)
  212. } else if (character == U'Ʃ') { // Ʃ (425)
  213. return U'ʃ'; // ʃ (643)
  214. } else if (character == U'ƭ') { // ƭ (429)
  215. return U'Ƭ'; // Ƭ (428)
  216. } else if (character == U'ư') { // ư (432)
  217. return U'Ư'; // Ư (431)
  218. } else if (character == U'ƴ') { // ƴ (436)
  219. return U'Ƴ'; // Ƴ (435)
  220. } else if (character == U'ƶ') { // ƶ (438)
  221. return U'Ƶ'; // Ƶ (437)
  222. } else if (character == U'ƹ') { // ƹ (441)
  223. return U'Ƹ'; // Ƹ (440)
  224. } else if (character == U'ƽ') { // ƽ (445)
  225. return U'Ƽ'; // Ƽ (444)
  226. } else if (character == U'ƿ') { // ƿ (447)
  227. return U'Ƿ'; // Ƿ (503)
  228. } else if (character == U'Dž') { // Dž (453)
  229. return U'DŽ'; // DŽ (452)
  230. } else if (character == U'dž') { // dž (454)
  231. return U'DŽ'; // DŽ (452)
  232. } else if (character == U'Lj') { // Lj (456)
  233. return U'LJ'; // LJ (455)
  234. } else if (character == U'lj') { // lj (457)
  235. return U'LJ'; // LJ (455)
  236. } else if (character == U'Nj') { // Nj (459)
  237. return U'NJ'; // NJ (458)
  238. } else if (character == U'nj') { // nj (460)
  239. return U'NJ'; // NJ (458)
  240. } else if (U'ǎ' <= character && character <= U'ǜ' && even) { // ǎ (462) to ǜ (476)
  241. return character - 1; // Ǎ (461) to Ǜ (475)
  242. } else if (U'ǟ' <= character && character <= U'ǯ' && odd) { // ǟ (479) to ǯ (495)
  243. return character - 1; // Ǟ (478) to Ǯ (494)
  244. } else if (character == U'Dz') { // Dz (498)
  245. return U'DZ'; // DZ (497)
  246. } else if (character == U'dz') { // dz (499)
  247. return U'DZ'; // DZ (497)
  248. } else if (character == U'ǵ') { // ǵ (501)
  249. return U'Ǵ'; // Ǵ (500)
  250. } else if (U'ǹ' <= character && character <= U'ȟ' && odd) { // ǹ (505) to ȟ (543)
  251. return character - 1;// Ǹ (504) to Ȟ (542)
  252. } else if (U'ȣ' <= character && character <= U'ȳ' && odd) { // ȣ (547) to ȳ (563)
  253. return character - 1;// Ȣ (546) to Ȳ (562)
  254. } else if (character == U'ȼ') { // ȼ (572)
  255. return U'Ȼ'; // Ȼ (571)
  256. } else if (U'ȿ' <= character && character <= U'ɀ') { // ȿ (575) to ɀ (576)
  257. return character + 10815;// Ȿ (11390) to Ɀ (11391)
  258. } else if (character == U'ɂ') { // ɂ (578)
  259. return U'Ɂ'; // Ɂ (577)
  260. } else if (U'ɇ' <= character && character <= U'ɏ' && odd) { // ɇ (583) to ɏ (591)
  261. return character - 1;// Ɇ (582) to Ɏ (590)
  262. } else if (character == U'ɐ') { // ɐ (592)
  263. return U'Ɐ'; // Ɐ (11375)
  264. } else if (character == U'ɑ') { // ɑ (593)
  265. return U'Ɑ'; // Ɑ (11373)
  266. } else if (character == U'ɒ') { // ɒ (594)
  267. return U'Ɒ'; // Ɒ (11376)
  268. } else if (character == U'ɓ') { // ɓ (595)
  269. return U'Ɓ'; // Ɓ (385)
  270. } else if (character == U'ɔ') { // ɔ (596)
  271. return U'Ɔ'; // Ɔ (390)
  272. } else if (U'ɖ' <= character && character <= U'ɗ') { // ɖ (598) to ɗ (599)
  273. return character - 205;// Ɖ (393) to Ɗ (394)
  274. } else if (U'ɘ' <= character && character <= U'ə') { // ɘ (600) to ə (601)
  275. return character - 202;// Ǝ (398) to Ə (399)
  276. } else if (character == U'ɛ') { // ɛ (603)
  277. return U'Ɛ'; // Ɛ (400)
  278. } else if (character == U'ɠ') { // ɠ (608)
  279. return U'Ɠ'; // Ɠ (403)
  280. } else if (character == U'ɣ') { // ɣ (611)
  281. return U'Ɣ'; // Ɣ (404)
  282. } else if (character == U'ɥ') { // ɥ (613)
  283. return U'Ɥ'; // Ɥ (42893)
  284. } else if (character == U'ɨ') { // ɨ (616)
  285. return U'Ɨ'; // Ɨ (407)
  286. } else if (character == U'ɩ') { // ɩ (617)
  287. return U'Ɩ'; // Ɩ (406)
  288. } else if (character == U'ɪ') { // ɪ (618)
  289. return U'Ɪ'; // Ɪ (42926)
  290. } else if (character == U'ɯ') { // ɯ (623)
  291. return U'Ɯ'; // Ɯ (412)
  292. } else if (character == U'ɱ') { // ɱ (625)
  293. return U'Ɱ'; // Ɱ (11374)
  294. } else if (character == U'ɲ') { // ɲ (626)
  295. return U'Ɲ'; // Ɲ (413)
  296. } else if (character == U'ɵ') { // ɵ (629)
  297. return U'Ɵ'; // Ɵ (415)
  298. } else if (character == U'ɽ') { // ɽ (637)
  299. return U'Ɽ'; // Ɽ (11364)
  300. } else if (character == U'ʀ') { // ʀ (640)
  301. return U'Ʀ'; // Ʀ (422)
  302. } else if (character == U'ʈ') { // ʈ (648)
  303. return U'Ʈ'; // Ʈ (430)
  304. } else if (character == U'ʉ') { // ʉ (649)
  305. return U'Ʉ'; // Ʉ (580)
  306. } else if (U'ʊ' <= character && character <= U'ʋ') { // ʊ (650) to ʋ (651)
  307. return character - 217;// Ʊ (433) to Ʋ (434)
  308. } else if (character == U'ʌ') { // ʌ (652)
  309. return U'Ʌ'; // Ʌ (581)
  310. } else if (character == U'ʒ') { // ʒ (658)
  311. return U'Ʒ'; // Ʒ (439)
  312. } else if (character == U'ʔ') { // ʔ (660)
  313. return U'ˀ'; // ˀ (704)
  314. } else if (character == U'ά') { // ά (940)
  315. return U'Ά'; // Ά (902)
  316. } else if (U'έ' <= character && character <= U'ί') { // έ (941) to ί (943)
  317. return character - 37;// Έ (904) to Ί (906)
  318. } else if (U'α' <= character && character <= U'ρ') { // α (945) to ρ (961)
  319. return character - 32;// Α (913) to Ρ (929)
  320. } else if (U'σ' <= character && character <= U'ϋ') { // σ (963) to ϋ (971)
  321. return character - 32;// Σ (931) to Ϋ (939)
  322. } else if (character == U'ό') { // ό (972)
  323. return U'Ό'; // Ό (908)
  324. } else if (U'ύ' <= character && character <= U'ώ') { // ύ (973) to ώ (974)
  325. return character - 63;// Ύ (910) to Ώ (911)
  326. } else if (U'ϣ' <= character && character <= U'ϯ' && odd) { // ϣ (995) to ϯ (1007)
  327. return character - 1;// Ϣ (994) to Ϯ (1006)
  328. } else if (U'а' <= character && character <= U'я') { // а (1072) to я (1103)
  329. return character - 32;// А (1040) to Я (1071)
  330. } else if (U'ё' <= character && character <= U'ќ') { // ё (1105) to ќ (1116)
  331. return character - 80;// Ё (1025) to Ќ (1036)
  332. } else if (U'ў' <= character && character <= U'џ') { // ў (1118) to џ (1119)
  333. return character - 80;// Ў (1038) to Џ (1039)
  334. } else if (U'ѡ' <= character && character <= U'ҁ' && odd) { // ѡ (1121) to ҁ (1153)
  335. return character - 1;// Ѡ (1120) to Ҁ (1152)
  336. } else if (U'ґ' <= character && character <= U'ҿ' && odd) { // ґ (1169) to ҿ (1215)
  337. return character - 1;// Ґ (1168) to Ҿ (1214)
  338. } else if (U'ӂ' <= character && character <= U'ӄ' && even) { // ӂ (1218) to ӄ (1220)
  339. return character - 1;// Ӂ (1217) to Ӄ (1219)
  340. } else if (character == U'ӈ') { // ӈ (1224)
  341. return U'Ӈ'; // Ӈ (1223)
  342. } else if (character == U'ӌ') { // ӌ (1228)
  343. return U'Ӌ'; // Ӌ (1227)
  344. } else if (U'ӑ' <= character && character <= U'ӫ' && odd) { // ӑ (1233) to ӫ (1259)
  345. return character - 1;// Ӑ (1232) to Ӫ (1258)
  346. } else if (U'ӯ' <= character && character <= U'ӵ' && odd) { // ӯ (1263) to ӵ (1269)
  347. return character - 1;// Ӯ (1262) to Ӵ (1268)
  348. } else if (character == U'ӹ') { // ӹ (1273)
  349. return U'Ӹ'; // Ӹ (1272)
  350. } else if (U'ա' <= character && character <= U'ֆ') { // ա (1377) to ֆ (1414)
  351. return character - 48;// Ա (1329) to Ֆ (1366)
  352. } else if (U'ა' <= character && character <= U'ჵ') { // ა (4304) to ჵ (4341)
  353. return character - 48;// Ⴀ (4256) to Ⴥ (4293)
  354. } else if (U'ḁ' <= character && character <= U'ẕ' && odd) { // ḁ (7681) to ẕ (7829)
  355. return character - 1;// Ḁ (7680) to Ẕ (7828)
  356. } else if (U'ạ' <= character && character <= U'ỹ' && odd) { // ạ (7841) to ỹ (7929)
  357. return character - 1;// Ạ (7840) to Ỹ (7928)
  358. } else if (U'ἀ' <= character && character <= U'ἇ') { // ἀ (7936) to ἇ (7943)
  359. return character + 8;// Ἀ (7944) to Ἇ (7951)
  360. } else if (U'ἐ' <= character && character <= U'ἕ') { // ἐ (7952) to ἕ (7957)
  361. return character + 8;// Ἐ (7960) to Ἕ (7965)
  362. } else if (U'ἠ' <= character && character <= U'ἧ') { // ἠ (7968) to ἧ (7975)
  363. return character + 8;// Ἠ (7976) to Ἧ (7983)
  364. } else if (U'ἰ' <= character && character <= U'ἷ') { // ἰ (7984) to ἷ (7991)
  365. return character + 8;// Ἰ (7992) to Ἷ (7999)
  366. } else if (U'ὀ' <= character && character <= U'ὅ') { // ὀ (8000) to ὅ (8005)
  367. return character + 8;// Ὀ (8008) to Ὅ (8013)
  368. } else if (U'ὑ' <= character && character <= U'ὗ' && odd) { // ὑ (8017) to ὗ (8023)
  369. return character + 8;// Ὑ (8025) to Ὗ (8031)
  370. } else if (U'ὠ' <= character && character <= U'ὧ') { // ὠ (8032) to ὧ (8039)
  371. return character + 8;// Ὠ (8040) to Ὧ (8047)
  372. } else if (U'ᾀ' <= character && character <= U'ᾇ') { // ᾀ (8064) to ᾇ (8071)
  373. return character + 8;// ᾈ (8072) to ᾏ (8079)
  374. } else if (U'ᾐ' <= character && character <= U'ᾗ') { // ᾐ (8080) to ᾗ (8087)
  375. return character + 8;// ᾘ (8088) to ᾟ (8095)
  376. } else if (U'ᾠ' <= character && character <= U'ᾧ') { // ᾠ (8096) to ᾧ (8103)
  377. return character + 8;// ᾨ (8104) to ᾯ (8111)
  378. } else if (U'ᾰ' <= character && character <= U'ᾱ') { // ᾰ (8112) to ᾱ (8113)
  379. return character + 8;// Ᾰ (8120) to Ᾱ (8121)
  380. } else if (U'ῐ' <= character && character <= U'ῑ') { // ῐ (8144) to ῑ (8145)
  381. return character + 8;// Ῐ (8152) to Ῑ (8153)
  382. } else if (U'ῠ' <= character && character <= U'ῡ') { // ῠ (8160) to ῡ (8161)
  383. return character + 8;// Ῠ (8168) to Ῡ (8169)
  384. } else if (U'ⓐ' <= character && character <= U'ⓩ') { // ⓐ (9424) to ⓩ (9449)
  385. return character - 26;// Ⓐ (9398) to Ⓩ (9423)
  386. } else if (U'a' <= character && character <= U'z') { // a (65345) to z (65370)
  387. return character - 32;// A (65313) to Z (65338)
  388. } else {
  389. return character;
  390. }
  391. }
  392. }
  393. DsrChar dsr::character_lowerCase(DsrChar character) {
  394. if (character < 256) {
  395. if (U'A' <= character && character <= U'Z') { // A (65) to Z (90)
  396. return character + 32; // a (97) to z (122)
  397. } else if (U'À' <= character && character <= U'Ö') { // À (192) to Ö (214)
  398. return character + 32; // à (224) to ö (246)
  399. } else if (U'Ø' <= character && character <= U'Þ') { // Ø (216) to Þ (222)
  400. return character + 32; // ø (248) to þ (254)
  401. } else {
  402. return character;
  403. }
  404. } else {
  405. bool odd = character & DsrChar(1);
  406. bool even = !odd;
  407. if (U'Ā' <= character && character <= U'Ķ' && even) { // Ā (256) to Ķ (310)
  408. return character + 1; // ā (257) to ķ (311)
  409. } else if (U'Ĺ' <= character && character <= U'Ň' && odd) { // Ĺ (313) to Ň (327)
  410. return character + 1; // ĺ (314) to ň (328)
  411. } else if (U'Ŋ' <= character && character <= U'Ŷ' && even) { // Ŋ (330) to Ŷ (374)
  412. return character + 1; // ŋ (331) to ŷ (375)
  413. } else if (character == U'Ÿ') { // Ÿ (376)
  414. return U'ÿ'; // ÿ (255)
  415. } else if (character == U'Ź') { // Ź (377)
  416. return U'ź'; // ź (378)
  417. } else if (character == U'Ż') { // Ż (379)
  418. return U'ż'; // ż (380)
  419. } else if (character == U'Ž') { // Ž (381)
  420. return U'ž'; // ž (382)
  421. } else if (character == U'Ɓ') { // Ɓ (385)
  422. return U'ɓ'; // ɓ (595)
  423. } else if (character == U'Ƃ') { // Ƃ (386)
  424. return U'ƃ'; // ƃ (387)
  425. } else if (character == U'Ƅ') { // Ƅ (388)
  426. return U'ƅ'; // ƅ (389)
  427. } else if (character == U'Ɔ') { // Ɔ (390)
  428. return U'ɔ'; // ɔ (596)
  429. } else if (character == U'Ƈ') { // Ƈ (391)
  430. return U'ƈ'; // ƈ (392)
  431. } else if (character == U'Ɖ') { // Ɖ (393)
  432. return U'ɖ'; // ɖ (598)
  433. } else if (character == U'Ɗ') { // Ɗ (394)
  434. return U'ɗ'; // ɗ (599)
  435. } else if (character == U'Ƌ') { // Ƌ (395)
  436. return U'ƌ'; // ƌ (396)
  437. } else if (character == U'Ǝ') { // Ǝ (398)
  438. return U'ɘ'; // ɘ (600)
  439. } else if (character == U'Ə') { // Ə (399)
  440. return U'ə'; // ə (601)
  441. } else if (character == U'Ɛ') { // Ɛ (400)
  442. return U'ɛ'; // ɛ (603)
  443. } else if (character == U'Ƒ') { // Ƒ (401)
  444. return U'ƒ'; // ƒ (402)
  445. } else if (character == U'Ɠ') { // Ɠ (403)
  446. return U'ɠ'; // ɠ (608)
  447. } else if (character == U'Ɣ') { // Ɣ (404)
  448. return U'ɣ'; // ɣ (611)
  449. } else if (character == U'Ɩ') { // Ɩ (406)
  450. return U'ɩ'; // ɩ (617)
  451. } else if (character == U'Ɨ') { // Ɨ (407)
  452. return U'ɨ'; // ɨ (616)
  453. } else if (character == U'Ƙ') { // Ƙ (408)
  454. return U'ƙ'; // ƙ (409)
  455. } else if (character == U'Ɯ') { // Ɯ (412)
  456. return U'ɯ'; // ɯ (623)
  457. } else if (character == U'Ɲ') { // Ɲ (413)
  458. return U'ɲ'; // ɲ (626)
  459. } else if (character == U'Ɵ') { // Ɵ (415)
  460. return U'ɵ'; // ɵ (629)
  461. } else if (character == U'Ơ') { // Ơ (416)
  462. return U'ơ'; // ơ (417)
  463. } else if (character == U'Ƣ') { // Ƣ (418)
  464. return U'ƣ'; // ƣ (419)
  465. } else if (character == U'Ƥ') { // Ƥ (420)
  466. return U'ƥ'; // ƥ (421)
  467. } else if (character == U'Ʀ') { // Ʀ (422)
  468. return U'ʀ'; // ʀ (640)
  469. } else if (character == U'Ƨ') { // Ƨ (423)
  470. return U'ƨ'; // ƨ (424)
  471. } else if (character == U'Ƭ') { // Ƭ (428)
  472. return U'ƭ'; // ƭ (429)
  473. } else if (character == U'Ʈ') { // Ʈ (430)
  474. return U'ʈ'; // ʈ (648)
  475. } else if (character == U'Ư') { // Ư (431)
  476. return U'ư'; // ư (432)
  477. } else if (character == U'Ʊ') { // Ʊ (433)
  478. return U'ʊ'; // ʊ (650)
  479. } else if (character == U'Ʋ') { // Ʋ (434)
  480. return U'ʋ'; // ʋ (651)
  481. } else if (character == U'Ƴ') { // Ƴ (435)
  482. return U'ƴ'; // ƴ (436)
  483. } else if (character == U'Ƶ') { // Ƶ (437)
  484. return U'ƶ'; // ƶ (438)
  485. } else if (character == U'Ʒ') { // Ʒ (439)
  486. return U'ʒ'; // ʒ (658)
  487. } else if (character == U'Ƹ') { // Ƹ (440)
  488. return U'ƹ'; // ƹ (441)
  489. } else if (character == U'Ƽ') { // Ƽ (444)
  490. return U'ƽ'; // ƽ (445)
  491. } else if (character == U'DŽ') { // DŽ (452)
  492. return U'dž'; // dž (454)
  493. } else if (character == U'Dž') { // Dž (453)
  494. return U'dž'; // dž (454)
  495. } else if (character == U'LJ') { // LJ (455)
  496. return U'lj'; // lj (457)
  497. } else if (character == U'Lj') { // Lj (456)
  498. return U'lj'; // lj (457)
  499. } else if (character == U'NJ') { // NJ (458)
  500. return U'nj'; // nj (460)
  501. } else if (U'Nj' <= character && character <= U'Ǜ' && odd) { // Nj (459) to Ǜ (475)
  502. return character + 1; // nj (460) to ǜ (476)
  503. } else if (U'Ǟ' <= character && character <= U'Ǯ' && even) { // Ǟ (478) to Ǯ (494)
  504. return character + 1; // ǟ (479) to ǯ (495)
  505. } else if (character == U'DZ') { // DZ (497)
  506. return U'dz'; // dz (499)
  507. } else if (character == U'Dz') { // Dz (498)
  508. return U'dz'; // dz (499)
  509. } else if (character == U'Ǵ') { // Ǵ (500)
  510. return U'ǵ'; // ǵ (501)
  511. } else if (character == U'Ƿ') { // Ƿ (503)
  512. return U'ƿ'; // ƿ (447)
  513. } else if (U'Ǹ' <= character && character <= U'Ȟ' && even) { // Ǹ (504) to Ȟ (542)
  514. return character + 1; // ǹ (505) to ȟ (543)
  515. } else if (character == U'Ƞ') { // Ƞ (544)
  516. return U'ƞ'; // ƞ (414)
  517. } else if (U'Ȣ' <= character && character <= U'Ȳ' && even) { // Ȣ (546) to Ȳ (562)
  518. return character + 1; // ȣ (547) to ȳ (563)
  519. } else if (character == U'Ȼ') { // Ȼ (571)
  520. return U'ȼ'; // ȼ (572)
  521. } else if (character == U'Ƚ') { // Ƚ (573)
  522. return U'ƚ'; // ƚ (410)
  523. } else if (character == U'Ɂ') { // Ɂ (577)
  524. return U'ɂ'; // ɂ (578)
  525. } else if (character == U'Ƀ') { // Ƀ (579)
  526. return U'ƀ'; // ƀ (384)
  527. } else if (character == U'Ʉ') { // Ʉ (580)
  528. return U'ʉ'; // ʉ (649)
  529. } else if (character == U'Ʌ') { // Ʌ (581)
  530. return U'ʌ'; // ʌ (652)
  531. } else if (U'Ɇ' <= character && character <= U'Ɏ' && even) { // Ɇ (582) to Ɏ (590)
  532. return character + 1;// ɇ (583) to ɏ (591)
  533. } else if (character == U'ʃ') { // ʃ (643)
  534. return U'Ʃ'; // Ʃ (425)
  535. } else if (character == U'ˀ') { // ˀ (704)
  536. return U'ʔ'; // ʔ (660)
  537. } else if (character == U'Ά') { // Ά (902)
  538. return U'ά'; // ά (940)
  539. } else if (U'Έ' <= character && character <= U'Ί') { // Έ (904) to Ί (906)
  540. return character + 37;// έ (941) to ί (943)
  541. } else if (character == U'Ό') { // Ό (908)
  542. return U'ό'; // ό (972)
  543. } else if (U'Ύ' <= character && character <= U'Ώ') { // Ύ (910) to Ώ (911)
  544. return character + 63;// ύ (973) to ώ (974)
  545. } else if (U'Α' <= character && character <= U'Ρ') { // Α (913) to Ρ (929)
  546. return character + 32;// α (945) to ρ (961)
  547. } else if (U'Σ' <= character && character <= U'Ϋ') { // Σ (931) to Ϋ (939)
  548. return character + 32;// σ (963) to ϋ (971)
  549. } else if (U'Ϣ' <= character && character <= U'Ϯ' && even) { // Ϣ (994) to Ϯ (1006)
  550. return character + 1;// ϣ (995) to ϯ (1007)
  551. } else if (U'Ё' <= character && character <= U'Ќ') { // Ё (1025) to Ќ (1036)
  552. return character + 80;// ё (1105) to ќ (1116)
  553. } else if (U'Ў' <= character && character <= U'Џ') { // Ў (1038) to Џ (1039)
  554. return character + 80;// ў (1118) to џ (1119)
  555. } else if (U'А' <= character && character <= U'Я') { // А (1040) to Я (1071)
  556. return character + 32;// а (1072) to я (1103)
  557. } else if (U'Ѡ' <= character && character <= U'Ҁ' && even) { // Ѡ (1120) to Ҁ (1152)
  558. return character + 1;// ѡ (1121) to ҁ (1153)
  559. } else if (U'Ґ' <= character && character <= U'Ҿ' && even) { // Ґ (1168) to Ҿ (1214)
  560. return character + 1;// ґ (1169) to ҿ (1215)
  561. } else if (U'Ӂ' <= character && character <= U'Ӄ' && odd) { // Ӂ (1217) to Ӄ (1219)
  562. return character + 1;// ӂ (1218) to ӄ (1220)
  563. } else if (character == U'Ӈ') { // Ӈ (1223)
  564. return U'ӈ'; // ӈ (1224)
  565. } else if (character == U'Ӌ') { // Ӌ (1227)
  566. return U'ӌ'; // ӌ (1228)
  567. } else if (U'Ӑ' <= character && character <= U'Ӫ' && even) { // Ӑ (1232) to Ӫ (1258)
  568. return character + 1;// ӑ (1233) to ӫ (1259)
  569. } else if (U'Ӯ' <= character && character <= U'Ӵ' && even) { // Ӯ (1262) to Ӵ (1268)
  570. return character + 1;// ӯ (1263) to ӵ (1269)
  571. } else if (character == U'Ӹ') { // Ӹ (1272)
  572. return U'ӹ'; // ӹ (1273)
  573. } else if (U'Ա' <= character && character <= U'Ֆ') { // Ա (1329) to Ֆ (1366)
  574. return character + 48;// ա (1377) to ֆ (1414)
  575. } else if (U'Ⴀ' <= character && character <= U'Ⴥ') { // Ⴀ (4256) to Ⴥ (4293)
  576. return character + 48;// ა (4304) to ჵ (4341)
  577. } else if (U'Ḁ' <= character && character <= U'Ẕ' && even) { // Ḁ (7680) to Ẕ (7828)
  578. return character + 1;// ḁ (7681) to ẕ (7829)
  579. } else if (character == U'ẞ') { // ẞ (7838)
  580. return U'ß'; // ß (223)
  581. } else if (U'Ạ' <= character && character <= U'Ỹ' && even) { // Ạ (7840) to Ỹ (7928)
  582. return character + 1;// ạ (7841) to ỹ (7929)
  583. } else if (U'Ἀ' <= character && character <= U'Ἇ') { // Ἀ (7944) to Ἇ (7951)
  584. return character - 8;// ἀ (7936) to ἇ (7943)
  585. } else if (U'Ἐ' <= character && character <= U'Ἕ') { // Ἐ (7960) to Ἕ (7965)
  586. return character - 8;// ἐ (7952) to ἕ (7957)
  587. } else if (U'Ἠ' <= character && character <= U'Ἧ') { // Ἠ (7976) to Ἧ (7983)
  588. return character - 8;// ἠ (7968) to ἧ (7975)
  589. } else if (U'Ἰ' <= character && character <= U'Ἷ') { // Ἰ (7992) to Ἷ (7999)
  590. return character - 8;// ἰ (7984) to ἷ (7991)
  591. } else if (U'Ὀ' <= character && character <= U'Ὅ') { // Ὀ (8008) to Ὅ (8013)
  592. return character - 8;// ὀ (8000) to ὅ (8005)
  593. } else if (U'Ὑ' <= character && character <= U'Ὗ' && odd) { // Ὑ (8025) to Ὗ (8031)
  594. return character - 8;// ὑ (8017) to ὗ (8023)
  595. } else if (U'Ὠ' <= character && character <= U'Ὧ') { // Ὠ (8040) to Ὧ (8047)
  596. return character - 8;// ὠ (8032) to ὧ (8039)
  597. } else if (U'ᾈ' <= character && character <= U'ᾏ') { // ᾈ (8072) to ᾏ (8079)
  598. return character - 8;// ᾀ (8064) to ᾇ (8071)
  599. } else if (U'ᾘ' <= character && character <= U'ᾟ') { // ᾘ (8088) to ᾟ (8095)
  600. return character - 8;// ᾐ (8080) to ᾗ (8087)
  601. } else if (U'ᾨ' <= character && character <= U'ᾯ') { // ᾨ (8104) to ᾯ (8111)
  602. return character - 8;// ᾠ (8096) to ᾧ (8103)
  603. } else if (U'Ᾰ' <= character && character <= U'Ᾱ') { // Ᾰ (8120) to Ᾱ (8121)
  604. return character - 8;// ᾰ (8112) to ᾱ (8113)
  605. } else if (U'Ῐ' <= character && character <= U'Ῑ') { // Ῐ (8152) to Ῑ (8153)
  606. return character - 8;// ῐ (8144) to ῑ (8145)
  607. } else if (U'Ῠ' <= character && character <= U'Ῡ') { // Ῠ (8168) to Ῡ (8169)
  608. return character - 8;// ῠ (8160) to ῡ (8161)
  609. } else if (U'Ⓐ' <= character && character <= U'Ⓩ') { // Ⓐ (9398) to Ⓩ (9423)
  610. return character + 26;// ⓐ (9424) to ⓩ (9449)
  611. } else if (character == U'Ɽ') { // Ɽ (11364)
  612. return U'ɽ'; // ɽ (637)
  613. } else if (character == U'Ɑ') { // Ɑ (11373)
  614. return U'ɑ'; // ɑ (593)
  615. } else if (character == U'Ɱ') { // Ɱ (11374)
  616. return U'ɱ'; // ɱ (625)
  617. } else if (character == U'Ɐ') { // Ɐ (11375)
  618. return U'ɐ'; // ɐ (592)
  619. } else if (character == U'Ɒ') { // Ɒ (11376)
  620. return U'ɒ'; // ɒ (594)
  621. } else if (U'Ȿ' <= character && character <= U'Ɀ') { // Ȿ (11390) to Ɀ (11391)
  622. return character - 10815;// ȿ (575) to ɀ (576)
  623. } else if (character == U'Ɥ') { // Ɥ (42893)
  624. return U'ɥ'; // ɥ (613)
  625. } else if (character == U'Ɪ') { // Ɪ (42926)
  626. return U'ɪ'; // ɪ (618)
  627. } else if (U'A' <= character && character <= U'Z') { // A (65313) to Z (65338)
  628. return character + 32;// a (65345) to z (65370)
  629. } else {
  630. return character;
  631. }
  632. }
  633. }
  634. String dsr::string_upperCase(const ReadableString &text) {
  635. String result;
  636. string_reserve(result, text.view.length);
  637. for (intptr_t i = 0; i < text.view.length; i++) {
  638. string_appendChar(result, character_upperCase(text[i]));
  639. }
  640. return result;
  641. }
  642. String dsr::string_lowerCase(const ReadableString &text) {
  643. String result;
  644. string_reserve(result, text.view.length);
  645. for (intptr_t i = 0; i < text.view.length; i++) {
  646. string_appendChar(result, character_lowerCase(text[i]));
  647. }
  648. return result;
  649. }
  650. bool dsr::string_match(const ReadableString& a, const ReadableString& b) {
  651. if (a.view.length != b.view.length) {
  652. return false;
  653. } else {
  654. for (intptr_t i = 0; i < a.view.length; i++) {
  655. if (a[i] != b[i]) {
  656. return false;
  657. }
  658. }
  659. return true;
  660. }
  661. }
  662. bool dsr::string_caseInsensitiveMatch(const ReadableString& a, const ReadableString& b) {
  663. if (a.view.length != b.view.length) {
  664. return false;
  665. } else {
  666. for (intptr_t i = 0; i < a.view.length; i++) {
  667. if (character_upperCase(a[i]) != character_upperCase(b[i])) {
  668. return false;
  669. }
  670. }
  671. return true;
  672. }
  673. }
  674. static intptr_t findFirstNonWhite(const ReadableString &text) {
  675. for (intptr_t i = 0; i < text.view.length; i++) {
  676. DsrChar c = text[i];
  677. if (!character_isWhiteSpace(c)) {
  678. return i;
  679. }
  680. }
  681. return -1;
  682. }
  683. static intptr_t findLastNonWhite(const ReadableString &text) {
  684. for (intptr_t i = text.view.length - 1; i >= 0; i--) {
  685. DsrChar c = text[i];
  686. if (!character_isWhiteSpace(c)) {
  687. return i;
  688. }
  689. }
  690. return -1;
  691. }
  692. // Allow passing literals without allocating heap memory for the result
  693. ReadableString dsr::string_removeOuterWhiteSpace(const ReadableString &text) {
  694. intptr_t first = findFirstNonWhite(text);
  695. intptr_t last = findLastNonWhite(text);
  696. if (first == -1) {
  697. // Only white space
  698. return ReadableString();
  699. } else {
  700. // Subset
  701. return string_inclusiveRange(text, first, last);
  702. }
  703. }
  704. String dsr::string_mangleQuote(const ReadableString &rawText) {
  705. String result;
  706. string_reserve(result, rawText.view.length + 2);
  707. string_appendChar(result, U'\"'); // Begin quote
  708. for (intptr_t i = 0; i < rawText.view.length; i++) {
  709. DsrChar c = rawText[i];
  710. if (c == U'\"') { // Double quote
  711. string_append(result, U"\\\"");
  712. } else if (c == U'\\') { // Backslash
  713. string_append(result, U"\\\\");
  714. } else if (c == U'\a') { // Audible bell
  715. string_append(result, U"\\a");
  716. } else if (c == U'\b') { // Backspace
  717. string_append(result, U"\\b");
  718. } else if (c == U'\f') { // Form feed
  719. string_append(result, U"\\f");
  720. } else if (c == U'\n') { // Line feed
  721. string_append(result, U"\\n");
  722. } else if (c == U'\r') { // Carriage return
  723. string_append(result, U"\\r");
  724. } else if (c == U'\t') { // Horizontal tab
  725. string_append(result, U"\\t");
  726. } else if (c == U'\v') { // Vertical tab
  727. string_append(result, U"\\v");
  728. } else if (c == U'\0') { // Null terminator
  729. string_append(result, U"\\0");
  730. } else {
  731. string_appendChar(result, c);
  732. }
  733. }
  734. string_appendChar(result, U'\"'); // End quote
  735. return result;
  736. }
  737. String dsr::string_unmangleQuote(const ReadableString& mangledText) {
  738. intptr_t firstQuote = string_findFirst(mangledText, '\"');
  739. intptr_t lastQuote = string_findLast(mangledText, '\"');
  740. String result;
  741. if (firstQuote == -1 || lastQuote == -1 || firstQuote == lastQuote) {
  742. throwError(U"Cannot unmangle using string_unmangleQuote without beginning and ending with quote signs!\n", mangledText, U"\n");
  743. } else {
  744. for (intptr_t i = firstQuote + 1; i < lastQuote; i++) {
  745. DsrChar c = mangledText[i];
  746. if (c == U'\\') { // Escape character
  747. DsrChar c2 = mangledText[i + 1];
  748. if (c2 == U'\"') { // Double quote
  749. string_appendChar(result, U'\"');
  750. } else if (c2 == U'\\') { // Back slash
  751. string_appendChar(result, U'\\');
  752. } else if (c2 == U'a') { // Audible bell
  753. string_appendChar(result, U'\a');
  754. } else if (c2 == U'b') { // Backspace
  755. string_appendChar(result, U'\b');
  756. } else if (c2 == U'f') { // Form feed
  757. string_appendChar(result, U'\f');
  758. } else if (c2 == U'n') { // Line feed
  759. string_appendChar(result, U'\n');
  760. } else if (c2 == U'r') { // Carriage return
  761. string_appendChar(result, U'\r');
  762. } else if (c2 == U't') { // Horizontal tab
  763. string_appendChar(result, U'\t');
  764. } else if (c2 == U'v') { // Vertical tab
  765. string_appendChar(result, U'\v');
  766. } else if (c2 == U'0') { // Null terminator
  767. string_appendChar(result, U'\0');
  768. }
  769. i++; // Consume both characters
  770. } else {
  771. // Detect bad input
  772. if (c == U'\"') { // Double quote
  773. throwError(U"Unmangled double quote sign detected in string_unmangleQuote!\n", mangledText, U"\n");
  774. } else if (c == U'\a') { // Audible bell
  775. throwError(U"Unmangled audible bell detected in string_unmangleQuote!\n", mangledText, U"\n");
  776. } else if (c == U'\b') { // Backspace
  777. throwError(U"Unmangled backspace detected in string_unmangleQuote!\n", mangledText, U"\n");
  778. } else if (c == U'\f') { // Form feed
  779. throwError(U"Unmangled form feed detected in string_unmangleQuote!\n", mangledText, U"\n");
  780. } else if (c == U'\n') { // Line feed
  781. throwError(U"Unmangled line feed detected in string_unmangleQuote!\n", mangledText, U"\n");
  782. } else if (c == U'\r') { // Carriage return
  783. throwError(U"Unmangled carriage return detected in string_unmangleQuote!\n", mangledText, U"\n");
  784. } else if (c == U'\0') { // Null terminator
  785. throwError(U"Unmangled null terminator detected in string_unmangleQuote!\n", mangledText, U"\n");
  786. } else {
  787. string_appendChar(result, c);
  788. }
  789. }
  790. }
  791. }
  792. return result;
  793. }
  794. void dsr::string_fromUnsigned(String& target, uint64_t value) {
  795. static const int bufferSize = 20;
  796. DsrChar digits[bufferSize];
  797. int64_t usedSize = 0;
  798. if (value == 0) {
  799. string_appendChar(target, U'0');
  800. } else {
  801. while (usedSize < bufferSize) {
  802. DsrChar digit = U'0' + (value % 10u);
  803. digits[usedSize] = digit;
  804. usedSize++;
  805. value /= 10u;
  806. if (value == 0) {
  807. break;
  808. }
  809. }
  810. while (usedSize > 0) {
  811. usedSize--;
  812. string_appendChar(target, digits[usedSize]);
  813. }
  814. }
  815. }
  816. void dsr::string_fromSigned(String& target, int64_t value, DsrChar negationCharacter) {
  817. if (value >= 0) {
  818. string_fromUnsigned(target, (uint64_t)value);
  819. } else {
  820. string_appendChar(target, negationCharacter);
  821. string_fromUnsigned(target, (uint64_t)(-value));
  822. }
  823. }
  824. static const int MAX_DECIMALS = 16;
  825. static double decimalMultipliers[MAX_DECIMALS] = {
  826. 10.0,
  827. 100.0,
  828. 1000.0,
  829. 10000.0,
  830. 100000.0,
  831. 1000000.0,
  832. 10000000.0,
  833. 100000000.0,
  834. 1000000000.0,
  835. 10000000000.0,
  836. 100000000000.0,
  837. 1000000000000.0,
  838. 10000000000000.0,
  839. 100000000000000.0,
  840. 1000000000000000.0,
  841. 10000000000000000.0
  842. };
  843. static double roundingOffsets[MAX_DECIMALS] = {
  844. 0.05,
  845. 0.005,
  846. 0.0005,
  847. 0.00005,
  848. 0.000005,
  849. 0.0000005,
  850. 0.00000005,
  851. 0.000000005,
  852. 0.0000000005,
  853. 0.00000000005,
  854. 0.000000000005,
  855. 0.0000000000005,
  856. 0.00000000000005,
  857. 0.000000000000005,
  858. 0.0000000000000005,
  859. 0.00000000000000005
  860. };
  861. static uint64_t decimalLimits[MAX_DECIMALS] = {
  862. 9,
  863. 99,
  864. 999,
  865. 9999,
  866. 99999,
  867. 999999,
  868. 9999999,
  869. 99999999,
  870. 999999999,
  871. 9999999999,
  872. 99999999999,
  873. 999999999999,
  874. 9999999999999,
  875. 99999999999999,
  876. 999999999999999,
  877. 9999999999999999
  878. };
  879. void dsr::string_fromDouble(String& target, double value, int decimalCount, bool removeTrailingZeroes, DsrChar decimalCharacter, DsrChar negationCharacter) {
  880. if (decimalCount < 1) decimalCount = 1;
  881. if (decimalCount > MAX_DECIMALS) decimalCount = MAX_DECIMALS;
  882. double remainder = value;
  883. // Get negation
  884. if (remainder < 0.0) {
  885. string_appendChar(target, negationCharacter);
  886. remainder = -remainder;
  887. }
  888. // Apply an offset to make the following truncation round to the closest printable decimal.
  889. int offsetIndex = decimalCount - 1;
  890. remainder += roundingOffsets[offsetIndex];
  891. // Get whole part
  892. uint64_t whole = (uint64_t)remainder;
  893. string_fromUnsigned(target, whole);
  894. // Remove the whole part from the remainder.
  895. remainder = remainder - whole;
  896. // Print the decimal
  897. string_appendChar(target, decimalCharacter);
  898. // Get decimals
  899. uint64_t scaledDecimals = uint64_t(remainder * decimalMultipliers[offsetIndex]);
  900. // Limit decimals to all nines prevent losing a whole unit from fraction overflow.
  901. uint64_t limit = decimalLimits[offsetIndex];
  902. if (scaledDecimals > limit) scaledDecimals = limit;
  903. DsrChar digits[MAX_DECIMALS]; // Using 0 to decimalCount - 1
  904. int writeIndex = decimalCount - 1;
  905. for (int d = 0; d < decimalCount; d++) {
  906. int digit = scaledDecimals % 10;
  907. digits[writeIndex] = U'0' + digit;
  908. scaledDecimals = scaledDecimals / 10;
  909. writeIndex--;
  910. }
  911. if (removeTrailingZeroes) {
  912. // Find the last non-zero decimal, but keep at least one zero.
  913. int lastValue = 0;
  914. for (int d = 0; d < decimalCount; d++) {
  915. if (digits[d] != U'0') lastValue = d;
  916. }
  917. // Print until the last value or the only zero.
  918. for (int d = 0; d <= lastValue; d++) {
  919. string_appendChar(target, digits[d]);
  920. }
  921. } else {
  922. // Print fixed decimals.
  923. for (int d = 0; d < decimalCount; d++) {
  924. string_appendChar(target, digits[d]);
  925. }
  926. }
  927. }
  928. #define TO_RAW_ASCII(TARGET, SOURCE) \
  929. char TARGET[SOURCE.view.length + 1]; \
  930. for (intptr_t i = 0; i < SOURCE.view.length; i++) { \
  931. TARGET[i] = toAscii(SOURCE[i]); \
  932. } \
  933. TARGET[SOURCE.view.length] = '\0';
  934. // A function definition for receiving a stream of bytes
  935. // Instead of using std's messy inheritance
  936. using ByteWriterFunction = std::function<void(uint8_t value)>;
  937. // A function definition for receiving a stream of UTF-32 characters
  938. // Instead of using std's messy inheritance
  939. using UTF32WriterFunction = std::function<void(DsrChar character)>;
  940. // Filter out unwanted characters for improved portability
  941. static void feedCharacter(const UTF32WriterFunction &receiver, DsrChar character) {
  942. if (character != U'\0' && character != U'\r') {
  943. receiver(character);
  944. }
  945. }
  946. // Appends the content of buffer as a BOM-free Latin-1 file into target
  947. // fileLength is ignored when nullTerminated is true
  948. template <bool nullTerminated>
  949. static void feedStringFromFileBuffer_Latin1(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  950. for (intptr_t i = 0; i < fileLength || nullTerminated; i++) {
  951. DsrChar character = (DsrChar)(buffer[i]);
  952. if (nullTerminated && character == 0) { return; }
  953. feedCharacter(receiver, character);
  954. }
  955. }
  956. // Appends the content of buffer as a BOM-free UTF-8 file into target
  957. // fileLength is ignored when nullTerminated is true
  958. template <bool nullTerminated>
  959. static void feedStringFromFileBuffer_UTF8(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  960. for (intptr_t i = 0; i < fileLength || nullTerminated; i++) {
  961. uint8_t byteA = buffer[i];
  962. if (byteA < (uint32_t)0b10000000) {
  963. // Single byte (1xxxxxxx)
  964. if (nullTerminated && byteA == 0) { return; }
  965. feedCharacter(receiver, (DsrChar)byteA);
  966. } else {
  967. uint32_t character = 0;
  968. int extraBytes = 0;
  969. if (byteA >= (uint32_t)0b11000000) { // At least two leading ones
  970. if (byteA < (uint32_t)0b11100000) { // Less than three leading ones
  971. character = byteA & (uint32_t)0b00011111;
  972. extraBytes = 1;
  973. } else if (byteA < (uint32_t)0b11110000) { // Less than four leading ones
  974. character = byteA & (uint32_t)0b00001111;
  975. extraBytes = 2;
  976. } else if (byteA < (uint32_t)0b11111000) { // Less than five leading ones
  977. character = byteA & (uint32_t)0b00000111;
  978. extraBytes = 3;
  979. } else {
  980. // Invalid UTF-8 format
  981. throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b111111xx!");
  982. }
  983. } else {
  984. // Invalid UTF-8 format
  985. throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b10xxxxxx!");
  986. }
  987. while (extraBytes > 0) {
  988. i += 1; uint32_t nextByte = buffer[i];
  989. character = (character << 6) | (nextByte & 0b00111111);
  990. extraBytes--;
  991. }
  992. feedCharacter(receiver, (DsrChar)character);
  993. }
  994. }
  995. }
  996. template <bool LittleEndian>
  997. uint16_t read16bits(const uint8_t* buffer, intptr_t startOffset) {
  998. uint16_t byteA = buffer[startOffset];
  999. uint16_t byteB = buffer[startOffset + 1];
  1000. if (LittleEndian) {
  1001. return (byteB << 8) | byteA;
  1002. } else {
  1003. return (byteA << 8) | byteB;
  1004. }
  1005. }
  1006. // Appends the content of buffer as a BOM-free UTF-16 file into target as UTF-32
  1007. // fileLength is ignored when nullTerminated is true
  1008. template <bool LittleEndian, bool nullTerminated>
  1009. static void feedStringFromFileBuffer_UTF16(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  1010. for (intptr_t i = 0; i < fileLength || nullTerminated; i += 2) {
  1011. // Read the first 16-bit word
  1012. uint16_t wordA = read16bits<LittleEndian>(buffer, i);
  1013. // Check if another word is needed
  1014. // Assuming that wordA >= 0x0000 and wordA <= 0xFFFF as uint16_t,
  1015. // we can just check if it's within the range reserved for 32-bit encoding
  1016. if (wordA <= 0xD7FF || wordA >= 0xE000) {
  1017. // Not in the reserved range, just a single 16-bit character
  1018. if (nullTerminated && wordA == 0) { return; }
  1019. feedCharacter(receiver, (DsrChar)wordA);
  1020. } else {
  1021. // The given range was reserved and therefore using 32 bits
  1022. i += 2;
  1023. uint16_t wordB = read16bits<LittleEndian>(buffer, i);
  1024. uint32_t higher10Bits = wordA & (uint32_t)0b1111111111;
  1025. uint32_t lower10Bits = wordB & (uint32_t)0b1111111111;
  1026. DsrChar finalChar = (DsrChar)(((higher10Bits << 10) | lower10Bits) + (uint32_t)0x10000);
  1027. feedCharacter(receiver, finalChar);
  1028. }
  1029. }
  1030. }
  1031. // Sends the decoded UTF-32 characters from the encoded buffer into target.
  1032. // The text encoding should be specified using a BOM at the start of buffer, otherwise Latin-1 is assumed.
  1033. static void feedStringFromFileBuffer(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength) {
  1034. // After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
  1035. if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { // UTF-8
  1036. feedStringFromFileBuffer_UTF8<false>(receiver, buffer + 3, fileLength - 3);
  1037. } else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) { // UTF-16 BE
  1038. feedStringFromFileBuffer_UTF16<false, false>(receiver, buffer + 2, fileLength - 2);
  1039. } else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) { // UTF-16 LE
  1040. feedStringFromFileBuffer_UTF16<true, false>(receiver, buffer + 2, fileLength - 2);
  1041. } else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) { // UTF-32 BE
  1042. //feedStringFromFileBuffer_UTF32BE(receiver, buffer + 4, fileLength - 4);
  1043. throwError(U"UTF-32 BE format is not yet supported!\n");
  1044. } else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) { // UTF-32 LE
  1045. //feedStringFromFileBuffer_UTF32BE(receiver, buffer + 4, fileLength - 4);
  1046. throwError(U"UTF-32 LE format is not yet supported!\n");
  1047. } else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) { // UTF-1
  1048. //feedStringFromFileBuffer_UTF1(receiver, buffer + 3, fileLength - 3);
  1049. throwError(U"UTF-1 format is not yet supported!\n");
  1050. } else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) { // SCSU
  1051. //feedStringFromFileBuffer_SCSU(receiver, buffer + 3, fileLength - 3);
  1052. throwError(U"SCSU format is not yet supported!\n");
  1053. } else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) { // BOCU
  1054. //feedStringFromFileBuffer_BOCU-1(receiver, buffer + 3, fileLength - 3);
  1055. throwError(U"BOCU-1 format is not yet supported!\n");
  1056. } else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) { // UTF-7
  1057. // Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
  1058. throwError(U"UTF-7 format is not yet supported!\n");
  1059. } else {
  1060. // No BOM detected, assuming Latin-1 (because it directly corresponds to a unicode sub-set)
  1061. feedStringFromFileBuffer_Latin1<false>(receiver, buffer, fileLength);
  1062. }
  1063. }
  1064. // Sends the decoded UTF-32 characters from the encoded null terminated buffer into target.
  1065. // buffer may not contain any BOM, and must be null terminated in the specified encoding.
  1066. static void feedStringFromRawData(const UTF32WriterFunction &receiver, const uint8_t* buffer, CharacterEncoding encoding) {
  1067. if (encoding == CharacterEncoding::Raw_Latin1) {
  1068. feedStringFromFileBuffer_Latin1<true>(receiver, buffer);
  1069. } else if (encoding == CharacterEncoding::BOM_UTF8) {
  1070. feedStringFromFileBuffer_UTF8<true>(receiver, buffer);
  1071. } else if (encoding == CharacterEncoding::BOM_UTF16BE) {
  1072. feedStringFromFileBuffer_UTF16<false, true>(receiver, buffer);
  1073. } else if (encoding == CharacterEncoding::BOM_UTF16LE) {
  1074. feedStringFromFileBuffer_UTF16<true, true>(receiver, buffer);
  1075. } else {
  1076. throwError(U"Unhandled encoding in feedStringFromRawData!\n");
  1077. }
  1078. }
  1079. String dsr::string_dangerous_decodeFromData(const void* data, CharacterEncoding encoding) {
  1080. String result;
  1081. // Measure the size of the result by scanning the content in advance
  1082. intptr_t characterCount = 0;
  1083. UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
  1084. characterCount++;
  1085. };
  1086. feedStringFromRawData(measurer, (const uint8_t*)data, encoding);
  1087. // Pre-allocate the correct amount of memory based on the simulation
  1088. string_reserve(result, characterCount);
  1089. // Stream output to the result string
  1090. UTF32WriterFunction receiver = [&result](DsrChar character) {
  1091. string_appendChar(result, character);
  1092. };
  1093. feedStringFromRawData(receiver, (const uint8_t*)data, encoding);
  1094. return result;
  1095. }
  1096. String dsr::string_loadFromMemory(Buffer fileContent) {
  1097. String result;
  1098. // Measure the size of the result by scanning the content in advance
  1099. intptr_t characterCount = 0;
  1100. UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
  1101. characterCount++;
  1102. };
  1103. feedStringFromFileBuffer(measurer, fileContent.getUnsafe(), fileContent.getUsedSize());
  1104. // Pre-allocate the correct amount of memory based on the simulation
  1105. string_reserve(result, characterCount);
  1106. // Stream output to the result string
  1107. UTF32WriterFunction receiver = [&result](DsrChar character) {
  1108. string_appendChar(result, character);
  1109. };
  1110. feedStringFromFileBuffer(receiver, fileContent.getUnsafe(), fileContent.getUsedSize());
  1111. return result;
  1112. }
  1113. // Loads a text file of unknown format
  1114. // Removes carriage-return characters to make processing easy with only line-feed for breaking lines
  1115. String dsr::string_load(const ReadableString& filename, bool mustExist) {
  1116. Buffer encoded = file_loadBuffer(filename, mustExist);
  1117. if (!buffer_exists(encoded)) {
  1118. return String();
  1119. } else {
  1120. return string_loadFromMemory(encoded);
  1121. }
  1122. }
  1123. template <CharacterEncoding characterEncoding>
  1124. static void encodeCharacter(const ByteWriterFunction &receiver, DsrChar character) {
  1125. if (characterEncoding == CharacterEncoding::Raw_Latin1) {
  1126. // Replace any illegal characters with questionmarks
  1127. if (character > 255) { character = U'?'; }
  1128. receiver(character);
  1129. } else if (characterEncoding == CharacterEncoding::BOM_UTF8) {
  1130. // Replace any illegal characters with questionmarks
  1131. if (character > 0x10FFFF) { character = U'?'; }
  1132. if (character < (1 << 7)) {
  1133. // 0xxxxxxx
  1134. receiver(character);
  1135. } else if (character < (1 << 11)) {
  1136. // 110xxxxx 10xxxxxx
  1137. receiver((uint32_t)0b11000000 | ((character & ((uint32_t)0b11111 << 6)) >> 6));
  1138. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  1139. } else if (character < (1 << 16)) {
  1140. // 1110xxxx 10xxxxxx 10xxxxxx
  1141. receiver((uint32_t)0b11100000 | ((character & ((uint32_t)0b1111 << 12)) >> 12));
  1142. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 6)) >> 6));
  1143. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  1144. } else if (character < (1 << 21)) {
  1145. // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  1146. receiver((uint32_t)0b11110000 | ((character & ((uint32_t)0b111 << 18)) >> 18));
  1147. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 12)) >> 12));
  1148. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 6)) >> 6));
  1149. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  1150. }
  1151. } else { // Assuming UTF-16
  1152. if (character > 0x10FFFF) { character = U'?'; }
  1153. if (character <= 0xD7FF || (character >= 0xE000 && character <= 0xFFFF)) {
  1154. // xxxxxxxx xxxxxxxx (Limited range)
  1155. uint32_t higher8Bits = (character & (uint32_t)0b1111111100000000) >> 8;
  1156. uint32_t lower8Bits = character & (uint32_t)0b0000000011111111;
  1157. if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  1158. receiver(higher8Bits);
  1159. receiver(lower8Bits);
  1160. } else { // Assuming UTF-16 LE
  1161. receiver(lower8Bits);
  1162. receiver(higher8Bits);
  1163. }
  1164. } else if (character >= 0x010000 && character <= 0x10FFFF) {
  1165. // 110110xxxxxxxxxx 110111xxxxxxxxxx
  1166. uint32_t code = character - (uint32_t)0x10000;
  1167. uint32_t byteA = ((code & (uint32_t)0b11000000000000000000) >> 18) | (uint32_t)0b11011000;
  1168. uint32_t byteB = (code & (uint32_t)0b00111111110000000000) >> 10;
  1169. uint32_t byteC = ((code & (uint32_t)0b00000000001100000000) >> 8) | (uint32_t)0b11011100;
  1170. uint32_t byteD = code & (uint32_t)0b00000000000011111111;
  1171. if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  1172. receiver(byteA);
  1173. receiver(byteB);
  1174. receiver(byteC);
  1175. receiver(byteD);
  1176. } else { // Assuming UTF-16 LE
  1177. receiver(byteB);
  1178. receiver(byteA);
  1179. receiver(byteD);
  1180. receiver(byteC);
  1181. }
  1182. }
  1183. }
  1184. }
  1185. // Template for encoding a whole string
  1186. template <CharacterEncoding characterEncoding, LineEncoding lineEncoding>
  1187. static void encodeText(const ByteWriterFunction &receiver, String content, bool writeBOM, bool writeNullTerminator) {
  1188. if (writeBOM) {
  1189. // Write byte order marks
  1190. if (characterEncoding == CharacterEncoding::BOM_UTF8) {
  1191. receiver(0xEF);
  1192. receiver(0xBB);
  1193. receiver(0xBF);
  1194. } else if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  1195. receiver(0xFE);
  1196. receiver(0xFF);
  1197. } else if (characterEncoding == CharacterEncoding::BOM_UTF16LE) {
  1198. receiver(0xFF);
  1199. receiver(0xFE);
  1200. }
  1201. }
  1202. // Write encoded content
  1203. for (intptr_t i = 0; i < string_length(content); i++) {
  1204. DsrChar character = content[i];
  1205. if (character == U'\n') {
  1206. if (lineEncoding == LineEncoding::CrLf) {
  1207. encodeCharacter<characterEncoding>(receiver, U'\r');
  1208. encodeCharacter<characterEncoding>(receiver, U'\n');
  1209. } else { // Assuming that lineEncoding == LineEncoding::Lf
  1210. encodeCharacter<characterEncoding>(receiver, U'\n');
  1211. }
  1212. } else {
  1213. encodeCharacter<characterEncoding>(receiver, character);
  1214. }
  1215. }
  1216. if (writeNullTerminator) {
  1217. // Terminate internal strings with \0 to prevent getting garbage data after unpadded buffers
  1218. if (characterEncoding == CharacterEncoding::BOM_UTF16BE || characterEncoding == CharacterEncoding::BOM_UTF16LE) {
  1219. receiver(0);
  1220. receiver(0);
  1221. } else {
  1222. receiver(0);
  1223. }
  1224. }
  1225. }
  1226. // Macro for converting run-time arguments into template arguments for encodeText
  1227. #define ENCODE_TEXT(RECEIVER, CONTENT, CHAR_ENCODING, LINE_ENCODING, WRITE_BOM, WRITE_NULL_TERMINATOR) \
  1228. if (CHAR_ENCODING == CharacterEncoding::Raw_Latin1) { \
  1229. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1230. encodeText<CharacterEncoding::Raw_Latin1, LineEncoding::CrLf>(RECEIVER, CONTENT, false, WRITE_NULL_TERMINATOR); \
  1231. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1232. encodeText<CharacterEncoding::Raw_Latin1, LineEncoding::Lf>(RECEIVER, CONTENT, false, WRITE_NULL_TERMINATOR); \
  1233. } \
  1234. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF8) { \
  1235. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1236. encodeText<CharacterEncoding::BOM_UTF8, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1237. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1238. encodeText<CharacterEncoding::BOM_UTF8, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1239. } \
  1240. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF16BE) { \
  1241. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1242. encodeText<CharacterEncoding::BOM_UTF16BE, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1243. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1244. encodeText<CharacterEncoding::BOM_UTF16BE, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1245. } \
  1246. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF16LE) { \
  1247. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1248. encodeText<CharacterEncoding::BOM_UTF16LE, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1249. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1250. encodeText<CharacterEncoding::BOM_UTF16LE, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1251. } \
  1252. }
  1253. // Encoding to a buffer before saving all at once as a binary file.
  1254. // This tells the operating system how big the file is in advance and prevent the worst case of stalling for minutes!
  1255. bool dsr::string_save(const ReadableString& filename, const ReadableString& content, CharacterEncoding characterEncoding, LineEncoding lineEncoding) {
  1256. Buffer buffer = string_saveToMemory(content, characterEncoding, lineEncoding);
  1257. if (buffer_exists(buffer)) {
  1258. return file_saveBuffer(filename, buffer);
  1259. } else {
  1260. return false;
  1261. }
  1262. }
  1263. Buffer dsr::string_saveToMemory(const ReadableString& content, CharacterEncoding characterEncoding, LineEncoding lineEncoding, bool writeByteOrderMark, bool writeNullTerminator) {
  1264. intptr_t byteCount = 0;
  1265. ByteWriterFunction counter = [&byteCount](uint8_t value) {
  1266. byteCount++;
  1267. };
  1268. ENCODE_TEXT(counter, content, characterEncoding, lineEncoding, writeByteOrderMark, writeNullTerminator);
  1269. Buffer result = buffer_create(byteCount).setName("Buffer holding an encoded string");
  1270. SafePointer<uint8_t> byteWriter = buffer_getSafeData<uint8_t>(result, "Buffer for string encoding");
  1271. ByteWriterFunction receiver = [&byteWriter](uint8_t value) {
  1272. *byteWriter = value;
  1273. byteWriter += 1;
  1274. };
  1275. ENCODE_TEXT(receiver, content, characterEncoding, lineEncoding, writeByteOrderMark, writeNullTerminator);
  1276. return result;
  1277. }
  1278. static uintptr_t getStartOffset(const ReadableString &source) {
  1279. // Get the allocation
  1280. const uint8_t* origin = (uint8_t*)(source.characters.getUnsafe());
  1281. const uint8_t* start = (uint8_t*)(source.view.getUnchecked());
  1282. assert(start <= origin);
  1283. // Get the offset from the parent
  1284. return (start - origin) / sizeof(DsrChar);
  1285. }
  1286. #ifdef SAFE_POINTER_CHECKS
  1287. static void serializeCharacterBuffer(PrintCharacter target, void const * const allocation, uintptr_t maxLength) {
  1288. uintptr_t characterCount = heap_getUsedSize(allocation) / sizeof(DsrChar);
  1289. target(U'\"');
  1290. for (uintptr_t c = 0; c < characterCount; c++) {
  1291. if (c == maxLength) {
  1292. target(U'\"');
  1293. target(U'.');
  1294. target(U'.');
  1295. target(U'.');
  1296. return;
  1297. }
  1298. target(((DsrChar *)allocation)[c]);
  1299. }
  1300. target(U'\"');
  1301. }
  1302. #endif
  1303. static Handle<DsrChar> allocateCharacters(intptr_t minimumLength) {
  1304. // Allocate memory.
  1305. Handle<DsrChar> result = handle_createArray<DsrChar>(AllocationInitialization::Uninitialized, minimumLength).setName("String characters");
  1306. #ifdef SAFE_POINTER_CHECKS
  1307. setAllocationSerialization(result.getUnsafe(), &serializeCharacterBuffer);
  1308. #endif
  1309. // Check how much space we got.
  1310. uintptr_t availableSpace = heap_getAllocationSize(result.getUnsafe());
  1311. // Expand to use all available memory in the allocation.
  1312. uintptr_t newSize = heap_setUsedSize(result.getUnsafe(), availableSpace);
  1313. // Clear the memory to zeroes, just to be safe against non-deterministic bugs.
  1314. safeMemorySet(result.getSafe("Cleared String pointer"), 0, newSize);
  1315. return result;
  1316. }
  1317. // Replaces the buffer with a new buffer holding at least minimumLength characters
  1318. // Guarantees that the new buffer is not shared by other strings, so that it may be written to freely
  1319. static void reallocateBuffer(String &target, intptr_t minimumLength, bool preserve) {
  1320. // Holding oldData alive while copying to the new buffer
  1321. Handle<DsrChar> oldBuffer = target.characters; // Kept for reference counting only, do not remove.
  1322. Impl_CharacterView oldData = target.view;
  1323. target.characters = allocateCharacters(minimumLength);
  1324. target.view = Impl_CharacterView(target.characters.getUnsafe(), oldData.length);
  1325. if (preserve && oldData.length > 0) {
  1326. safeMemoryCopy(target.view.getSafe("New characters being copied from an old buffer"), oldData.getSafe("Old characters being copied to a new buffer"), oldData.length * sizeof(DsrChar));
  1327. }
  1328. }
  1329. // Call before writing to the buffer.
  1330. // This hides that Strings share buffers when assigning by value or taking partial strings.
  1331. static void cloneIfNeeded(String &target) {
  1332. // If there is no buffer or the buffer is shared, it needs to allocate its own buffer.
  1333. if (target.characters.isNull() || target.characters.getUseCount() > 1) {
  1334. reallocateBuffer(target, target.view.length, true);
  1335. }
  1336. }
  1337. void dsr::string_clear(String& target) {
  1338. // We we start writing from the beginning, then we must have our own allocation to avoid overwriting the characters in other strings.
  1339. cloneIfNeeded(target);
  1340. target.view.length = 0;
  1341. }
  1342. // The number of DsrChar characters that can be contained in the allocation before reaching the buffer's end
  1343. // This doesn't imply that it's always okay to write to the remaining space, because the buffer may be shared
  1344. static intptr_t getCapacity(const ReadableString &source) {
  1345. if (source.characters.isNotNull()) {
  1346. uintptr_t bufferElements = source.characters.getElementCount();
  1347. // Subtract offset from the buffer size to get the remaining space
  1348. return bufferElements - getStartOffset(source);
  1349. } else {
  1350. return 0;
  1351. }
  1352. }
  1353. static void expand(String &target, intptr_t newLength, bool affectUsedLength) {
  1354. cloneIfNeeded(target);
  1355. if (newLength > target.view.length) {
  1356. if (newLength > getCapacity(target)) {
  1357. reallocateBuffer(target, newLength, true);
  1358. }
  1359. if (affectUsedLength) {
  1360. target.view.length = newLength;
  1361. }
  1362. }
  1363. }
  1364. void dsr::string_reserve(String& target, intptr_t minimumLength) {
  1365. expand(target, minimumLength, false);
  1366. }
  // Appends LENGTH elements read from SOURCE[0..LENGTH-1] to the end of TARGET, masking each element with MASK.
  // This macro has to be used because a static template wouldn't be able to inherit access to private methods from the target class.
  //   Better to use a macro without type safety in the implementation than to expose yet another template in a global header.
  // LENGTH is evaluated exactly once, before TARGET is expanded.
  //   Evaluating it in the loop condition would call strlen for every character when appending C strings,
  //   and would make the loop bound grow together with the target when SOURCE refers to TARGET itself.
  // Proof that appending to one string doesn't affect another:
  //   If it has to reallocate
  //     * Then it will have its own buffer without conflicts
  //   If it doesn't have to reallocate
  //     If it shares the buffer
  //       If source is empty
  //         * Then no risk of overwriting neighbor strings if we don't write
  //       If source isn't empty
  //         * Then the buffer will be cloned when the first character is written
  //     If it doesn't share the buffer
  //       * Then no risk of writing
  #define APPEND(TARGET, SOURCE, LENGTH, MASK) { \
      const intptr_t appendedLength = (intptr_t)(LENGTH); \
      const intptr_t oldLength = (TARGET).view.length; \
      expand((TARGET), oldLength + appendedLength, true); \
      for (intptr_t i = 0; i < appendedLength; i++) { \
          (TARGET).view.writeCharacter(oldLength + i, ((SOURCE)[i]) & (MASK)); \
      } \
  }
  1387. // TODO: See if ascii litterals can be checked for values above 127 in compile-time
  1388. static void atomic_append_ascii(String &target, const char* source) { APPEND(target, source, strlen(source), 0xFF); }
  1389. // TODO: Use memcpy when appending input of the same format
  1390. static void atomic_append_readable(String &target, const ReadableString& source) { APPEND(target, source, source.view.length, 0xFFFFFFFF); }
  1391. static void atomic_append_utf32(String &target, const DsrChar* source) { APPEND(target, source, strlen_utf32(source), 0xFFFFFFFF); }
  1392. void dsr::string_appendChar(String& target, DsrChar value) { APPEND(target, &value, 1, 0xFFFFFFFF); }
  1393. String& dsr::impl_toStreamIndented_ascii(String& target, const char *value, const ReadableString& indentation) {
  1394. atomic_append_readable(target, indentation);
  1395. atomic_append_ascii(target, value);
  1396. return target;
  1397. }
  1398. String& dsr::impl_toStreamIndented_utf32(String& target, const char32_t *value, const ReadableString& indentation) {
  1399. atomic_append_readable(target, indentation);
  1400. atomic_append_utf32(target, value);
  1401. return target;
  1402. }
  1403. String& dsr::impl_toStreamIndented_readable(String& target, const ReadableString& value, const ReadableString& indentation) {
  1404. atomic_append_readable(target, indentation);
  1405. atomic_append_readable(target, value);
  1406. return target;
  1407. }
  1408. String& dsr::impl_toStreamIndented_double(String& target, const double &value, const ReadableString& indentation) {
  1409. atomic_append_readable(target, indentation);
  1410. string_fromDouble(target, (double)value);
  1411. return target;
  1412. }
  1413. String& dsr::impl_toStreamIndented_int64(String& target, const int64_t &value, const ReadableString& indentation) {
  1414. atomic_append_readable(target, indentation);
  1415. string_fromSigned(target, value);
  1416. return target;
  1417. }
  1418. String& dsr::impl_toStreamIndented_uint64(String& target, const uint64_t &value, const ReadableString& indentation) {
  1419. atomic_append_readable(target, indentation);
  1420. string_fromUnsigned(target, value);
  1421. return target;
  1422. }
  // Held while a message is written to the output stream,
  //   so that messages printed from different threads are not mixed with each other.
  static std::mutex printMutex;
  1425. static std::ostream& toStream(std::ostream& out, const ReadableString &source) {
  1426. for (intptr_t i = 0; i < source.view.length; i++) {
  1427. out.put(toAscii(source.view[i]));
  1428. }
  1429. return out;
  1430. }
  1431. static const std::function<void(const ReadableString &message, MessageType type)> defaultMessageAction = [](const ReadableString &message, MessageType type) {
  1432. if (type == MessageType::Error) {
  1433. #ifdef DSR_HARD_EXIT_ON_ERROR
  1434. // Print the error.
  1435. toStream(std::cerr, message);
  1436. // Free all heap allocations.
  1437. heap_hardExitCleaning();
  1438. // Terminate with a non-zero value to indicate failure.
  1439. std::exit(1);
  1440. #else
  1441. Buffer ascii = string_saveToMemory(message, CharacterEncoding::Raw_Latin1, LineEncoding::CrLf, false, true);
  1442. throw std::runtime_error((char*)ascii.getUnsafe());
  1443. #endif
  1444. } else {
  1445. printMutex.lock();
  1446. toStream(std::cout, message);
  1447. printMutex.unlock();
  1448. }
  1449. };
  1450. static std::function<void(const ReadableString &message, MessageType type)> globalMessageAction = defaultMessageAction;
  1451. void dsr::string_sendMessage(const ReadableString &message, MessageType type) {
  1452. globalMessageAction(message, type);
  1453. }
  1454. void dsr::string_sendMessage_default(const ReadableString &message, MessageType type) {
  1455. defaultMessageAction(message, type);
  1456. }
  1457. void dsr::string_assignMessageHandler(std::function<void(const ReadableString &message, MessageType type)> newHandler) {
  1458. globalMessageAction = newHandler;
  1459. }
  1460. void dsr::string_unassignMessageHandler() {
  1461. globalMessageAction = defaultMessageAction;
  1462. }
  1463. void dsr::string_split_callback(std::function<void(ReadableString separatedText)> action, const ReadableString& source, DsrChar separator, bool removeWhiteSpace) {
  1464. intptr_t sectionStart = 0;
  1465. for (intptr_t i = 0; i < source.view.length; i++) {
  1466. DsrChar c = source[i];
  1467. if (c == separator) {
  1468. ReadableString element = string_exclusiveRange(source, sectionStart, i);
  1469. if (removeWhiteSpace) {
  1470. action(string_removeOuterWhiteSpace(element));
  1471. } else {
  1472. action(element);
  1473. }
  1474. sectionStart = i + 1;
  1475. }
  1476. }
  1477. if (source.view.length > sectionStart) {
  1478. if (removeWhiteSpace) {
  1479. action(string_removeOuterWhiteSpace(string_exclusiveRange(source, sectionStart, source.view.length)));
  1480. } else {
  1481. action(string_exclusiveRange(source, sectionStart, source.view.length));
  1482. }
  1483. }
  1484. }
  1485. static String createSubString(const Handle<DsrChar> &characters, const Impl_CharacterView &view) {
  1486. String result;
  1487. result.characters = characters;
  1488. result.view = view;
  1489. return result;
  1490. }
  1491. List<String> dsr::string_split(const ReadableString& source, DsrChar separator, bool removeWhiteSpace) {
  1492. List<String> result;
  1493. if (source.view.length > 0) {
  1494. // Re-use the existing buffer
  1495. String commonBuffer = createSubString(source.characters, source.view);
  1496. // Source is allocated as String
  1497. string_split_callback([&result, removeWhiteSpace](String element) {
  1498. if (removeWhiteSpace) {
  1499. result.push(string_removeOuterWhiteSpace(element));
  1500. } else {
  1501. result.push(element);
  1502. }
  1503. }, commonBuffer, separator, removeWhiteSpace);
  1504. }
  1505. return result;
  1506. }
  1507. intptr_t dsr::string_splitCount(const ReadableString& source, DsrChar separator) {
  1508. intptr_t result = 0;
  1509. string_split_callback([&result](ReadableString element) {
  1510. result++;
  1511. }, source, separator);
  1512. return result;
  1513. }
  1514. int64_t dsr::string_toInteger(const ReadableString& source) {
  1515. int64_t result;
  1516. bool negated;
  1517. result = 0;
  1518. negated = false;
  1519. for (intptr_t i = 0; i < source.view.length; i++) {
  1520. DsrChar c = source[i];
  1521. if (c == '-' || c == '~') {
  1522. negated = !negated;
  1523. } else if (c >= '0' && c <= '9') {
  1524. result = (result * 10) + (int)(c - '0');
  1525. } else if (c == ',' || c == '.') {
  1526. // Truncate any decimals by ignoring them
  1527. break;
  1528. }
  1529. }
  1530. if (negated) {
  1531. return -result;
  1532. } else {
  1533. return result;
  1534. }
  1535. }
  1536. double dsr::string_toDouble(const ReadableString& source) {
  1537. double result;
  1538. bool negated;
  1539. bool reachedDecimal;
  1540. int64_t digitDivider;
  1541. result = 0.0;
  1542. negated = false;
  1543. reachedDecimal = false;
  1544. digitDivider = 1;
  1545. for (intptr_t i = 0; i < source.view.length; i++) {
  1546. DsrChar c = source[i];
  1547. if (c == '-' || c == '~') {
  1548. negated = !negated;
  1549. } else if (c >= '0' && c <= '9') {
  1550. if (reachedDecimal) {
  1551. digitDivider = digitDivider * 10;
  1552. result = result + ((double)(c - '0') / (double)digitDivider);
  1553. } else {
  1554. result = (result * 10) + (double)(c - '0');
  1555. }
  1556. } else if (c == ',' || c == '.') {
  1557. reachedDecimal = true;
  1558. } else if (c == 'e' || c == 'E') {
  1559. // Apply the exponent after 'e'.
  1560. result *= std::pow(10.0, string_toInteger(string_after(source, i)));
  1561. // Skip remaining characters.
  1562. i = source.view.length;
  1563. }
  1564. }
  1565. if (negated) {
  1566. return -result;
  1567. } else {
  1568. return result;
  1569. }
  1570. }
  1571. intptr_t dsr::string_length(const ReadableString& source) {
  1572. return source.view.length;
  1573. }
  1574. intptr_t dsr::string_findFirst(const ReadableString& source, DsrChar toFind, intptr_t startIndex) {
  1575. for (intptr_t i = startIndex; i < source.view.length; i++) {
  1576. if (source[i] == toFind) {
  1577. return i;
  1578. }
  1579. }
  1580. return -1;
  1581. }
  1582. intptr_t dsr::string_findLast(const ReadableString& source, DsrChar toFind) {
  1583. for (intptr_t i = source.view.length - 1; i >= 0; i--) {
  1584. if (source[i] == toFind) {
  1585. return i;
  1586. }
  1587. }
  1588. return -1;
  1589. }
  1590. ReadableString dsr::string_exclusiveRange(const ReadableString& source, intptr_t inclusiveStart, intptr_t exclusiveEnd) {
  1591. // Return empty string for each complete miss
  1592. if (inclusiveStart >= source.view.length || exclusiveEnd <= 0) { return ReadableString(); }
  1593. // Automatically clamping to valid range
  1594. if (inclusiveStart < 0) { inclusiveStart = 0; }
  1595. if (exclusiveEnd > source.view.length) { exclusiveEnd = source.view.length; }
  1596. // Return the overlapping interval
  1597. return createSubString(source.characters, Impl_CharacterView(source.view.getUnchecked() + inclusiveStart, exclusiveEnd - inclusiveStart));
  1598. }
  // Returns the part of source from inclusiveStart to inclusiveEnd, using the rules of string_exclusiveRange.
  ReadableString dsr::string_inclusiveRange(const ReadableString& source, intptr_t inclusiveStart, intptr_t inclusiveEnd) {
      // The exclusive end is one step after the last included index.
      const intptr_t exclusiveEnd = inclusiveEnd + 1;
      return string_exclusiveRange(source, inclusiveStart, exclusiveEnd);
  }
  1602. ReadableString dsr::string_before(const ReadableString& source, intptr_t exclusiveEnd) {
  1603. return string_exclusiveRange(source, 0, exclusiveEnd);
  1604. }
  1605. ReadableString dsr::string_until(const ReadableString& source, intptr_t inclusiveEnd) {
  1606. return string_inclusiveRange(source, 0, inclusiveEnd);
  1607. }
  1608. ReadableString dsr::string_from(const ReadableString& source, intptr_t inclusiveStart) {
  1609. return string_exclusiveRange(source, inclusiveStart, source.view.length);
  1610. }
  1611. ReadableString dsr::string_after(const ReadableString& source, intptr_t exclusiveStart) {
  1612. return string_from(source, exclusiveStart + 1);
  1613. }
  1614. bool dsr::character_isDigit(DsrChar c) {
  1615. return c >= U'0' && c <= U'9';
  1616. }
  1617. bool dsr::character_isIntegerCharacter(DsrChar c) {
  1618. return c == U'-' || character_isDigit(c);
  1619. }
  1620. bool dsr::character_isValueCharacter(DsrChar c) {
  1621. return c == U'.' || character_isIntegerCharacter(c);
  1622. }
  1623. bool dsr::character_isWhiteSpace(DsrChar c) {
  1624. return c == U' ' || c == U'\t' || c == U'\v' || c == U'\f' || c == U'\n' || c == U'\r';
  1625. }
  // Macros for implementing regular expressions with a greedy approach consuming the first match
  //   They expect a string named source and an index named readIndex in the scope where they are used.
  //   The FORCED and PLUS macros return false from the enclosing function when there is no match.
  //   Optional accepts 0 or 1 occurrence
  //   Forced accepts 1 occurrence
  //   Star accepts 0..N occurrences
  //   Plus accepts 1..N occurrences
  // Matching one specific character
  #define CHARACTER_OPTIONAL(CHARACTER) \
      if (source[readIndex] == CHARACTER) { readIndex++; }
  #define CHARACTER_FORCED(CHARACTER) \
      if (source[readIndex] == CHARACTER) { readIndex++; } else { return false; }
  #define CHARACTER_STAR(CHARACTER) \
      while (source[readIndex] == CHARACTER) { readIndex++; }
  #define CHARACTER_PLUS(CHARACTER) \
      CHARACTER_FORCED(CHARACTER) CHARACTER_STAR(CHARACTER)
  // Matching any character accepted by the function character_is followed by the name given as PATTERN
  #define PATTERN_OPTIONAL(PATTERN) \
      if (character_is##PATTERN(source[readIndex])) { readIndex++; }
  #define PATTERN_FORCED(PATTERN) \
      if (character_is##PATTERN(source[readIndex])) { readIndex++; } else { return false; }
  #define PATTERN_STAR(PATTERN) \
      while (character_is##PATTERN(source[readIndex])) { readIndex++; }
  #define PATTERN_PLUS(PATTERN) \
      PATTERN_FORCED(PATTERN) PATTERN_STAR(PATTERN)
  1639. // The greedy approach works here, because there's no ambiguity
  1640. bool dsr::string_isInteger(const ReadableString& source, bool allowWhiteSpace) {
  1641. intptr_t readIndex = 0;
  1642. if (allowWhiteSpace) {
  1643. PATTERN_STAR(WhiteSpace);
  1644. }
  1645. CHARACTER_OPTIONAL(U'-');
  1646. // At least one digit required
  1647. PATTERN_PLUS(IntegerCharacter);
  1648. if (allowWhiteSpace) {
  1649. PATTERN_STAR(WhiteSpace);
  1650. }
  1651. return readIndex == source.view.length;
  1652. }
  1653. // To avoid consuming the all digits on Digit* before reaching Digit+ when there is no decimal, whole integers are judged by string_isInteger
  1654. bool dsr::string_isDouble(const ReadableString& source, bool allowWhiteSpace) {
  1655. // Solving the UnsignedDouble <- Digit+ | Digit* '.' Digit+ ambiguity is done easiest by checking if there's a decimal before handling the white-space and negation
  1656. if (string_findFirst(source, U'.') == -1) {
  1657. // No decimal detected
  1658. return string_isInteger(source, allowWhiteSpace);
  1659. } else {
  1660. intptr_t readIndex = 0;
  1661. if (allowWhiteSpace) {
  1662. PATTERN_STAR(WhiteSpace);
  1663. }
  1664. // Double <- UnsignedDouble | '-' UnsignedDouble
  1665. CHARACTER_OPTIONAL(U'-');
  1666. // UnsignedDouble <- Digit* '.' Digit+
  1667. // Any number of integer digits
  1668. PATTERN_STAR(IntegerCharacter);
  1669. // Only dot for decimal
  1670. CHARACTER_FORCED(U'.')
  1671. // At least one decimal digit
  1672. PATTERN_PLUS(IntegerCharacter);
  1673. if (allowWhiteSpace) {
  1674. PATTERN_STAR(WhiteSpace);
  1675. }
  1676. return readIndex == source.view.length;
  1677. }
  1678. }
  1679. uintptr_t dsr::string_getBufferUseCount(const ReadableString& text) {
  1680. return text.characters.getUseCount();
  1681. }