stringAPI.cpp 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2025 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Gets access to private members by making them public for the whole module
  24. #define DSR_INTERNAL_ACCESS
  25. #include <iostream>
  26. #include <sstream>
  27. #include <fstream>
  28. #include <streambuf>
  29. #include <thread>
  30. #include <mutex>
  31. #include <stdexcept>
  32. #include <cmath>
  33. #include "stringAPI.h"
  34. #include "../api/fileAPI.h"
  35. #include "../settings.h"
  36. using namespace dsr;
// The print buffer keeps its buffer size from previous printing to avoid reallocating memory every time something is printed.
// It is stored separately for each calling thread (thread_local) to avoid conflicts.
static thread_local String printBuffer;
// Returns a reference to the calling thread's print buffer.
String &dsr::string_getPrintBuffer() {
	return printBuffer;
}
// Forward declarations of file-local append helpers that are called by the String constructors.
// NOTE(review): the definitions are not visible in this section, they have to exist further down in this file.
static void atomic_append_ascii(String &target, const char* source);
static void atomic_append_readable(String &target, const ReadableString& source);
static void atomic_append_utf32(String &target, const DsrChar* source);
  46. static intptr_t strlen_utf32(const DsrChar *content) {
  47. intptr_t length = 0;
  48. while (content[length] != 0) {
  49. length++;
  50. }
  51. return length;
  52. }
  53. static char toAscii(DsrChar c) {
  54. if (c > 127) {
  55. return '?';
  56. } else {
  57. return c;
  58. }
  59. }
// Initializes view from content and the number of characters found before its zero terminator.
// content must be zero terminated, because strlen_utf32 scans until the first zero.
ReadableString::ReadableString(const DsrChar *content)
: view(content, strlen_utf32(content)) {}
// Creates a string without appending any content.
String::String() {}
// Creates a string by appending source using atomic_append_ascii.
String::String(const char* source) { atomic_append_ascii(*this, source); }
// Creates a string by appending source using atomic_append_utf32.
String::String(const DsrChar* source) { atomic_append_utf32(*this, source); }
  65. String& Printable::toStream(String& target) const {
  66. return this->toStreamIndented(target, U"");
  67. }
  68. String Printable::toStringIndented(const ReadableString& indentation) const {
  69. String result;
  70. this->toStreamIndented(result, indentation);
  71. return result;
  72. }
  73. String Printable::toString() const {
  74. return this->toStringIndented(U"");
  75. }
  76. Printable::~Printable() {}
  77. // TODO: Handle ʼn (329) and the remaining Unicode characters after Ÿ (376).
  78. DsrChar dsr::character_upperCase(DsrChar character) {
  79. if (U'a' <= character && character <= U'z') { // a (97) to z (122) Ascii
  80. return character - (U'a' - U'A');
  81. } else if (U'à' <= character && character <= U'ö') { // à (224) to ö (246) Latin-1
  82. return character - (U'à' - U'À');
  83. } else if (U'ø' <= character && character <= U'þ') { // ø (248) to þ (254) Latin-1
  84. return character - (U'ø' - U'Ø');
  85. } else if (character == U'ÿ') { // ÿ (255)
  86. return U'Ÿ'; // Ÿ (376)
  87. } else if (U'Ā' <= character && character <= U'ķ') { // Ā (256) to ķ (311)
  88. return character & ~DsrChar(1);
  89. } else if (U'Ĺ' <= character && character <= U'ň' && !(character & 1)) { // Even from Ĺ (313) to ň (328)
  90. return character - 1;
  91. } else if (U'Ŋ' <= character && character <= U'ŷ') { // Ŋ (330) to ŷ (375)
  92. return character & ~DsrChar(1);
  93. } else {
  94. return character;
  95. }
  96. }
  97. DsrChar dsr::character_lowerCase(DsrChar character) {
  98. if (U'A' <= character && character <= U'Z') { // A (65) to Z (90) Ascii
  99. return character + (U'a' - U'A');
  100. } else if (U'À' <= character && character <= U'Ö') { // À (192) to Ö (214) Latin-1
  101. return character + (U'à' - U'À');
  102. } else if (U'Ø' <= character && character <= U'Þ') { // Ø (216) to Þ (222) Latin-1
  103. return character + (U'ø' - U'Ø');
  104. } else if (character == U'Ÿ') { // Ÿ (376)
  105. return U'ÿ'; // ÿ (255)
  106. } else if (U'Ā' <= character && character <= U'ķ') { // Ā (256) to ķ (311)
  107. return character | DsrChar(1);
  108. } else if (U'Ĺ' <= character && character <= U'ň' && character & 1) { // Odd from Ĺ (313) to ň (328)
  109. return character + 1;
  110. } else if (U'Ŋ' <= character && character <= U'ŷ') { // Ŋ (330) to ŷ (375)
  111. return character | DsrChar(1);
  112. } else {
  113. return character;
  114. }
  115. }
  116. String dsr::string_upperCase(const ReadableString &text) {
  117. String result;
  118. string_reserve(result, text.view.length);
  119. for (intptr_t i = 0; i < text.view.length; i++) {
  120. string_appendChar(result, character_upperCase(text[i]));
  121. }
  122. return result;
  123. }
  124. String dsr::string_lowerCase(const ReadableString &text) {
  125. String result;
  126. string_reserve(result, text.view.length);
  127. for (intptr_t i = 0; i < text.view.length; i++) {
  128. string_appendChar(result, character_lowerCase(text[i]));
  129. }
  130. return result;
  131. }
  132. bool dsr::string_match(const ReadableString& a, const ReadableString& b) {
  133. if (a.view.length != b.view.length) {
  134. return false;
  135. } else {
  136. for (intptr_t i = 0; i < a.view.length; i++) {
  137. if (a[i] != b[i]) {
  138. return false;
  139. }
  140. }
  141. return true;
  142. }
  143. }
  144. bool dsr::string_caseInsensitiveMatch(const ReadableString& a, const ReadableString& b) {
  145. if (a.view.length != b.view.length) {
  146. return false;
  147. } else {
  148. for (intptr_t i = 0; i < a.view.length; i++) {
  149. if (character_upperCase(a[i]) != character_upperCase(b[i])) {
  150. return false;
  151. }
  152. }
  153. return true;
  154. }
  155. }
  156. static intptr_t findFirstNonWhite(const ReadableString &text) {
  157. for (intptr_t i = 0; i < text.view.length; i++) {
  158. DsrChar c = text[i];
  159. if (!character_isWhiteSpace(c)) {
  160. return i;
  161. }
  162. }
  163. return -1;
  164. }
  165. static intptr_t findLastNonWhite(const ReadableString &text) {
  166. for (intptr_t i = text.view.length - 1; i >= 0; i--) {
  167. DsrChar c = text[i];
  168. if (!character_isWhiteSpace(c)) {
  169. return i;
  170. }
  171. }
  172. return -1;
  173. }
  174. // Allow passing literals without allocating heap memory for the result
  175. ReadableString dsr::string_removeOuterWhiteSpace(const ReadableString &text) {
  176. intptr_t first = findFirstNonWhite(text);
  177. intptr_t last = findLastNonWhite(text);
  178. if (first == -1) {
  179. // Only white space
  180. return ReadableString();
  181. } else {
  182. // Subset
  183. return string_inclusiveRange(text, first, last);
  184. }
  185. }
  186. String dsr::string_mangleQuote(const ReadableString &rawText) {
  187. String result;
  188. string_reserve(result, rawText.view.length + 2);
  189. string_appendChar(result, U'\"'); // Begin quote
  190. for (intptr_t i = 0; i < rawText.view.length; i++) {
  191. DsrChar c = rawText[i];
  192. if (c == U'\"') { // Double quote
  193. string_append(result, U"\\\"");
  194. } else if (c == U'\\') { // Backslash
  195. string_append(result, U"\\\\");
  196. } else if (c == U'\a') { // Audible bell
  197. string_append(result, U"\\a");
  198. } else if (c == U'\b') { // Backspace
  199. string_append(result, U"\\b");
  200. } else if (c == U'\f') { // Form feed
  201. string_append(result, U"\\f");
  202. } else if (c == U'\n') { // Line feed
  203. string_append(result, U"\\n");
  204. } else if (c == U'\r') { // Carriage return
  205. string_append(result, U"\\r");
  206. } else if (c == U'\t') { // Horizontal tab
  207. string_append(result, U"\\t");
  208. } else if (c == U'\v') { // Vertical tab
  209. string_append(result, U"\\v");
  210. } else if (c == U'\0') { // Null terminator
  211. string_append(result, U"\\0");
  212. } else {
  213. string_appendChar(result, c);
  214. }
  215. }
  216. string_appendChar(result, U'\"'); // End quote
  217. return result;
  218. }
  219. String dsr::string_unmangleQuote(const ReadableString& mangledText) {
  220. intptr_t firstQuote = string_findFirst(mangledText, '\"');
  221. intptr_t lastQuote = string_findLast(mangledText, '\"');
  222. String result;
  223. if (firstQuote == -1 || lastQuote == -1 || firstQuote == lastQuote) {
  224. throwError(U"Cannot unmangle using string_unmangleQuote without beginning and ending with quote signs!\n", mangledText, U"\n");
  225. } else {
  226. for (intptr_t i = firstQuote + 1; i < lastQuote; i++) {
  227. DsrChar c = mangledText[i];
  228. if (c == U'\\') { // Escape character
  229. DsrChar c2 = mangledText[i + 1];
  230. if (c2 == U'\"') { // Double quote
  231. string_appendChar(result, U'\"');
  232. } else if (c2 == U'\\') { // Back slash
  233. string_appendChar(result, U'\\');
  234. } else if (c2 == U'a') { // Audible bell
  235. string_appendChar(result, U'\a');
  236. } else if (c2 == U'b') { // Backspace
  237. string_appendChar(result, U'\b');
  238. } else if (c2 == U'f') { // Form feed
  239. string_appendChar(result, U'\f');
  240. } else if (c2 == U'n') { // Line feed
  241. string_appendChar(result, U'\n');
  242. } else if (c2 == U'r') { // Carriage return
  243. string_appendChar(result, U'\r');
  244. } else if (c2 == U't') { // Horizontal tab
  245. string_appendChar(result, U'\t');
  246. } else if (c2 == U'v') { // Vertical tab
  247. string_appendChar(result, U'\v');
  248. } else if (c2 == U'0') { // Null terminator
  249. string_appendChar(result, U'\0');
  250. }
  251. i++; // Consume both characters
  252. } else {
  253. // Detect bad input
  254. if (c == U'\"') { // Double quote
  255. throwError(U"Unmangled double quote sign detected in string_unmangleQuote!\n", mangledText, U"\n");
  256. } else if (c == U'\a') { // Audible bell
  257. throwError(U"Unmangled audible bell detected in string_unmangleQuote!\n", mangledText, U"\n");
  258. } else if (c == U'\b') { // Backspace
  259. throwError(U"Unmangled backspace detected in string_unmangleQuote!\n", mangledText, U"\n");
  260. } else if (c == U'\f') { // Form feed
  261. throwError(U"Unmangled form feed detected in string_unmangleQuote!\n", mangledText, U"\n");
  262. } else if (c == U'\n') { // Line feed
  263. throwError(U"Unmangled line feed detected in string_unmangleQuote!\n", mangledText, U"\n");
  264. } else if (c == U'\r') { // Carriage return
  265. throwError(U"Unmangled carriage return detected in string_unmangleQuote!\n", mangledText, U"\n");
  266. } else if (c == U'\0') { // Null terminator
  267. throwError(U"Unmangled null terminator detected in string_unmangleQuote!\n", mangledText, U"\n");
  268. } else {
  269. string_appendChar(result, c);
  270. }
  271. }
  272. }
  273. }
  274. return result;
  275. }
  276. void dsr::string_fromUnsigned(String& target, uint64_t value) {
  277. static const int bufferSize = 20;
  278. DsrChar digits[bufferSize];
  279. int64_t usedSize = 0;
  280. if (value == 0) {
  281. string_appendChar(target, U'0');
  282. } else {
  283. while (usedSize < bufferSize) {
  284. DsrChar digit = U'0' + (value % 10u);
  285. digits[usedSize] = digit;
  286. usedSize++;
  287. value /= 10u;
  288. if (value == 0) {
  289. break;
  290. }
  291. }
  292. while (usedSize > 0) {
  293. usedSize--;
  294. string_appendChar(target, digits[usedSize]);
  295. }
  296. }
  297. }
  298. void dsr::string_fromSigned(String& target, int64_t value, DsrChar negationCharacter) {
  299. if (value >= 0) {
  300. string_fromUnsigned(target, (uint64_t)value);
  301. } else {
  302. string_appendChar(target, negationCharacter);
  303. string_fromUnsigned(target, (uint64_t)(-value));
  304. }
  305. }
  306. static const int MAX_DECIMALS = 16;
  307. static double decimalMultipliers[MAX_DECIMALS] = {
  308. 10.0,
  309. 100.0,
  310. 1000.0,
  311. 10000.0,
  312. 100000.0,
  313. 1000000.0,
  314. 10000000.0,
  315. 100000000.0,
  316. 1000000000.0,
  317. 10000000000.0,
  318. 100000000000.0,
  319. 1000000000000.0,
  320. 10000000000000.0,
  321. 100000000000000.0,
  322. 1000000000000000.0,
  323. 10000000000000000.0
  324. };
  325. static double roundingOffsets[MAX_DECIMALS] = {
  326. 0.05,
  327. 0.005,
  328. 0.0005,
  329. 0.00005,
  330. 0.000005,
  331. 0.0000005,
  332. 0.00000005,
  333. 0.000000005,
  334. 0.0000000005,
  335. 0.00000000005,
  336. 0.000000000005,
  337. 0.0000000000005,
  338. 0.00000000000005,
  339. 0.000000000000005,
  340. 0.0000000000000005,
  341. 0.00000000000000005
  342. };
  343. static uint64_t decimalLimits[MAX_DECIMALS] = {
  344. 9,
  345. 99,
  346. 999,
  347. 9999,
  348. 99999,
  349. 999999,
  350. 9999999,
  351. 99999999,
  352. 999999999,
  353. 9999999999,
  354. 99999999999,
  355. 999999999999,
  356. 9999999999999,
  357. 99999999999999,
  358. 999999999999999,
  359. 9999999999999999
  360. };
  361. void dsr::string_fromDouble(String& target, double value, int decimalCount, bool removeTrailingZeroes, DsrChar decimalCharacter, DsrChar negationCharacter) {
  362. if (decimalCount < 1) decimalCount = 1;
  363. if (decimalCount > MAX_DECIMALS) decimalCount = MAX_DECIMALS;
  364. double remainder = value;
  365. // Get negation
  366. if (remainder < 0.0) {
  367. string_appendChar(target, negationCharacter);
  368. remainder = -remainder;
  369. }
  370. // Apply an offset to make the following truncation round to the closest printable decimal.
  371. int offsetIndex = decimalCount - 1;
  372. remainder += roundingOffsets[offsetIndex];
  373. // Get whole part
  374. uint64_t whole = (uint64_t)remainder;
  375. string_fromUnsigned(target, whole);
  376. // Remove the whole part from the remainder.
  377. remainder = remainder - whole;
  378. // Print the decimal
  379. string_appendChar(target, decimalCharacter);
  380. // Get decimals
  381. uint64_t scaledDecimals = uint64_t(remainder * decimalMultipliers[offsetIndex]);
  382. // Limit decimals to all nines prevent losing a whole unit from fraction overflow.
  383. uint64_t limit = decimalLimits[offsetIndex];
  384. if (scaledDecimals > limit) scaledDecimals = limit;
  385. DsrChar digits[MAX_DECIMALS]; // Using 0 to decimalCount - 1
  386. int writeIndex = decimalCount - 1;
  387. for (int d = 0; d < decimalCount; d++) {
  388. int digit = scaledDecimals % 10;
  389. digits[writeIndex] = U'0' + digit;
  390. scaledDecimals = scaledDecimals / 10;
  391. writeIndex--;
  392. }
  393. if (removeTrailingZeroes) {
  394. // Find the last non-zero decimal, but keep at least one zero.
  395. int lastValue = 0;
  396. for (int d = 0; d < decimalCount; d++) {
  397. if (digits[d] != U'0') lastValue = d;
  398. }
  399. // Print until the last value or the only zero.
  400. for (int d = 0; d <= lastValue; d++) {
  401. string_appendChar(target, digits[d]);
  402. }
  403. } else {
  404. // Print fixed decimals.
  405. for (int d = 0; d < decimalCount; d++) {
  406. string_appendChar(target, digits[d]);
  407. }
  408. }
  409. }
// Declares a local char array named TARGET in the calling scope and fills it with SOURCE converted by toAscii, followed by a null terminator.
//   SOURCE is expected to offer .view.length and the [] operator, like ReadableString does in this file.
// NOTE(review): The array size is SOURCE.view.length + 1, which looks like a runtime value.
//   That would make it a variable length array (a compiler extension in C++) placed on the stack, so confirm that inputs are short.
#define TO_RAW_ASCII(TARGET, SOURCE) \
char TARGET[SOURCE.view.length + 1]; \
for (intptr_t i = 0; i < SOURCE.view.length; i++) { \
TARGET[i] = toAscii(SOURCE[i]); \
} \
TARGET[SOURCE.view.length] = '\0';
// A callback type for receiving a stream of bytes, called once per byte.
// Used instead of std's messy stream inheritance.
using ByteWriterFunction = std::function<void(uint8_t value)>;
// A callback type for receiving a stream of UTF-32 characters, called once per character.
// Used instead of std's messy stream inheritance.
using UTF32WriterFunction = std::function<void(DsrChar character)>;
  422. // Filter out unwanted characters for improved portability
  423. static void feedCharacter(const UTF32WriterFunction &receiver, DsrChar character) {
  424. if (character != U'\0' && character != U'\r') {
  425. receiver(character);
  426. }
  427. }
  428. // Appends the content of buffer as a BOM-free Latin-1 file into target
  429. // fileLength is ignored when nullTerminated is true
  430. template <bool nullTerminated>
  431. static void feedStringFromFileBuffer_Latin1(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  432. for (intptr_t i = 0; i < fileLength || nullTerminated; i++) {
  433. DsrChar character = (DsrChar)(buffer[i]);
  434. if (nullTerminated && character == 0) { return; }
  435. feedCharacter(receiver, character);
  436. }
  437. }
  438. // Appends the content of buffer as a BOM-free UTF-8 file into target
  439. // fileLength is ignored when nullTerminated is true
  440. template <bool nullTerminated>
  441. static void feedStringFromFileBuffer_UTF8(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  442. for (intptr_t i = 0; i < fileLength || nullTerminated; i++) {
  443. uint8_t byteA = buffer[i];
  444. if (byteA < (uint32_t)0b10000000) {
  445. // Single byte (1xxxxxxx)
  446. if (nullTerminated && byteA == 0) { return; }
  447. feedCharacter(receiver, (DsrChar)byteA);
  448. } else {
  449. uint32_t character = 0;
  450. int extraBytes = 0;
  451. if (byteA >= (uint32_t)0b11000000) { // At least two leading ones
  452. if (byteA < (uint32_t)0b11100000) { // Less than three leading ones
  453. character = byteA & (uint32_t)0b00011111;
  454. extraBytes = 1;
  455. } else if (byteA < (uint32_t)0b11110000) { // Less than four leading ones
  456. character = byteA & (uint32_t)0b00001111;
  457. extraBytes = 2;
  458. } else if (byteA < (uint32_t)0b11111000) { // Less than five leading ones
  459. character = byteA & (uint32_t)0b00000111;
  460. extraBytes = 3;
  461. } else {
  462. // Invalid UTF-8 format
  463. throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b111111xx!");
  464. }
  465. } else {
  466. // Invalid UTF-8 format
  467. throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b10xxxxxx!");
  468. }
  469. while (extraBytes > 0) {
  470. i += 1; uint32_t nextByte = buffer[i];
  471. character = (character << 6) | (nextByte & 0b00111111);
  472. extraBytes--;
  473. }
  474. feedCharacter(receiver, (DsrChar)character);
  475. }
  476. }
  477. }
  478. template <bool LittleEndian>
  479. uint16_t read16bits(const uint8_t* buffer, intptr_t startOffset) {
  480. uint16_t byteA = buffer[startOffset];
  481. uint16_t byteB = buffer[startOffset + 1];
  482. if (LittleEndian) {
  483. return (byteB << 8) | byteA;
  484. } else {
  485. return (byteA << 8) | byteB;
  486. }
  487. }
  488. // Appends the content of buffer as a BOM-free UTF-16 file into target as UTF-32
  489. // fileLength is ignored when nullTerminated is true
  490. template <bool LittleEndian, bool nullTerminated>
  491. static void feedStringFromFileBuffer_UTF16(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  492. for (intptr_t i = 0; i < fileLength || nullTerminated; i += 2) {
  493. // Read the first 16-bit word
  494. uint16_t wordA = read16bits<LittleEndian>(buffer, i);
  495. // Check if another word is needed
  496. // Assuming that wordA >= 0x0000 and wordA <= 0xFFFF as uint16_t,
  497. // we can just check if it's within the range reserved for 32-bit encoding
  498. if (wordA <= 0xD7FF || wordA >= 0xE000) {
  499. // Not in the reserved range, just a single 16-bit character
  500. if (nullTerminated && wordA == 0) { return; }
  501. feedCharacter(receiver, (DsrChar)wordA);
  502. } else {
  503. // The given range was reserved and therefore using 32 bits
  504. i += 2;
  505. uint16_t wordB = read16bits<LittleEndian>(buffer, i);
  506. uint32_t higher10Bits = wordA & (uint32_t)0b1111111111;
  507. uint32_t lower10Bits = wordB & (uint32_t)0b1111111111;
  508. DsrChar finalChar = (DsrChar)(((higher10Bits << 10) | lower10Bits) + (uint32_t)0x10000);
  509. feedCharacter(receiver, finalChar);
  510. }
  511. }
  512. }
  513. // Sends the decoded UTF-32 characters from the encoded buffer into target.
  514. // The text encoding should be specified using a BOM at the start of buffer, otherwise Latin-1 is assumed.
  515. static void feedStringFromFileBuffer(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength) {
  516. // After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
  517. if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { // UTF-8
  518. feedStringFromFileBuffer_UTF8<false>(receiver, buffer + 3, fileLength - 3);
  519. } else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) { // UTF-16 BE
  520. feedStringFromFileBuffer_UTF16<false, false>(receiver, buffer + 2, fileLength - 2);
  521. } else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) { // UTF-16 LE
  522. feedStringFromFileBuffer_UTF16<true, false>(receiver, buffer + 2, fileLength - 2);
  523. } else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) { // UTF-32 BE
  524. //feedStringFromFileBuffer_UTF32BE(receiver, buffer + 4, fileLength - 4);
  525. throwError(U"UTF-32 BE format is not yet supported!\n");
  526. } else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) { // UTF-32 LE
  527. //feedStringFromFileBuffer_UTF32BE(receiver, buffer + 4, fileLength - 4);
  528. throwError(U"UTF-32 LE format is not yet supported!\n");
  529. } else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) { // UTF-1
  530. //feedStringFromFileBuffer_UTF1(receiver, buffer + 3, fileLength - 3);
  531. throwError(U"UTF-1 format is not yet supported!\n");
  532. } else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) { // SCSU
  533. //feedStringFromFileBuffer_SCSU(receiver, buffer + 3, fileLength - 3);
  534. throwError(U"SCSU format is not yet supported!\n");
  535. } else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) { // BOCU
  536. //feedStringFromFileBuffer_BOCU-1(receiver, buffer + 3, fileLength - 3);
  537. throwError(U"BOCU-1 format is not yet supported!\n");
  538. } else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) { // UTF-7
  539. // Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
  540. throwError(U"UTF-7 format is not yet supported!\n");
  541. } else {
  542. // No BOM detected, assuming Latin-1 (because it directly corresponds to a unicode sub-set)
  543. feedStringFromFileBuffer_Latin1<false>(receiver, buffer, fileLength);
  544. }
  545. }
  546. // Sends the decoded UTF-32 characters from the encoded null terminated buffer into target.
  547. // buffer may not contain any BOM, and must be null terminated in the specified encoding.
  548. static void feedStringFromRawData(const UTF32WriterFunction &receiver, const uint8_t* buffer, CharacterEncoding encoding) {
  549. if (encoding == CharacterEncoding::Raw_Latin1) {
  550. feedStringFromFileBuffer_Latin1<true>(receiver, buffer);
  551. } else if (encoding == CharacterEncoding::BOM_UTF8) {
  552. feedStringFromFileBuffer_UTF8<true>(receiver, buffer);
  553. } else if (encoding == CharacterEncoding::BOM_UTF16BE) {
  554. feedStringFromFileBuffer_UTF16<false, true>(receiver, buffer);
  555. } else if (encoding == CharacterEncoding::BOM_UTF16LE) {
  556. feedStringFromFileBuffer_UTF16<true, true>(receiver, buffer);
  557. } else {
  558. throwError(U"Unhandled encoding in feedStringFromRawData!\n");
  559. }
  560. }
  561. String dsr::string_dangerous_decodeFromData(const void* data, CharacterEncoding encoding) {
  562. String result;
  563. // Measure the size of the result by scanning the content in advance
  564. intptr_t characterCount = 0;
  565. UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
  566. characterCount++;
  567. };
  568. feedStringFromRawData(measurer, (const uint8_t*)data, encoding);
  569. // Pre-allocate the correct amount of memory based on the simulation
  570. string_reserve(result, characterCount);
  571. // Stream output to the result string
  572. UTF32WriterFunction receiver = [&result](DsrChar character) {
  573. string_appendChar(result, character);
  574. };
  575. feedStringFromRawData(receiver, (const uint8_t*)data, encoding);
  576. return result;
  577. }
  578. String dsr::string_loadFromMemory(Buffer fileContent) {
  579. String result;
  580. // Measure the size of the result by scanning the content in advance
  581. intptr_t characterCount = 0;
  582. UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
  583. characterCount++;
  584. };
  585. feedStringFromFileBuffer(measurer, fileContent.getUnsafe(), fileContent.getUsedSize());
  586. // Pre-allocate the correct amount of memory based on the simulation
  587. string_reserve(result, characterCount);
  588. // Stream output to the result string
  589. UTF32WriterFunction receiver = [&result](DsrChar character) {
  590. string_appendChar(result, character);
  591. };
  592. feedStringFromFileBuffer(receiver, fileContent.getUnsafe(), fileContent.getUsedSize());
  593. return result;
  594. }
  595. // Loads a text file of unknown format
  596. // Removes carriage-return characters to make processing easy with only line-feed for breaking lines
  597. String dsr::string_load(const ReadableString& filename, bool mustExist) {
  598. Buffer encoded = file_loadBuffer(filename, mustExist);
  599. if (!buffer_exists(encoded)) {
  600. return String();
  601. } else {
  602. return string_loadFromMemory(encoded);
  603. }
  604. }
  605. template <CharacterEncoding characterEncoding>
  606. static void encodeCharacter(const ByteWriterFunction &receiver, DsrChar character) {
  607. if (characterEncoding == CharacterEncoding::Raw_Latin1) {
  608. // Replace any illegal characters with questionmarks
  609. if (character > 255) { character = U'?'; }
  610. receiver(character);
  611. } else if (characterEncoding == CharacterEncoding::BOM_UTF8) {
  612. // Replace any illegal characters with questionmarks
  613. if (character > 0x10FFFF) { character = U'?'; }
  614. if (character < (1 << 7)) {
  615. // 0xxxxxxx
  616. receiver(character);
  617. } else if (character < (1 << 11)) {
  618. // 110xxxxx 10xxxxxx
  619. receiver((uint32_t)0b11000000 | ((character & ((uint32_t)0b11111 << 6)) >> 6));
  620. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  621. } else if (character < (1 << 16)) {
  622. // 1110xxxx 10xxxxxx 10xxxxxx
  623. receiver((uint32_t)0b11100000 | ((character & ((uint32_t)0b1111 << 12)) >> 12));
  624. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 6)) >> 6));
  625. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  626. } else if (character < (1 << 21)) {
  627. // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  628. receiver((uint32_t)0b11110000 | ((character & ((uint32_t)0b111 << 18)) >> 18));
  629. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 12)) >> 12));
  630. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 6)) >> 6));
  631. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  632. }
  633. } else { // Assuming UTF-16
  634. if (character > 0x10FFFF) { character = U'?'; }
  635. if (character <= 0xD7FF || (character >= 0xE000 && character <= 0xFFFF)) {
  636. // xxxxxxxx xxxxxxxx (Limited range)
  637. uint32_t higher8Bits = (character & (uint32_t)0b1111111100000000) >> 8;
  638. uint32_t lower8Bits = character & (uint32_t)0b0000000011111111;
  639. if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  640. receiver(higher8Bits);
  641. receiver(lower8Bits);
  642. } else { // Assuming UTF-16 LE
  643. receiver(lower8Bits);
  644. receiver(higher8Bits);
  645. }
  646. } else if (character >= 0x010000 && character <= 0x10FFFF) {
  647. // 110110xxxxxxxxxx 110111xxxxxxxxxx
  648. uint32_t code = character - (uint32_t)0x10000;
  649. uint32_t byteA = ((code & (uint32_t)0b11000000000000000000) >> 18) | (uint32_t)0b11011000;
  650. uint32_t byteB = (code & (uint32_t)0b00111111110000000000) >> 10;
  651. uint32_t byteC = ((code & (uint32_t)0b00000000001100000000) >> 8) | (uint32_t)0b11011100;
  652. uint32_t byteD = code & (uint32_t)0b00000000000011111111;
  653. if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  654. receiver(byteA);
  655. receiver(byteB);
  656. receiver(byteC);
  657. receiver(byteD);
  658. } else { // Assuming UTF-16 LE
  659. receiver(byteB);
  660. receiver(byteA);
  661. receiver(byteD);
  662. receiver(byteC);
  663. }
  664. }
  665. }
  666. }
  667. // Template for encoding a whole string
  668. template <CharacterEncoding characterEncoding, LineEncoding lineEncoding>
  669. static void encodeText(const ByteWriterFunction &receiver, String content, bool writeBOM, bool writeNullTerminator) {
  670. if (writeBOM) {
  671. // Write byte order marks
  672. if (characterEncoding == CharacterEncoding::BOM_UTF8) {
  673. receiver(0xEF);
  674. receiver(0xBB);
  675. receiver(0xBF);
  676. } else if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  677. receiver(0xFE);
  678. receiver(0xFF);
  679. } else if (characterEncoding == CharacterEncoding::BOM_UTF16LE) {
  680. receiver(0xFF);
  681. receiver(0xFE);
  682. }
  683. }
  684. // Write encoded content
  685. for (intptr_t i = 0; i < string_length(content); i++) {
  686. DsrChar character = content[i];
  687. if (character == U'\n') {
  688. if (lineEncoding == LineEncoding::CrLf) {
  689. encodeCharacter<characterEncoding>(receiver, U'\r');
  690. encodeCharacter<characterEncoding>(receiver, U'\n');
  691. } else { // Assuming that lineEncoding == LineEncoding::Lf
  692. encodeCharacter<characterEncoding>(receiver, U'\n');
  693. }
  694. } else {
  695. encodeCharacter<characterEncoding>(receiver, character);
  696. }
  697. }
  698. if (writeNullTerminator) {
  699. // Terminate internal strings with \0 to prevent getting garbage data after unpadded buffers
  700. if (characterEncoding == CharacterEncoding::BOM_UTF16BE || characterEncoding == CharacterEncoding::BOM_UTF16LE) {
  701. receiver(0);
  702. receiver(0);
  703. } else {
  704. receiver(0);
  705. }
  706. }
  707. }
  708. // Macro for converting run-time arguments into template arguments for encodeText
  709. #define ENCODE_TEXT(RECEIVER, CONTENT, CHAR_ENCODING, LINE_ENCODING, WRITE_BOM, WRITE_NULL_TERMINATOR) \
  710. if (CHAR_ENCODING == CharacterEncoding::Raw_Latin1) { \
  711. if (LINE_ENCODING == LineEncoding::CrLf) { \
  712. encodeText<CharacterEncoding::Raw_Latin1, LineEncoding::CrLf>(RECEIVER, CONTENT, false, WRITE_NULL_TERMINATOR); \
  713. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  714. encodeText<CharacterEncoding::Raw_Latin1, LineEncoding::Lf>(RECEIVER, CONTENT, false, WRITE_NULL_TERMINATOR); \
  715. } \
  716. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF8) { \
  717. if (LINE_ENCODING == LineEncoding::CrLf) { \
  718. encodeText<CharacterEncoding::BOM_UTF8, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  719. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  720. encodeText<CharacterEncoding::BOM_UTF8, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  721. } \
  722. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF16BE) { \
  723. if (LINE_ENCODING == LineEncoding::CrLf) { \
  724. encodeText<CharacterEncoding::BOM_UTF16BE, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  725. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  726. encodeText<CharacterEncoding::BOM_UTF16BE, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  727. } \
  728. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF16LE) { \
  729. if (LINE_ENCODING == LineEncoding::CrLf) { \
  730. encodeText<CharacterEncoding::BOM_UTF16LE, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  731. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  732. encodeText<CharacterEncoding::BOM_UTF16LE, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  733. } \
  734. }
  735. // Encoding to a buffer before saving all at once as a binary file.
  736. // This tells the operating system how big the file is in advance and prevent the worst case of stalling for minutes!
  737. bool dsr::string_save(const ReadableString& filename, const ReadableString& content, CharacterEncoding characterEncoding, LineEncoding lineEncoding) {
  738. Buffer buffer = string_saveToMemory(content, characterEncoding, lineEncoding);
  739. if (buffer_exists(buffer)) {
  740. return file_saveBuffer(filename, buffer);
  741. } else {
  742. return false;
  743. }
  744. }
  745. Buffer dsr::string_saveToMemory(const ReadableString& content, CharacterEncoding characterEncoding, LineEncoding lineEncoding, bool writeByteOrderMark, bool writeNullTerminator) {
  746. intptr_t byteCount = 0;
  747. ByteWriterFunction counter = [&byteCount](uint8_t value) {
  748. byteCount++;
  749. };
  750. ENCODE_TEXT(counter, content, characterEncoding, lineEncoding, writeByteOrderMark, writeNullTerminator);
  751. Buffer result = buffer_create(byteCount).setName("Buffer holding an encoded string");
  752. SafePointer<uint8_t> byteWriter = buffer_getSafeData<uint8_t>(result, "Buffer for string encoding");
  753. ByteWriterFunction receiver = [&byteWriter](uint8_t value) {
  754. *byteWriter = value;
  755. byteWriter += 1;
  756. };
  757. ENCODE_TEXT(receiver, content, characterEncoding, lineEncoding, writeByteOrderMark, writeNullTerminator);
  758. return result;
  759. }
  760. static uintptr_t getStartOffset(const ReadableString &source) {
  761. // Get the allocation
  762. const uint8_t* origin = (uint8_t*)(source.characters.getUnsafe());
  763. const uint8_t* start = (uint8_t*)(source.view.getUnchecked());
  764. assert(start <= origin);
  765. // Get the offset from the parent
  766. return (start - origin) / sizeof(DsrChar);
  767. }
  768. #ifdef SAFE_POINTER_CHECKS
  769. static void serializeCharacterBuffer(PrintCharacter target, void const * const allocation, uintptr_t maxLength) {
  770. uintptr_t characterCount = heap_getUsedSize(allocation) / sizeof(DsrChar);
  771. target(U'\"');
  772. for (uintptr_t c = 0; c < characterCount; c++) {
  773. if (c == maxLength) {
  774. target(U'\"');
  775. target(U'.');
  776. target(U'.');
  777. target(U'.');
  778. return;
  779. }
  780. target(((DsrChar *)allocation)[c]);
  781. }
  782. target(U'\"');
  783. }
  784. #endif
  785. static Handle<DsrChar> allocateCharacters(intptr_t minimumLength) {
  786. // Allocate memory.
  787. Handle<DsrChar> result = handle_createArray<DsrChar>(AllocationInitialization::Uninitialized, minimumLength).setName("String characters");
  788. #ifdef SAFE_POINTER_CHECKS
  789. setAllocationSerialization(result.getUnsafe(), &serializeCharacterBuffer);
  790. #endif
  791. // Check how much space we got.
  792. uintptr_t availableSpace = heap_getAllocationSize(result.getUnsafe());
  793. // Expand to use all available memory in the allocation.
  794. uintptr_t newSize = heap_setUsedSize(result.getUnsafe(), availableSpace);
  795. // Clear the memory to zeroes, just to be safe against non-deterministic bugs.
  796. safeMemorySet(result.getSafe("Cleared String pointer"), 0, newSize);
  797. return result;
  798. }
  799. // Replaces the buffer with a new buffer holding at least minimumLength characters
  800. // Guarantees that the new buffer is not shared by other strings, so that it may be written to freely
  801. static void reallocateBuffer(String &target, intptr_t minimumLength, bool preserve) {
  802. // Holding oldData alive while copying to the new buffer
  803. Handle<DsrChar> oldBuffer = target.characters; // Kept for reference counting only, do not remove.
  804. Impl_CharacterView oldData = target.view;
  805. target.characters = allocateCharacters(minimumLength);
  806. target.view = Impl_CharacterView(target.characters.getUnsafe(), oldData.length);
  807. if (preserve && oldData.length > 0) {
  808. safeMemoryCopy(target.view.getSafe("New characters being copied from an old buffer"), oldData.getSafe("Old characters being copied to a new buffer"), oldData.length * sizeof(DsrChar));
  809. }
  810. }
  811. // Call before writing to the buffer.
  812. // This hides that Strings share buffers when assigning by value or taking partial strings.
  813. static void cloneIfNeeded(String &target) {
  814. // If there is no buffer or the buffer is shared, it needs to allocate its own buffer.
  815. if (target.characters.isNull() || target.characters.getUseCount() > 1) {
  816. reallocateBuffer(target, target.view.length, true);
  817. }
  818. }
  819. void dsr::string_clear(String& target) {
  820. // We we start writing from the beginning, then we must have our own allocation to avoid overwriting the characters in other strings.
  821. cloneIfNeeded(target);
  822. target.view.length = 0;
  823. }
  824. // The number of DsrChar characters that can be contained in the allocation before reaching the buffer's end
  825. // This doesn't imply that it's always okay to write to the remaining space, because the buffer may be shared
  826. static intptr_t getCapacity(const ReadableString &source) {
  827. if (source.characters.isNotNull()) {
  828. uintptr_t bufferElements = source.characters.getElementCount();
  829. // Subtract offset from the buffer size to get the remaining space
  830. return bufferElements - getStartOffset(source);
  831. } else {
  832. return 0;
  833. }
  834. }
  835. static void expand(String &target, intptr_t newLength, bool affectUsedLength) {
  836. cloneIfNeeded(target);
  837. if (newLength > target.view.length) {
  838. if (newLength > getCapacity(target)) {
  839. reallocateBuffer(target, newLength, true);
  840. }
  841. if (affectUsedLength) {
  842. target.view.length = newLength;
  843. }
  844. }
  845. }
  846. void dsr::string_reserve(String& target, intptr_t minimumLength) {
  847. expand(target, minimumLength, false);
  848. }
  849. // This macro has to be used because a static template wouldn't be able to inherit access to private methods from the target class.
  850. // Better to use a macro without type safety in the implementation than to expose yet another template in a global header.
  851. // Proof that appending to one string doesn't affect another:
  852. // If it has to reallocate
  853. // * Then it will have its own buffer without conflicts
  854. // If it doesn't have to reallocate
  855. // If it shares the buffer
  856. // If source is empty
  857. // * Then no risk of overwriting neighbor strings if we don't write
  858. // If source isn't empty
  859. // * Then the buffer will be cloned when the first character is written
  860. // If it doesn't share the buffer
  861. // * Then no risk of writing
  862. #define APPEND(TARGET, SOURCE, LENGTH, MASK) { \
  863. intptr_t oldLength = (TARGET).view.length; \
  864. expand((TARGET), oldLength + (intptr_t)(LENGTH), true); \
  865. for (intptr_t i = 0; i < (intptr_t)(LENGTH); i++) { \
  866. (TARGET).view.writeCharacter(oldLength + i, ((SOURCE)[i]) & MASK); \
  867. } \
  868. }
  869. // TODO: See if ascii litterals can be checked for values above 127 in compile-time
  870. static void atomic_append_ascii(String &target, const char* source) { APPEND(target, source, strlen(source), 0xFF); }
  871. // TODO: Use memcpy when appending input of the same format
  872. static void atomic_append_readable(String &target, const ReadableString& source) { APPEND(target, source, source.view.length, 0xFFFFFFFF); }
  873. static void atomic_append_utf32(String &target, const DsrChar* source) { APPEND(target, source, strlen_utf32(source), 0xFFFFFFFF); }
  874. void dsr::string_appendChar(String& target, DsrChar value) { APPEND(target, &value, 1, 0xFFFFFFFF); }
  875. String& dsr::impl_toStreamIndented_ascii(String& target, const char *value, const ReadableString& indentation) {
  876. atomic_append_readable(target, indentation);
  877. atomic_append_ascii(target, value);
  878. return target;
  879. }
  880. String& dsr::impl_toStreamIndented_utf32(String& target, const char32_t *value, const ReadableString& indentation) {
  881. atomic_append_readable(target, indentation);
  882. atomic_append_utf32(target, value);
  883. return target;
  884. }
  885. String& dsr::impl_toStreamIndented_readable(String& target, const ReadableString& value, const ReadableString& indentation) {
  886. atomic_append_readable(target, indentation);
  887. atomic_append_readable(target, value);
  888. return target;
  889. }
  890. String& dsr::impl_toStreamIndented_double(String& target, const double &value, const ReadableString& indentation) {
  891. atomic_append_readable(target, indentation);
  892. string_fromDouble(target, (double)value);
  893. return target;
  894. }
  895. String& dsr::impl_toStreamIndented_int64(String& target, const int64_t &value, const ReadableString& indentation) {
  896. atomic_append_readable(target, indentation);
  897. string_fromSigned(target, value);
  898. return target;
  899. }
  900. String& dsr::impl_toStreamIndented_uint64(String& target, const uint64_t &value, const ReadableString& indentation) {
  901. atomic_append_readable(target, indentation);
  902. string_fromUnsigned(target, value);
  903. return target;
  904. }
  905. // The print mutex makes sure that messages from multiple threads don't get mixed up.
  906. static std::mutex printMutex;
  907. static std::ostream& toStream(std::ostream& out, const ReadableString &source) {
  908. for (intptr_t i = 0; i < source.view.length; i++) {
  909. out.put(toAscii(source.view[i]));
  910. }
  911. return out;
  912. }
  913. static const std::function<void(const ReadableString &message, MessageType type)> defaultMessageAction = [](const ReadableString &message, MessageType type) {
  914. if (type == MessageType::Error) {
  915. #ifdef DSR_HARD_EXIT_ON_ERROR
  916. // Print the error.
  917. toStream(std::cerr, message);
  918. // Free all heap allocations.
  919. heap_hardExitCleaning();
  920. // Terminate with a non-zero value to indicate failure.
  921. std::exit(1);
  922. #else
  923. Buffer ascii = string_saveToMemory(message, CharacterEncoding::Raw_Latin1, LineEncoding::CrLf, false, true);
  924. throw std::runtime_error((char*)ascii.getUnsafe());
  925. #endif
  926. } else {
  927. printMutex.lock();
  928. toStream(std::cout, message);
  929. printMutex.unlock();
  930. }
  931. };
  932. static std::function<void(const ReadableString &message, MessageType type)> globalMessageAction = defaultMessageAction;
  933. void dsr::string_sendMessage(const ReadableString &message, MessageType type) {
  934. globalMessageAction(message, type);
  935. }
  936. void dsr::string_sendMessage_default(const ReadableString &message, MessageType type) {
  937. defaultMessageAction(message, type);
  938. }
  939. void dsr::string_assignMessageHandler(std::function<void(const ReadableString &message, MessageType type)> newHandler) {
  940. globalMessageAction = newHandler;
  941. }
  942. void dsr::string_unassignMessageHandler() {
  943. globalMessageAction = defaultMessageAction;
  944. }
  945. void dsr::string_split_callback(std::function<void(ReadableString separatedText)> action, const ReadableString& source, DsrChar separator, bool removeWhiteSpace) {
  946. intptr_t sectionStart = 0;
  947. for (intptr_t i = 0; i < source.view.length; i++) {
  948. DsrChar c = source[i];
  949. if (c == separator) {
  950. ReadableString element = string_exclusiveRange(source, sectionStart, i);
  951. if (removeWhiteSpace) {
  952. action(string_removeOuterWhiteSpace(element));
  953. } else {
  954. action(element);
  955. }
  956. sectionStart = i + 1;
  957. }
  958. }
  959. if (source.view.length > sectionStart) {
  960. if (removeWhiteSpace) {
  961. action(string_removeOuterWhiteSpace(string_exclusiveRange(source, sectionStart, source.view.length)));
  962. } else {
  963. action(string_exclusiveRange(source, sectionStart, source.view.length));
  964. }
  965. }
  966. }
  967. static String createSubString(const Handle<DsrChar> &characters, const Impl_CharacterView &view) {
  968. String result;
  969. result.characters = characters;
  970. result.view = view;
  971. return result;
  972. }
  973. List<String> dsr::string_split(const ReadableString& source, DsrChar separator, bool removeWhiteSpace) {
  974. List<String> result;
  975. if (source.view.length > 0) {
  976. // Re-use the existing buffer
  977. String commonBuffer = createSubString(source.characters, source.view);
  978. // Source is allocated as String
  979. string_split_callback([&result, removeWhiteSpace](String element) {
  980. if (removeWhiteSpace) {
  981. result.push(string_removeOuterWhiteSpace(element));
  982. } else {
  983. result.push(element);
  984. }
  985. }, commonBuffer, separator, removeWhiteSpace);
  986. }
  987. return result;
  988. }
  989. intptr_t dsr::string_splitCount(const ReadableString& source, DsrChar separator) {
  990. intptr_t result = 0;
  991. string_split_callback([&result](ReadableString element) {
  992. result++;
  993. }, source, separator);
  994. return result;
  995. }
  996. int64_t dsr::string_toInteger(const ReadableString& source) {
  997. int64_t result;
  998. bool negated;
  999. result = 0;
  1000. negated = false;
  1001. for (intptr_t i = 0; i < source.view.length; i++) {
  1002. DsrChar c = source[i];
  1003. if (c == '-' || c == '~') {
  1004. negated = !negated;
  1005. } else if (c >= '0' && c <= '9') {
  1006. result = (result * 10) + (int)(c - '0');
  1007. } else if (c == ',' || c == '.') {
  1008. // Truncate any decimals by ignoring them
  1009. break;
  1010. }
  1011. }
  1012. if (negated) {
  1013. return -result;
  1014. } else {
  1015. return result;
  1016. }
  1017. }
  1018. double dsr::string_toDouble(const ReadableString& source) {
  1019. double result;
  1020. bool negated;
  1021. bool reachedDecimal;
  1022. int64_t digitDivider;
  1023. result = 0.0;
  1024. negated = false;
  1025. reachedDecimal = false;
  1026. digitDivider = 1;
  1027. for (intptr_t i = 0; i < source.view.length; i++) {
  1028. DsrChar c = source[i];
  1029. if (c == '-' || c == '~') {
  1030. negated = !negated;
  1031. } else if (c >= '0' && c <= '9') {
  1032. if (reachedDecimal) {
  1033. digitDivider = digitDivider * 10;
  1034. result = result + ((double)(c - '0') / (double)digitDivider);
  1035. } else {
  1036. result = (result * 10) + (double)(c - '0');
  1037. }
  1038. } else if (c == ',' || c == '.') {
  1039. reachedDecimal = true;
  1040. } else if (c == 'e' || c == 'E') {
  1041. // Apply the exponent after 'e'.
  1042. result *= std::pow(10.0, string_toInteger(string_after(source, i)));
  1043. // Skip remaining characters.
  1044. i = source.view.length;
  1045. }
  1046. }
  1047. if (negated) {
  1048. return -result;
  1049. } else {
  1050. return result;
  1051. }
  1052. }
  1053. intptr_t dsr::string_length(const ReadableString& source) {
  1054. return source.view.length;
  1055. }
  1056. intptr_t dsr::string_findFirst(const ReadableString& source, DsrChar toFind, intptr_t startIndex) {
  1057. for (intptr_t i = startIndex; i < source.view.length; i++) {
  1058. if (source[i] == toFind) {
  1059. return i;
  1060. }
  1061. }
  1062. return -1;
  1063. }
  1064. intptr_t dsr::string_findLast(const ReadableString& source, DsrChar toFind) {
  1065. for (intptr_t i = source.view.length - 1; i >= 0; i--) {
  1066. if (source[i] == toFind) {
  1067. return i;
  1068. }
  1069. }
  1070. return -1;
  1071. }
  1072. ReadableString dsr::string_exclusiveRange(const ReadableString& source, intptr_t inclusiveStart, intptr_t exclusiveEnd) {
  1073. // Return empty string for each complete miss
  1074. if (inclusiveStart >= source.view.length || exclusiveEnd <= 0) { return ReadableString(); }
  1075. // Automatically clamping to valid range
  1076. if (inclusiveStart < 0) { inclusiveStart = 0; }
  1077. if (exclusiveEnd > source.view.length) { exclusiveEnd = source.view.length; }
  1078. // Return the overlapping interval
  1079. return createSubString(source.characters, Impl_CharacterView(source.view.getUnchecked() + inclusiveStart, exclusiveEnd - inclusiveStart));
  1080. }
  1081. ReadableString dsr::string_inclusiveRange(const ReadableString& source, intptr_t inclusiveStart, intptr_t inclusiveEnd) {
  1082. return string_exclusiveRange(source, inclusiveStart, inclusiveEnd + 1);
  1083. }
  1084. ReadableString dsr::string_before(const ReadableString& source, intptr_t exclusiveEnd) {
  1085. return string_exclusiveRange(source, 0, exclusiveEnd);
  1086. }
  1087. ReadableString dsr::string_until(const ReadableString& source, intptr_t inclusiveEnd) {
  1088. return string_inclusiveRange(source, 0, inclusiveEnd);
  1089. }
  1090. ReadableString dsr::string_from(const ReadableString& source, intptr_t inclusiveStart) {
  1091. return string_exclusiveRange(source, inclusiveStart, source.view.length);
  1092. }
  1093. ReadableString dsr::string_after(const ReadableString& source, intptr_t exclusiveStart) {
  1094. return string_from(source, exclusiveStart + 1);
  1095. }
  1096. bool dsr::character_isDigit(DsrChar c) {
  1097. return c >= U'0' && c <= U'9';
  1098. }
  1099. bool dsr::character_isIntegerCharacter(DsrChar c) {
  1100. return c == U'-' || character_isDigit(c);
  1101. }
  1102. bool dsr::character_isValueCharacter(DsrChar c) {
  1103. return c == U'.' || character_isIntegerCharacter(c);
  1104. }
  1105. bool dsr::character_isWhiteSpace(DsrChar c) {
  1106. return c == U' ' || c == U'\t' || c == U'\v' || c == U'\f' || c == U'\n' || c == U'\r';
  1107. }
  1108. // Macros for implementing regular expressions with a greedy approach consuming the first match
  1109. // Optional accepts 0 or 1 occurence
  1110. // Forced accepts 1 occurence
  1111. // Star accepts 0..N occurence
  1112. // Plus accepts 1..N occurence
  1113. #define CHARACTER_OPTIONAL(CHARACTER) if (source[readIndex] == CHARACTER) { readIndex++; }
  1114. #define CHARACTER_FORCED(CHARACTER) if (source[readIndex] == CHARACTER) { readIndex++; } else { return false; }
  1115. #define CHARACTER_STAR(CHARACTER) while (source[readIndex] == CHARACTER) { readIndex++; }
  1116. #define CHARACTER_PLUS(CHARACTER) CHARACTER_FORCED(CHARACTER) CHARACTER_STAR(CHARACTER)
  1117. #define PATTERN_OPTIONAL(PATTERN) if (character_is##PATTERN(source[readIndex])) { readIndex++; }
  1118. #define PATTERN_FORCED(PATTERN) if (character_is##PATTERN(source[readIndex])) { readIndex++; } else { return false; }
  1119. #define PATTERN_STAR(PATTERN) while (character_is##PATTERN(source[readIndex])) { readIndex++; }
  1120. #define PATTERN_PLUS(PATTERN) PATTERN_FORCED(PATTERN) PATTERN_STAR(PATTERN)
  1121. // The greedy approach works here, because there's no ambiguity
  1122. bool dsr::string_isInteger(const ReadableString& source, bool allowWhiteSpace) {
  1123. intptr_t readIndex = 0;
  1124. if (allowWhiteSpace) {
  1125. PATTERN_STAR(WhiteSpace);
  1126. }
  1127. CHARACTER_OPTIONAL(U'-');
  1128. // At least one digit required
  1129. PATTERN_PLUS(IntegerCharacter);
  1130. if (allowWhiteSpace) {
  1131. PATTERN_STAR(WhiteSpace);
  1132. }
  1133. return readIndex == source.view.length;
  1134. }
  1135. // To avoid consuming the all digits on Digit* before reaching Digit+ when there is no decimal, whole integers are judged by string_isInteger
  1136. bool dsr::string_isDouble(const ReadableString& source, bool allowWhiteSpace) {
  1137. // Solving the UnsignedDouble <- Digit+ | Digit* '.' Digit+ ambiguity is done easiest by checking if there's a decimal before handling the white-space and negation
  1138. if (string_findFirst(source, U'.') == -1) {
  1139. // No decimal detected
  1140. return string_isInteger(source, allowWhiteSpace);
  1141. } else {
  1142. intptr_t readIndex = 0;
  1143. if (allowWhiteSpace) {
  1144. PATTERN_STAR(WhiteSpace);
  1145. }
  1146. // Double <- UnsignedDouble | '-' UnsignedDouble
  1147. CHARACTER_OPTIONAL(U'-');
  1148. // UnsignedDouble <- Digit* '.' Digit+
  1149. // Any number of integer digits
  1150. PATTERN_STAR(IntegerCharacter);
  1151. // Only dot for decimal
  1152. CHARACTER_FORCED(U'.')
  1153. // At least one decimal digit
  1154. PATTERN_PLUS(IntegerCharacter);
  1155. if (allowWhiteSpace) {
  1156. PATTERN_STAR(WhiteSpace);
  1157. }
  1158. return readIndex == source.view.length;
  1159. }
  1160. }
  1161. uintptr_t dsr::string_getBufferUseCount(const ReadableString& text) {
  1162. return text.characters.getUseCount();
  1163. }