stringAPI.cpp 72 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2025 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Gets access to private members by making them public for the whole module
  24. #define DSR_INTERNAL_ACCESS
  25. #include <iostream>
  26. #include <sstream>
  27. #include <fstream>
  28. #include <streambuf>
  29. #include <thread>
  30. #include <mutex>
  31. #include <stdexcept>
  32. #include <cmath>
  33. #include "stringAPI.h"
  34. #include "../api/fileAPI.h"
  35. #include "../settings.h"
  36. using namespace dsr;
  37. // The print buffer keeps its buffer size from previous printing to avoid reallocating memory every time something is printed.
  38. // It is stored separatelly for each calling thread to avoid conflicts.
  39. static thread_local String printBuffer;
  40. String &dsr::string_getPrintBuffer() {
  41. return printBuffer;
  42. }
  43. static void atomic_append_ascii(String &target, const char* source);
  44. static void atomic_append_readable(String &target, const ReadableString& source);
  45. static void atomic_append_utf32(String &target, const DsrChar* source);
  46. static intptr_t strlen_utf32(const DsrChar *content) {
  47. intptr_t length = 0;
  48. while (content[length] != 0) {
  49. length++;
  50. }
  51. return length;
  52. }
  53. static char toAscii(DsrChar c) {
  54. if (c > 127) {
  55. return '?';
  56. } else {
  57. return c;
  58. }
  59. }
  60. ReadableString::ReadableString(const DsrChar *content)
  61. : view(content, strlen_utf32(content)) {}
  62. String::String() {}
  63. String::String(const char* source) { atomic_append_ascii(*this, source); }
  64. String::String(const DsrChar* source) { atomic_append_utf32(*this, source); }
  65. String& Printable::toStream(String& target) const {
  66. return this->toStreamIndented(target, U"");
  67. }
  68. String Printable::toStringIndented(const ReadableString& indentation) const {
  69. String result;
  70. this->toStreamIndented(result, indentation);
  71. return result;
  72. }
  73. String Printable::toString() const {
  74. return this->toStringIndented(U"");
  75. }
  76. Printable::~Printable() {}
  77. /*
  78. Code generator used to create character transforming functions from arbitrary reference functions.
  79. Paste the result into functions that provide character, odd and even before the returning if statement begins.
  80. static void generateCharacterRange(String &result, DsrChar firstIn, DsrChar lastIn, int64_t stride, int64_t offset) {
  81. DsrChar firstOut = firstIn + offset;
  82. DsrChar lastOut = lastIn + offset;
  83. if (string_length(result) == 0) {
  84. string_append(result, U" ");
  85. } else {
  86. string_append(result, U" } else ");
  87. }
  88. if (firstIn == lastIn) {
  89. string_append(result, U"if (character == U'", firstIn, "') { // ", firstIn, U" (", (uint32_t)firstIn, U")\n");
  90. string_append(result, U" return U'", firstOut, "'; // ", firstOut, U" (", (uint32_t)firstOut, U")\n");
  91. } else {
  92. string_append(result, U"if (U'", firstIn, "' <= character && character <= U'", lastIn, U"'");
  93. if (stride == 2) {
  94. if (firstIn & DsrChar(1)) {
  95. // Odd interval
  96. string_append(result, U" && odd");
  97. } else {
  98. // Even interval
  99. string_append(result, U" && even");
  100. }
  101. } else if (stride != 1) {
  102. throwError(U"Unsupported stride ", stride, U"!\n");
  103. }
  104. string_append(result, U") { // ", firstIn, U" (", (uint32_t)firstIn, U") to ", lastIn, U" (", (uint32_t)lastIn, U")\n");
  105. if (firstOut > firstIn) {
  106. string_append(result, U" return character + ", offset, ";");
  107. } else if (firstOut < firstIn) {
  108. string_append(result, U" return character - ", -offset, ";");
  109. }
  110. string_append(result, U"// ", firstOut, U" (", (uint32_t)firstOut, U") to ", lastOut, U" (", (uint32_t)lastOut, U")\n");
  111. }
  112. }
  113. // Pre-condition: The transform function must change at least one character.
  114. static String generateCharacterMapping(std::function<DsrChar(const DsrChar character)> transform, DsrChar first, DsrChar last) {
  115. String result;
  116. int64_t rangeStart = -1;
  117. int64_t rangeEnd = -1;
  118. int64_t lastOffset = -1;
  119. int64_t currentStride = -1;
  120. for (int64_t c = first; c <= last; c++) {
  121. int64_t t = transform(c);
  122. if (c != t) {
  123. int64_t offset = int64_t(t) - int64_t(c);
  124. int64_t step = c - rangeEnd;
  125. // Check if we should break apart the previous range.
  126. if ((currentStride != -1 && step != currentStride)
  127. || step > 2
  128. || (lastOffset != -1 && offset != lastOffset)) {
  129. if (rangeStart != -1) {
  130. generateCharacterRange(result, rangeStart, rangeEnd, currentStride, lastOffset);
  131. }
  132. rangeStart = c;
  133. rangeEnd = c;
  134. lastOffset = offset;
  135. currentStride = -1;
  136. } else {
  137. rangeEnd = c;
  138. lastOffset = offset;
  139. currentStride = step;
  140. }
  141. }
  142. }
  143. // Generate the last range, while assuming that we have at least one character to modify.
  144. if (rangeStart != -1) {
  145. generateCharacterRange(result, rangeStart, rangeEnd, currentStride, lastOffset);
  146. }
  147. string_append(result, U" } else {\n");
  148. string_append(result, U" return character;\n");
  149. string_append(result, U" }\n");
  150. return result;
  151. }
  152. */
  153. DsrChar dsr::character_upperCase(DsrChar character) {
  154. if (character < 256) {
  155. if (U'a' <= character && character <= U'z') { // a (97) to z (122)
  156. return character - 32; // A (65) to Z (90)
  157. } else if (character == U'ß') { // ß (223)
  158. return U'ẞ'; // ẞ (7838)
  159. } else if (U'à' <= character && character <= U'ö') { // à (224) to ö (246)
  160. return character - 32; // À (192) to Ö (214)
  161. } else if (U'ø' <= character && character <= U'þ') { // ø (248) to þ (254)
  162. return character - 32; // Ø (216) to Þ (222)
  163. } else if (character == U'ÿ') { // ÿ (255)
  164. return U'Ÿ'; // Ÿ (376)
  165. } else {
  166. return character;
  167. }
  168. } else {
  169. bool odd = character & DsrChar(1);
  170. bool even = !odd;
  171. if (U'ā' <= character && character <= U'ķ' && odd) { // ā (257) to ķ (311)
  172. return character - 1; // Ā (256) to Ķ (310)
  173. } else if (U'ĺ' <= character && character <= U'ň' && even) { // ĺ (314) to ň (328)
  174. return character - 1; // Ĺ (313) to Ň (327)
  175. } else if (U'ŋ' <= character && character <= U'ŷ' && odd) { // ŋ (331) to ŷ (375)
  176. return character - 1; // Ŋ (330) toŶ (374)
  177. } else if (U'ź' <= character && character <= U'ž' && even) { // ź (378) to ž (382)
  178. return character - 1; // Ź (377) to Ž (381)
  179. } else if (character == U'ƀ') { // ƀ (384)
  180. return U'Ƀ'; // Ƀ (579)
  181. } else if (character == U'ƃ') { // ƃ (387)
  182. return U'Ƃ'; // Ƃ (386)
  183. } else if (character == U'ƅ') { // ƅ (389)
  184. return U'Ƅ'; // Ƅ (388)
  185. } else if (character == U'ƈ') { // ƈ (392)
  186. return U'Ƈ'; // Ƈ (391)
  187. } else if (character == U'ƌ') { // ƌ (396)
  188. return U'Ƌ'; // Ƌ (395)
  189. } else if (character == U'ƒ') { // ƒ (402)
  190. return U'Ƒ'; // Ƒ (401)
  191. } else if (character == U'ƙ') { // ƙ (409)
  192. return U'Ƙ'; // Ƙ (408)
  193. } else if (character == U'ƚ') { // ƚ (410)
  194. return U'Ƚ'; // Ƚ (573)
  195. } else if (character == U'ƞ') { // ƞ (414)
  196. return U'Ƞ'; // Ƞ (544)
  197. } else if (character == U'ơ') { // ơ (417)
  198. return U'Ơ'; // Ơ (416)
  199. } else if (character == U'ƣ') { // ƣ (419)
  200. return U'Ƣ'; // Ƣ (418)
  201. } else if (character == U'ƥ') { // ƥ (421)
  202. return U'Ƥ'; // Ƥ (420)
  203. } else if (character == U'ƨ') { // ƨ (424)
  204. return U'Ƨ'; // Ƨ (423)
  205. } else if (character == U'Ʃ') { // Ʃ (425)
  206. return U'ʃ'; // ʃ (643)
  207. } else if (character == U'ƭ') { // ƭ (429)
  208. return U'Ƭ'; // Ƭ (428)
  209. } else if (character == U'ư') { // ư (432)
  210. return U'Ư'; // Ư (431)
  211. } else if (character == U'ƴ') { // ƴ (436)
  212. return U'Ƴ'; // Ƴ (435)
  213. } else if (character == U'ƶ') { // ƶ (438)
  214. return U'Ƶ'; // Ƶ (437)
  215. } else if (character == U'ƹ') { // ƹ (441)
  216. return U'Ƹ'; // Ƹ (440)
  217. } else if (character == U'ƽ') { // ƽ (445)
  218. return U'Ƽ'; // Ƽ (444)
  219. } else if (character == U'ƿ') { // ƿ (447)
  220. return U'Ƿ'; // Ƿ (503)
  221. } else if (character == U'Dž') { // Dž (453)
  222. return U'DŽ'; // DŽ (452)
  223. } else if (character == U'dž') { // dž (454)
  224. return U'DŽ'; // DŽ (452)
  225. } else if (character == U'Lj') { // Lj (456)
  226. return U'LJ'; // LJ (455)
  227. } else if (character == U'lj') { // lj (457)
  228. return U'LJ'; // LJ (455)
  229. } else if (character == U'Nj') { // Nj (459)
  230. return U'NJ'; // NJ (458)
  231. } else if (character == U'nj') { // nj (460)
  232. return U'NJ'; // NJ (458)
  233. } else if (U'ǎ' <= character && character <= U'ǜ' && even) { // ǎ (462) to ǜ (476)
  234. return character - 1; // Ǎ (461) to Ǜ (475)
  235. } else if (U'ǟ' <= character && character <= U'ǯ' && odd) { // ǟ (479) to ǯ (495)
  236. return character - 1; // Ǟ (478) to Ǯ (494)
  237. } else if (character == U'Dz') { // Dz (498)
  238. return U'DZ'; // DZ (497)
  239. } else if (character == U'dz') { // dz (499)
  240. return U'DZ'; // DZ (497)
  241. } else if (character == U'ǵ') { // ǵ (501)
  242. return U'Ǵ'; // Ǵ (500)
  243. } else if (U'ǹ' <= character && character <= U'ȟ' && odd) { // ǹ (505) to ȟ (543)
  244. return character - 1;// Ǹ (504) to Ȟ (542)
  245. } else if (U'ȣ' <= character && character <= U'ȳ' && odd) { // ȣ (547) to ȳ (563)
  246. return character - 1;// Ȣ (546) to Ȳ (562)
  247. } else if (character == U'ȼ') { // ȼ (572)
  248. return U'Ȼ'; // Ȼ (571)
  249. } else if (U'ȿ' <= character && character <= U'ɀ') { // ȿ (575) to ɀ (576)
  250. return character + 10815;// Ȿ (11390) to Ɀ (11391)
  251. } else if (character == U'ɂ') { // ɂ (578)
  252. return U'Ɂ'; // Ɂ (577)
  253. } else if (U'ɇ' <= character && character <= U'ɏ' && odd) { // ɇ (583) to ɏ (591)
  254. return character - 1;// Ɇ (582) to Ɏ (590)
  255. } else if (character == U'ɐ') { // ɐ (592)
  256. return U'Ɐ'; // Ɐ (11375)
  257. } else if (character == U'ɑ') { // ɑ (593)
  258. return U'Ɑ'; // Ɑ (11373)
  259. } else if (character == U'ɒ') { // ɒ (594)
  260. return U'Ɒ'; // Ɒ (11376)
  261. } else if (character == U'ɓ') { // ɓ (595)
  262. return U'Ɓ'; // Ɓ (385)
  263. } else if (character == U'ɔ') { // ɔ (596)
  264. return U'Ɔ'; // Ɔ (390)
  265. } else if (U'ɖ' <= character && character <= U'ɗ') { // ɖ (598) to ɗ (599)
  266. return character - 205;// Ɖ (393) to Ɗ (394)
  267. } else if (U'ɘ' <= character && character <= U'ə') { // ɘ (600) to ə (601)
  268. return character - 202;// Ǝ (398) to Ə (399)
  269. } else if (character == U'ɛ') { // ɛ (603)
  270. return U'Ɛ'; // Ɛ (400)
  271. } else if (character == U'ɠ') { // ɠ (608)
  272. return U'Ɠ'; // Ɠ (403)
  273. } else if (character == U'ɣ') { // ɣ (611)
  274. return U'Ɣ'; // Ɣ (404)
  275. } else if (character == U'ɥ') { // ɥ (613)
  276. return U'Ɥ'; // Ɥ (42893)
  277. } else if (character == U'ɨ') { // ɨ (616)
  278. return U'Ɨ'; // Ɨ (407)
  279. } else if (character == U'ɩ') { // ɩ (617)
  280. return U'Ɩ'; // Ɩ (406)
  281. } else if (character == U'ɪ') { // ɪ (618)
  282. return U'Ɪ'; // Ɪ (42926)
  283. } else if (character == U'ɯ') { // ɯ (623)
  284. return U'Ɯ'; // Ɯ (412)
  285. } else if (character == U'ɱ') { // ɱ (625)
  286. return U'Ɱ'; // Ɱ (11374)
  287. } else if (character == U'ɲ') { // ɲ (626)
  288. return U'Ɲ'; // Ɲ (413)
  289. } else if (character == U'ɵ') { // ɵ (629)
  290. return U'Ɵ'; // Ɵ (415)
  291. } else if (character == U'ɽ') { // ɽ (637)
  292. return U'Ɽ'; // Ɽ (11364)
  293. } else if (character == U'ʀ') { // ʀ (640)
  294. return U'Ʀ'; // Ʀ (422)
  295. } else if (character == U'ʈ') { // ʈ (648)
  296. return U'Ʈ'; // Ʈ (430)
  297. } else if (character == U'ʉ') { // ʉ (649)
  298. return U'Ʉ'; // Ʉ (580)
  299. } else if (U'ʊ' <= character && character <= U'ʋ') { // ʊ (650) to ʋ (651)
  300. return character - 217;// Ʊ (433) to Ʋ (434)
  301. } else if (character == U'ʌ') { // ʌ (652)
  302. return U'Ʌ'; // Ʌ (581)
  303. } else if (character == U'ʒ') { // ʒ (658)
  304. return U'Ʒ'; // Ʒ (439)
  305. } else if (character == U'ʔ') { // ʔ (660)
  306. return U'ˀ'; // ˀ (704)
  307. } else if (character == U'ά') { // ά (940)
  308. return U'Ά'; // Ά (902)
  309. } else if (U'έ' <= character && character <= U'ί') { // έ (941) to ί (943)
  310. return character - 37;// Έ (904) to Ί (906)
  311. } else if (U'α' <= character && character <= U'ρ') { // α (945) to ρ (961)
  312. return character - 32;// Α (913) to Ρ (929)
  313. } else if (U'σ' <= character && character <= U'ϋ') { // σ (963) to ϋ (971)
  314. return character - 32;// Σ (931) to Ϋ (939)
  315. } else if (character == U'ό') { // ό (972)
  316. return U'Ό'; // Ό (908)
  317. } else if (U'ύ' <= character && character <= U'ώ') { // ύ (973) to ώ (974)
  318. return character - 63;// Ύ (910) to Ώ (911)
  319. } else if (U'ϣ' <= character && character <= U'ϯ' && odd) { // ϣ (995) to ϯ (1007)
  320. return character - 1;// Ϣ (994) to Ϯ (1006)
  321. } else if (U'а' <= character && character <= U'я') { // а (1072) to я (1103)
  322. return character - 32;// А (1040) to Я (1071)
  323. } else if (U'ё' <= character && character <= U'ќ') { // ё (1105) to ќ (1116)
  324. return character - 80;// Ё (1025) to Ќ (1036)
  325. } else if (U'ў' <= character && character <= U'џ') { // ў (1118) to џ (1119)
  326. return character - 80;// Ў (1038) to Џ (1039)
  327. } else if (U'ѡ' <= character && character <= U'ҁ' && odd) { // ѡ (1121) to ҁ (1153)
  328. return character - 1;// Ѡ (1120) to Ҁ (1152)
  329. } else if (U'ґ' <= character && character <= U'ҿ' && odd) { // ґ (1169) to ҿ (1215)
  330. return character - 1;// Ґ (1168) to Ҿ (1214)
  331. } else if (U'ӂ' <= character && character <= U'ӄ' && even) { // ӂ (1218) to ӄ (1220)
  332. return character - 1;// Ӂ (1217) to Ӄ (1219)
  333. } else if (character == U'ӈ') { // ӈ (1224)
  334. return U'Ӈ'; // Ӈ (1223)
  335. } else if (character == U'ӌ') { // ӌ (1228)
  336. return U'Ӌ'; // Ӌ (1227)
  337. } else if (U'ӑ' <= character && character <= U'ӫ' && odd) { // ӑ (1233) to ӫ (1259)
  338. return character - 1;// Ӑ (1232) to Ӫ (1258)
  339. } else if (U'ӯ' <= character && character <= U'ӵ' && odd) { // ӯ (1263) to ӵ (1269)
  340. return character - 1;// Ӯ (1262) to Ӵ (1268)
  341. } else if (character == U'ӹ') { // ӹ (1273)
  342. return U'Ӹ'; // Ӹ (1272)
  343. } else if (U'ա' <= character && character <= U'ֆ') { // ա (1377) to ֆ (1414)
  344. return character - 48;// Ա (1329) to Ֆ (1366)
  345. } else if (U'ა' <= character && character <= U'ჵ') { // ა (4304) to ჵ (4341)
  346. return character - 48;// Ⴀ (4256) to Ⴥ (4293)
  347. } else if (U'ḁ' <= character && character <= U'ẕ' && odd) { // ḁ (7681) to ẕ (7829)
  348. return character - 1;// Ḁ (7680) to Ẕ (7828)
  349. } else if (U'ạ' <= character && character <= U'ỹ' && odd) { // ạ (7841) to ỹ (7929)
  350. return character - 1;// Ạ (7840) to Ỹ (7928)
  351. } else if (U'ἀ' <= character && character <= U'ἇ') { // ἀ (7936) to ἇ (7943)
  352. return character + 8;// Ἀ (7944) to Ἇ (7951)
  353. } else if (U'ἐ' <= character && character <= U'ἕ') { // ἐ (7952) to ἕ (7957)
  354. return character + 8;// Ἐ (7960) to Ἕ (7965)
  355. } else if (U'ἠ' <= character && character <= U'ἧ') { // ἠ (7968) to ἧ (7975)
  356. return character + 8;// Ἠ (7976) to Ἧ (7983)
  357. } else if (U'ἰ' <= character && character <= U'ἷ') { // ἰ (7984) to ἷ (7991)
  358. return character + 8;// Ἰ (7992) to Ἷ (7999)
  359. } else if (U'ὀ' <= character && character <= U'ὅ') { // ὀ (8000) to ὅ (8005)
  360. return character + 8;// Ὀ (8008) to Ὅ (8013)
  361. } else if (U'ὑ' <= character && character <= U'ὗ' && odd) { // ὑ (8017) to ὗ (8023)
  362. return character + 8;// Ὑ (8025) to Ὗ (8031)
  363. } else if (U'ὠ' <= character && character <= U'ὧ') { // ὠ (8032) to ὧ (8039)
  364. return character + 8;// Ὠ (8040) to Ὧ (8047)
  365. } else if (U'ᾀ' <= character && character <= U'ᾇ') { // ᾀ (8064) to ᾇ (8071)
  366. return character + 8;// ᾈ (8072) to ᾏ (8079)
  367. } else if (U'ᾐ' <= character && character <= U'ᾗ') { // ᾐ (8080) to ᾗ (8087)
  368. return character + 8;// ᾘ (8088) to ᾟ (8095)
  369. } else if (U'ᾠ' <= character && character <= U'ᾧ') { // ᾠ (8096) to ᾧ (8103)
  370. return character + 8;// ᾨ (8104) to ᾯ (8111)
  371. } else if (U'ᾰ' <= character && character <= U'ᾱ') { // ᾰ (8112) to ᾱ (8113)
  372. return character + 8;// Ᾰ (8120) to Ᾱ (8121)
  373. } else if (U'ῐ' <= character && character <= U'ῑ') { // ῐ (8144) to ῑ (8145)
  374. return character + 8;// Ῐ (8152) to Ῑ (8153)
  375. } else if (U'ῠ' <= character && character <= U'ῡ') { // ῠ (8160) to ῡ (8161)
  376. return character + 8;// Ῠ (8168) to Ῡ (8169)
  377. } else if (U'ⓐ' <= character && character <= U'ⓩ') { // ⓐ (9424) to ⓩ (9449)
  378. return character - 26;// Ⓐ (9398) to Ⓩ (9423)
  379. } else if (U'a' <= character && character <= U'z') { // a (65345) to z (65370)
  380. return character - 32;// A (65313) to Z (65338)
  381. } else {
  382. return character;
  383. }
  384. }
  385. }
  386. DsrChar dsr::character_lowerCase(DsrChar character) {
  387. if (character < 256) {
  388. if (U'A' <= character && character <= U'Z') { // A (65) to Z (90)
  389. return character + 32; // a (97) to z (122)
  390. } else if (U'À' <= character && character <= U'Ö') { // À (192) to Ö (214)
  391. return character + 32; // à (224) to ö (246)
  392. } else if (U'Ø' <= character && character <= U'Þ') { // Ø (216) to Þ (222)
  393. return character + 32; // ø (248) to þ (254)
  394. } else {
  395. return character;
  396. }
  397. } else {
  398. bool odd = character & DsrChar(1);
  399. bool even = !odd;
  400. if (U'Ā' <= character && character <= U'Ķ' && even) { // Ā (256) to Ķ (310)
  401. return character + 1; // ā (257) to ķ (311)
  402. } else if (U'Ĺ' <= character && character <= U'Ň' && odd) { // Ĺ (313) to Ň (327)
  403. return character + 1; // ĺ (314) to ň (328)
  404. } else if (U'Ŋ' <= character && character <= U'Ŷ' && even) { // Ŋ (330) to Ŷ (374)
  405. return character + 1; // ŋ (331) to ŷ (375)
  406. } else if (character == U'Ÿ') { // Ÿ (376)
  407. return U'ÿ'; // ÿ (255)
  408. } else if (character == U'Ź') { // Ź (377)
  409. return U'ź'; // ź (378)
  410. } else if (character == U'Ż') { // Ż (379)
  411. return U'ż'; // ż (380)
  412. } else if (character == U'Ž') { // Ž (381)
  413. return U'ž'; // ž (382)
  414. } else if (character == U'Ɓ') { // Ɓ (385)
  415. return U'ɓ'; // ɓ (595)
  416. } else if (character == U'Ƃ') { // Ƃ (386)
  417. return U'ƃ'; // ƃ (387)
  418. } else if (character == U'Ƅ') { // Ƅ (388)
  419. return U'ƅ'; // ƅ (389)
  420. } else if (character == U'Ɔ') { // Ɔ (390)
  421. return U'ɔ'; // ɔ (596)
  422. } else if (character == U'Ƈ') { // Ƈ (391)
  423. return U'ƈ'; // ƈ (392)
  424. } else if (character == U'Ɖ') { // Ɖ (393)
  425. return U'ɖ'; // ɖ (598)
  426. } else if (character == U'Ɗ') { // Ɗ (394)
  427. return U'ɗ'; // ɗ (599)
  428. } else if (character == U'Ƌ') { // Ƌ (395)
  429. return U'ƌ'; // ƌ (396)
  430. } else if (character == U'Ǝ') { // Ǝ (398)
  431. return U'ɘ'; // ɘ (600)
  432. } else if (character == U'Ə') { // Ə (399)
  433. return U'ə'; // ə (601)
  434. } else if (character == U'Ɛ') { // Ɛ (400)
  435. return U'ɛ'; // ɛ (603)
  436. } else if (character == U'Ƒ') { // Ƒ (401)
  437. return U'ƒ'; // ƒ (402)
  438. } else if (character == U'Ɠ') { // Ɠ (403)
  439. return U'ɠ'; // ɠ (608)
  440. } else if (character == U'Ɣ') { // Ɣ (404)
  441. return U'ɣ'; // ɣ (611)
  442. } else if (character == U'Ɩ') { // Ɩ (406)
  443. return U'ɩ'; // ɩ (617)
  444. } else if (character == U'Ɨ') { // Ɨ (407)
  445. return U'ɨ'; // ɨ (616)
  446. } else if (character == U'Ƙ') { // Ƙ (408)
  447. return U'ƙ'; // ƙ (409)
  448. } else if (character == U'Ɯ') { // Ɯ (412)
  449. return U'ɯ'; // ɯ (623)
  450. } else if (character == U'Ɲ') { // Ɲ (413)
  451. return U'ɲ'; // ɲ (626)
  452. } else if (character == U'Ɵ') { // Ɵ (415)
  453. return U'ɵ'; // ɵ (629)
  454. } else if (character == U'Ơ') { // Ơ (416)
  455. return U'ơ'; // ơ (417)
  456. } else if (character == U'Ƣ') { // Ƣ (418)
  457. return U'ƣ'; // ƣ (419)
  458. } else if (character == U'Ƥ') { // Ƥ (420)
  459. return U'ƥ'; // ƥ (421)
  460. } else if (character == U'Ʀ') { // Ʀ (422)
  461. return U'ʀ'; // ʀ (640)
  462. } else if (character == U'Ƨ') { // Ƨ (423)
  463. return U'ƨ'; // ƨ (424)
  464. } else if (character == U'Ƭ') { // Ƭ (428)
  465. return U'ƭ'; // ƭ (429)
  466. } else if (character == U'Ʈ') { // Ʈ (430)
  467. return U'ʈ'; // ʈ (648)
  468. } else if (character == U'Ư') { // Ư (431)
  469. return U'ư'; // ư (432)
  470. } else if (character == U'Ʊ') { // Ʊ (433)
  471. return U'ʊ'; // ʊ (650)
  472. } else if (character == U'Ʋ') { // Ʋ (434)
  473. return U'ʋ'; // ʋ (651)
  474. } else if (character == U'Ƴ') { // Ƴ (435)
  475. return U'ƴ'; // ƴ (436)
  476. } else if (character == U'Ƶ') { // Ƶ (437)
  477. return U'ƶ'; // ƶ (438)
  478. } else if (character == U'Ʒ') { // Ʒ (439)
  479. return U'ʒ'; // ʒ (658)
  480. } else if (character == U'Ƹ') { // Ƹ (440)
  481. return U'ƹ'; // ƹ (441)
  482. } else if (character == U'Ƽ') { // Ƽ (444)
  483. return U'ƽ'; // ƽ (445)
  484. } else if (character == U'DŽ') { // DŽ (452)
  485. return U'dž'; // dž (454)
  486. } else if (character == U'Dž') { // Dž (453)
  487. return U'dž'; // dž (454)
  488. } else if (character == U'LJ') { // LJ (455)
  489. return U'lj'; // lj (457)
  490. } else if (character == U'Lj') { // Lj (456)
  491. return U'lj'; // lj (457)
  492. } else if (character == U'NJ') { // NJ (458)
  493. return U'nj'; // nj (460)
  494. } else if (U'Nj' <= character && character <= U'Ǜ' && odd) { // Nj (459) to Ǜ (475)
  495. return character + 1; // nj (460) to ǜ (476)
  496. } else if (U'Ǟ' <= character && character <= U'Ǯ' && even) { // Ǟ (478) to Ǯ (494)
  497. return character + 1; // ǟ (479) to ǯ (495)
  498. } else if (character == U'DZ') { // DZ (497)
  499. return U'dz'; // dz (499)
  500. } else if (character == U'Dz') { // Dz (498)
  501. return U'dz'; // dz (499)
  502. } else if (character == U'Ǵ') { // Ǵ (500)
  503. return U'ǵ'; // ǵ (501)
  504. } else if (character == U'Ƿ') { // Ƿ (503)
  505. return U'ƿ'; // ƿ (447)
  506. } else if (U'Ǹ' <= character && character <= U'Ȟ' && even) { // Ǹ (504) to Ȟ (542)
  507. return character + 1; // ǹ (505) to ȟ (543)
  508. } else if (character == U'Ƞ') { // Ƞ (544)
  509. return U'ƞ'; // ƞ (414)
  510. } else if (U'Ȣ' <= character && character <= U'Ȳ' && even) { // Ȣ (546) to Ȳ (562)
  511. return character + 1; // ȣ (547) to ȳ (563)
  512. } else if (character == U'Ȼ') { // Ȼ (571)
  513. return U'ȼ'; // ȼ (572)
  514. } else if (character == U'Ƚ') { // Ƚ (573)
  515. return U'ƚ'; // ƚ (410)
  516. } else if (character == U'Ɂ') { // Ɂ (577)
  517. return U'ɂ'; // ɂ (578)
  518. } else if (character == U'Ƀ') { // Ƀ (579)
  519. return U'ƀ'; // ƀ (384)
  520. } else if (character == U'Ʉ') { // Ʉ (580)
  521. return U'ʉ'; // ʉ (649)
  522. } else if (character == U'Ʌ') { // Ʌ (581)
  523. return U'ʌ'; // ʌ (652)
  524. } else if (U'Ɇ' <= character && character <= U'Ɏ' && even) { // Ɇ (582) to Ɏ (590)
  525. return character + 1;// ɇ (583) to ɏ (591)
  526. } else if (character == U'ʃ') { // ʃ (643)
  527. return U'Ʃ'; // Ʃ (425)
  528. } else if (character == U'ˀ') { // ˀ (704)
  529. return U'ʔ'; // ʔ (660)
  530. } else if (character == U'Ά') { // Ά (902)
  531. return U'ά'; // ά (940)
  532. } else if (U'Έ' <= character && character <= U'Ί') { // Έ (904) to Ί (906)
  533. return character + 37;// έ (941) to ί (943)
  534. } else if (character == U'Ό') { // Ό (908)
  535. return U'ό'; // ό (972)
  536. } else if (U'Ύ' <= character && character <= U'Ώ') { // Ύ (910) to Ώ (911)
  537. return character + 63;// ύ (973) to ώ (974)
  538. } else if (U'Α' <= character && character <= U'Ρ') { // Α (913) to Ρ (929)
  539. return character + 32;// α (945) to ρ (961)
  540. } else if (U'Σ' <= character && character <= U'Ϋ') { // Σ (931) to Ϋ (939)
  541. return character + 32;// σ (963) to ϋ (971)
  542. } else if (U'Ϣ' <= character && character <= U'Ϯ' && even) { // Ϣ (994) to Ϯ (1006)
  543. return character + 1;// ϣ (995) to ϯ (1007)
  544. } else if (U'Ё' <= character && character <= U'Ќ') { // Ё (1025) to Ќ (1036)
  545. return character + 80;// ё (1105) to ќ (1116)
  546. } else if (U'Ў' <= character && character <= U'Џ') { // Ў (1038) to Џ (1039)
  547. return character + 80;// ў (1118) to џ (1119)
  548. } else if (U'А' <= character && character <= U'Я') { // А (1040) to Я (1071)
  549. return character + 32;// а (1072) to я (1103)
  550. } else if (U'Ѡ' <= character && character <= U'Ҁ' && even) { // Ѡ (1120) to Ҁ (1152)
  551. return character + 1;// ѡ (1121) to ҁ (1153)
  552. } else if (U'Ґ' <= character && character <= U'Ҿ' && even) { // Ґ (1168) to Ҿ (1214)
  553. return character + 1;// ґ (1169) to ҿ (1215)
  554. } else if (U'Ӂ' <= character && character <= U'Ӄ' && odd) { // Ӂ (1217) to Ӄ (1219)
  555. return character + 1;// ӂ (1218) to ӄ (1220)
  556. } else if (character == U'Ӈ') { // Ӈ (1223)
  557. return U'ӈ'; // ӈ (1224)
  558. } else if (character == U'Ӌ') { // Ӌ (1227)
  559. return U'ӌ'; // ӌ (1228)
  560. } else if (U'Ӑ' <= character && character <= U'Ӫ' && even) { // Ӑ (1232) to Ӫ (1258)
  561. return character + 1;// ӑ (1233) to ӫ (1259)
  562. } else if (U'Ӯ' <= character && character <= U'Ӵ' && even) { // Ӯ (1262) to Ӵ (1268)
  563. return character + 1;// ӯ (1263) to ӵ (1269)
  564. } else if (character == U'Ӹ') { // Ӹ (1272)
  565. return U'ӹ'; // ӹ (1273)
  566. } else if (U'Ա' <= character && character <= U'Ֆ') { // Ա (1329) to Ֆ (1366)
  567. return character + 48;// ա (1377) to ֆ (1414)
  568. } else if (U'Ⴀ' <= character && character <= U'Ⴥ') { // Ⴀ (4256) to Ⴥ (4293)
  569. return character + 48;// ა (4304) to ჵ (4341)
  570. } else if (U'Ḁ' <= character && character <= U'Ẕ' && even) { // Ḁ (7680) to Ẕ (7828)
  571. return character + 1;// ḁ (7681) to ẕ (7829)
  572. } else if (character == U'ẞ') { // ẞ (7838)
  573. return U'ß'; // ß (223)
  574. } else if (U'Ạ' <= character && character <= U'Ỹ' && even) { // Ạ (7840) to Ỹ (7928)
  575. return character + 1;// ạ (7841) to ỹ (7929)
  576. } else if (U'Ἀ' <= character && character <= U'Ἇ') { // Ἀ (7944) to Ἇ (7951)
  577. return character - 8;// ἀ (7936) to ἇ (7943)
  578. } else if (U'Ἐ' <= character && character <= U'Ἕ') { // Ἐ (7960) to Ἕ (7965)
  579. return character - 8;// ἐ (7952) to ἕ (7957)
  580. } else if (U'Ἠ' <= character && character <= U'Ἧ') { // Ἠ (7976) to Ἧ (7983)
  581. return character - 8;// ἠ (7968) to ἧ (7975)
  582. } else if (U'Ἰ' <= character && character <= U'Ἷ') { // Ἰ (7992) to Ἷ (7999)
  583. return character - 8;// ἰ (7984) to ἷ (7991)
  584. } else if (U'Ὀ' <= character && character <= U'Ὅ') { // Ὀ (8008) to Ὅ (8013)
  585. return character - 8;// ὀ (8000) to ὅ (8005)
  586. } else if (U'Ὑ' <= character && character <= U'Ὗ' && odd) { // Ὑ (8025) to Ὗ (8031)
  587. return character - 8;// ὑ (8017) to ὗ (8023)
  588. } else if (U'Ὠ' <= character && character <= U'Ὧ') { // Ὠ (8040) to Ὧ (8047)
  589. return character - 8;// ὠ (8032) to ὧ (8039)
  590. } else if (U'ᾈ' <= character && character <= U'ᾏ') { // ᾈ (8072) to ᾏ (8079)
  591. return character - 8;// ᾀ (8064) to ᾇ (8071)
  592. } else if (U'ᾘ' <= character && character <= U'ᾟ') { // ᾘ (8088) to ᾟ (8095)
  593. return character - 8;// ᾐ (8080) to ᾗ (8087)
  594. } else if (U'ᾨ' <= character && character <= U'ᾯ') { // ᾨ (8104) to ᾯ (8111)
  595. return character - 8;// ᾠ (8096) to ᾧ (8103)
  596. } else if (U'Ᾰ' <= character && character <= U'Ᾱ') { // Ᾰ (8120) to Ᾱ (8121)
  597. return character - 8;// ᾰ (8112) to ᾱ (8113)
  598. } else if (U'Ῐ' <= character && character <= U'Ῑ') { // Ῐ (8152) to Ῑ (8153)
  599. return character - 8;// ῐ (8144) to ῑ (8145)
  600. } else if (U'Ῠ' <= character && character <= U'Ῡ') { // Ῠ (8168) to Ῡ (8169)
  601. return character - 8;// ῠ (8160) to ῡ (8161)
  602. } else if (U'Ⓐ' <= character && character <= U'Ⓩ') { // Ⓐ (9398) to Ⓩ (9423)
  603. return character + 26;// ⓐ (9424) to ⓩ (9449)
  604. } else if (character == U'Ɽ') { // Ɽ (11364)
  605. return U'ɽ'; // ɽ (637)
  606. } else if (character == U'Ɑ') { // Ɑ (11373)
  607. return U'ɑ'; // ɑ (593)
  608. } else if (character == U'Ɱ') { // Ɱ (11374)
  609. return U'ɱ'; // ɱ (625)
  610. } else if (character == U'Ɐ') { // Ɐ (11375)
  611. return U'ɐ'; // ɐ (592)
  612. } else if (character == U'Ɒ') { // Ɒ (11376)
  613. return U'ɒ'; // ɒ (594)
  614. } else if (U'Ȿ' <= character && character <= U'Ɀ') { // Ȿ (11390) to Ɀ (11391)
  615. return character - 10815;// ȿ (575) to ɀ (576)
  616. } else if (character == U'Ɥ') { // Ɥ (42893)
  617. return U'ɥ'; // ɥ (613)
  618. } else if (character == U'Ɪ') { // Ɪ (42926)
  619. return U'ɪ'; // ɪ (618)
  620. } else if (U'A' <= character && character <= U'Z') { // A (65313) to Z (65338)
  621. return character + 32;// a (65345) to z (65370)
  622. } else {
  623. return character;
  624. }
  625. }
  626. }
  627. String dsr::string_upperCase(const ReadableString &text) {
  628. String result;
  629. string_reserve(result, text.view.length);
  630. for (intptr_t i = 0; i < text.view.length; i++) {
  631. string_appendChar(result, character_upperCase(text[i]));
  632. }
  633. return result;
  634. }
  635. String dsr::string_lowerCase(const ReadableString &text) {
  636. String result;
  637. string_reserve(result, text.view.length);
  638. for (intptr_t i = 0; i < text.view.length; i++) {
  639. string_appendChar(result, character_lowerCase(text[i]));
  640. }
  641. return result;
  642. }
  643. bool dsr::string_match(const ReadableString& a, const ReadableString& b) {
  644. if (a.view.length != b.view.length) {
  645. return false;
  646. } else {
  647. for (intptr_t i = 0; i < a.view.length; i++) {
  648. if (a[i] != b[i]) {
  649. return false;
  650. }
  651. }
  652. return true;
  653. }
  654. }
  655. bool dsr::string_caseInsensitiveMatch(const ReadableString& a, const ReadableString& b) {
  656. if (a.view.length != b.view.length) {
  657. return false;
  658. } else {
  659. for (intptr_t i = 0; i < a.view.length; i++) {
  660. if (character_upperCase(a[i]) != character_upperCase(b[i])) {
  661. return false;
  662. }
  663. }
  664. return true;
  665. }
  666. }
  667. static intptr_t findFirstNonWhite(const ReadableString &text) {
  668. for (intptr_t i = 0; i < text.view.length; i++) {
  669. DsrChar c = text[i];
  670. if (!character_isWhiteSpace(c)) {
  671. return i;
  672. }
  673. }
  674. return -1;
  675. }
  676. static intptr_t findLastNonWhite(const ReadableString &text) {
  677. for (intptr_t i = text.view.length - 1; i >= 0; i--) {
  678. DsrChar c = text[i];
  679. if (!character_isWhiteSpace(c)) {
  680. return i;
  681. }
  682. }
  683. return -1;
  684. }
  685. // Allow passing literals without allocating heap memory for the result
  686. ReadableString dsr::string_removeOuterWhiteSpace(const ReadableString &text) {
  687. intptr_t first = findFirstNonWhite(text);
  688. intptr_t last = findLastNonWhite(text);
  689. if (first == -1) {
  690. // Only white space
  691. return ReadableString();
  692. } else {
  693. // Subset
  694. return string_inclusiveRange(text, first, last);
  695. }
  696. }
  697. String dsr::string_mangleQuote(const ReadableString &rawText) {
  698. String result;
  699. string_reserve(result, rawText.view.length + 2);
  700. string_appendChar(result, U'\"'); // Begin quote
  701. for (intptr_t i = 0; i < rawText.view.length; i++) {
  702. DsrChar c = rawText[i];
  703. if (c == U'\"') { // Double quote
  704. string_append(result, U"\\\"");
  705. } else if (c == U'\\') { // Backslash
  706. string_append(result, U"\\\\");
  707. } else if (c == U'\a') { // Audible bell
  708. string_append(result, U"\\a");
  709. } else if (c == U'\b') { // Backspace
  710. string_append(result, U"\\b");
  711. } else if (c == U'\f') { // Form feed
  712. string_append(result, U"\\f");
  713. } else if (c == U'\n') { // Line feed
  714. string_append(result, U"\\n");
  715. } else if (c == U'\r') { // Carriage return
  716. string_append(result, U"\\r");
  717. } else if (c == U'\t') { // Horizontal tab
  718. string_append(result, U"\\t");
  719. } else if (c == U'\v') { // Vertical tab
  720. string_append(result, U"\\v");
  721. } else if (c == U'\0') { // Null terminator
  722. string_append(result, U"\\0");
  723. } else {
  724. string_appendChar(result, c);
  725. }
  726. }
  727. string_appendChar(result, U'\"'); // End quote
  728. return result;
  729. }
  730. String dsr::string_unmangleQuote(const ReadableString& mangledText) {
  731. intptr_t firstQuote = string_findFirst(mangledText, '\"');
  732. intptr_t lastQuote = string_findLast(mangledText, '\"');
  733. String result;
  734. if (firstQuote == -1 || lastQuote == -1 || firstQuote == lastQuote) {
  735. throwError(U"Cannot unmangle using string_unmangleQuote without beginning and ending with quote signs!\n", mangledText, U"\n");
  736. } else {
  737. for (intptr_t i = firstQuote + 1; i < lastQuote; i++) {
  738. DsrChar c = mangledText[i];
  739. if (c == U'\\') { // Escape character
  740. DsrChar c2 = mangledText[i + 1];
  741. if (c2 == U'\"') { // Double quote
  742. string_appendChar(result, U'\"');
  743. } else if (c2 == U'\\') { // Back slash
  744. string_appendChar(result, U'\\');
  745. } else if (c2 == U'a') { // Audible bell
  746. string_appendChar(result, U'\a');
  747. } else if (c2 == U'b') { // Backspace
  748. string_appendChar(result, U'\b');
  749. } else if (c2 == U'f') { // Form feed
  750. string_appendChar(result, U'\f');
  751. } else if (c2 == U'n') { // Line feed
  752. string_appendChar(result, U'\n');
  753. } else if (c2 == U'r') { // Carriage return
  754. string_appendChar(result, U'\r');
  755. } else if (c2 == U't') { // Horizontal tab
  756. string_appendChar(result, U'\t');
  757. } else if (c2 == U'v') { // Vertical tab
  758. string_appendChar(result, U'\v');
  759. } else if (c2 == U'0') { // Null terminator
  760. string_appendChar(result, U'\0');
  761. }
  762. i++; // Consume both characters
  763. } else {
  764. // Detect bad input
  765. if (c == U'\"') { // Double quote
  766. throwError(U"Unmangled double quote sign detected in string_unmangleQuote!\n", mangledText, U"\n");
  767. } else if (c == U'\a') { // Audible bell
  768. throwError(U"Unmangled audible bell detected in string_unmangleQuote!\n", mangledText, U"\n");
  769. } else if (c == U'\b') { // Backspace
  770. throwError(U"Unmangled backspace detected in string_unmangleQuote!\n", mangledText, U"\n");
  771. } else if (c == U'\f') { // Form feed
  772. throwError(U"Unmangled form feed detected in string_unmangleQuote!\n", mangledText, U"\n");
  773. } else if (c == U'\n') { // Line feed
  774. throwError(U"Unmangled line feed detected in string_unmangleQuote!\n", mangledText, U"\n");
  775. } else if (c == U'\r') { // Carriage return
  776. throwError(U"Unmangled carriage return detected in string_unmangleQuote!\n", mangledText, U"\n");
  777. } else if (c == U'\0') { // Null terminator
  778. throwError(U"Unmangled null terminator detected in string_unmangleQuote!\n", mangledText, U"\n");
  779. } else {
  780. string_appendChar(result, c);
  781. }
  782. }
  783. }
  784. }
  785. return result;
  786. }
  787. void dsr::string_fromUnsigned(String& target, uint64_t value) {
  788. static const int bufferSize = 20;
  789. DsrChar digits[bufferSize];
  790. int64_t usedSize = 0;
  791. if (value == 0) {
  792. string_appendChar(target, U'0');
  793. } else {
  794. while (usedSize < bufferSize) {
  795. DsrChar digit = U'0' + (value % 10u);
  796. digits[usedSize] = digit;
  797. usedSize++;
  798. value /= 10u;
  799. if (value == 0) {
  800. break;
  801. }
  802. }
  803. while (usedSize > 0) {
  804. usedSize--;
  805. string_appendChar(target, digits[usedSize]);
  806. }
  807. }
  808. }
  809. void dsr::string_fromSigned(String& target, int64_t value, DsrChar negationCharacter) {
  810. if (value >= 0) {
  811. string_fromUnsigned(target, (uint64_t)value);
  812. } else {
  813. string_appendChar(target, negationCharacter);
  814. string_fromUnsigned(target, (uint64_t)(-value));
  815. }
  816. }
  817. static const int MAX_DECIMALS = 16;
  818. static double decimalMultipliers[MAX_DECIMALS] = {
  819. 10.0,
  820. 100.0,
  821. 1000.0,
  822. 10000.0,
  823. 100000.0,
  824. 1000000.0,
  825. 10000000.0,
  826. 100000000.0,
  827. 1000000000.0,
  828. 10000000000.0,
  829. 100000000000.0,
  830. 1000000000000.0,
  831. 10000000000000.0,
  832. 100000000000000.0,
  833. 1000000000000000.0,
  834. 10000000000000000.0
  835. };
  836. static double roundingOffsets[MAX_DECIMALS] = {
  837. 0.05,
  838. 0.005,
  839. 0.0005,
  840. 0.00005,
  841. 0.000005,
  842. 0.0000005,
  843. 0.00000005,
  844. 0.000000005,
  845. 0.0000000005,
  846. 0.00000000005,
  847. 0.000000000005,
  848. 0.0000000000005,
  849. 0.00000000000005,
  850. 0.000000000000005,
  851. 0.0000000000000005,
  852. 0.00000000000000005
  853. };
  854. static uint64_t decimalLimits[MAX_DECIMALS] = {
  855. 9,
  856. 99,
  857. 999,
  858. 9999,
  859. 99999,
  860. 999999,
  861. 9999999,
  862. 99999999,
  863. 999999999,
  864. 9999999999,
  865. 99999999999,
  866. 999999999999,
  867. 9999999999999,
  868. 99999999999999,
  869. 999999999999999,
  870. 9999999999999999
  871. };
  872. void dsr::string_fromDouble(String& target, double value, int decimalCount, bool removeTrailingZeroes, DsrChar decimalCharacter, DsrChar negationCharacter) {
  873. if (decimalCount < 1) decimalCount = 1;
  874. if (decimalCount > MAX_DECIMALS) decimalCount = MAX_DECIMALS;
  875. double remainder = value;
  876. // Get negation
  877. if (remainder < 0.0) {
  878. string_appendChar(target, negationCharacter);
  879. remainder = -remainder;
  880. }
  881. // Apply an offset to make the following truncation round to the closest printable decimal.
  882. int offsetIndex = decimalCount - 1;
  883. remainder += roundingOffsets[offsetIndex];
  884. // Get whole part
  885. uint64_t whole = (uint64_t)remainder;
  886. string_fromUnsigned(target, whole);
  887. // Remove the whole part from the remainder.
  888. remainder = remainder - whole;
  889. // Print the decimal
  890. string_appendChar(target, decimalCharacter);
  891. // Get decimals
  892. uint64_t scaledDecimals = uint64_t(remainder * decimalMultipliers[offsetIndex]);
  893. // Limit decimals to all nines prevent losing a whole unit from fraction overflow.
  894. uint64_t limit = decimalLimits[offsetIndex];
  895. if (scaledDecimals > limit) scaledDecimals = limit;
  896. DsrChar digits[MAX_DECIMALS]; // Using 0 to decimalCount - 1
  897. int writeIndex = decimalCount - 1;
  898. for (int d = 0; d < decimalCount; d++) {
  899. int digit = scaledDecimals % 10;
  900. digits[writeIndex] = U'0' + digit;
  901. scaledDecimals = scaledDecimals / 10;
  902. writeIndex--;
  903. }
  904. if (removeTrailingZeroes) {
  905. // Find the last non-zero decimal, but keep at least one zero.
  906. int lastValue = 0;
  907. for (int d = 0; d < decimalCount; d++) {
  908. if (digits[d] != U'0') lastValue = d;
  909. }
  910. // Print until the last value or the only zero.
  911. for (int d = 0; d <= lastValue; d++) {
  912. string_appendChar(target, digits[d]);
  913. }
  914. } else {
  915. // Print fixed decimals.
  916. for (int d = 0; d < decimalCount; d++) {
  917. string_appendChar(target, digits[d]);
  918. }
  919. }
  920. }
  921. #define TO_RAW_ASCII(TARGET, SOURCE) \
  922. char TARGET[SOURCE.view.length + 1]; \
  923. for (intptr_t i = 0; i < SOURCE.view.length; i++) { \
  924. TARGET[i] = toAscii(SOURCE[i]); \
  925. } \
  926. TARGET[SOURCE.view.length] = '\0';
  927. // A function definition for receiving a stream of bytes
  928. // Instead of using std's messy inheritance
  929. using ByteWriterFunction = std::function<void(uint8_t value)>;
  930. // A function definition for receiving a stream of UTF-32 characters
  931. // Instead of using std's messy inheritance
  932. using UTF32WriterFunction = std::function<void(DsrChar character)>;
  933. // Filter out unwanted characters for improved portability
  934. static void feedCharacter(const UTF32WriterFunction &receiver, DsrChar character) {
  935. if (character != U'\0' && character != U'\r') {
  936. receiver(character);
  937. }
  938. }
  939. // Appends the content of buffer as a BOM-free Latin-1 file into target
  940. // fileLength is ignored when nullTerminated is true
  941. template <bool nullTerminated>
  942. static void feedStringFromFileBuffer_Latin1(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  943. for (intptr_t i = 0; i < fileLength || nullTerminated; i++) {
  944. DsrChar character = (DsrChar)(buffer[i]);
  945. if (nullTerminated && character == 0) { return; }
  946. feedCharacter(receiver, character);
  947. }
  948. }
  949. // Appends the content of buffer as a BOM-free UTF-8 file into target
  950. // fileLength is ignored when nullTerminated is true
  951. template <bool nullTerminated>
  952. static void feedStringFromFileBuffer_UTF8(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  953. for (intptr_t i = 0; i < fileLength || nullTerminated; i++) {
  954. uint8_t byteA = buffer[i];
  955. if (byteA < (uint32_t)0b10000000) {
  956. // Single byte (1xxxxxxx)
  957. if (nullTerminated && byteA == 0) { return; }
  958. feedCharacter(receiver, (DsrChar)byteA);
  959. } else {
  960. uint32_t character = 0;
  961. int extraBytes = 0;
  962. if (byteA >= (uint32_t)0b11000000) { // At least two leading ones
  963. if (byteA < (uint32_t)0b11100000) { // Less than three leading ones
  964. character = byteA & (uint32_t)0b00011111;
  965. extraBytes = 1;
  966. } else if (byteA < (uint32_t)0b11110000) { // Less than four leading ones
  967. character = byteA & (uint32_t)0b00001111;
  968. extraBytes = 2;
  969. } else if (byteA < (uint32_t)0b11111000) { // Less than five leading ones
  970. character = byteA & (uint32_t)0b00000111;
  971. extraBytes = 3;
  972. } else {
  973. // Invalid UTF-8 format
  974. throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b111111xx!");
  975. }
  976. } else {
  977. // Invalid UTF-8 format
  978. throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b10xxxxxx!");
  979. }
  980. while (extraBytes > 0) {
  981. i += 1; uint32_t nextByte = buffer[i];
  982. character = (character << 6) | (nextByte & 0b00111111);
  983. extraBytes--;
  984. }
  985. feedCharacter(receiver, (DsrChar)character);
  986. }
  987. }
  988. }
  989. template <bool LittleEndian>
  990. uint16_t read16bits(const uint8_t* buffer, intptr_t startOffset) {
  991. uint16_t byteA = buffer[startOffset];
  992. uint16_t byteB = buffer[startOffset + 1];
  993. if (LittleEndian) {
  994. return (byteB << 8) | byteA;
  995. } else {
  996. return (byteA << 8) | byteB;
  997. }
  998. }
  999. // Appends the content of buffer as a BOM-free UTF-16 file into target as UTF-32
  1000. // fileLength is ignored when nullTerminated is true
  1001. template <bool LittleEndian, bool nullTerminated>
  1002. static void feedStringFromFileBuffer_UTF16(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength = 0) {
  1003. for (intptr_t i = 0; i < fileLength || nullTerminated; i += 2) {
  1004. // Read the first 16-bit word
  1005. uint16_t wordA = read16bits<LittleEndian>(buffer, i);
  1006. // Check if another word is needed
  1007. // Assuming that wordA >= 0x0000 and wordA <= 0xFFFF as uint16_t,
  1008. // we can just check if it's within the range reserved for 32-bit encoding
  1009. if (wordA <= 0xD7FF || wordA >= 0xE000) {
  1010. // Not in the reserved range, just a single 16-bit character
  1011. if (nullTerminated && wordA == 0) { return; }
  1012. feedCharacter(receiver, (DsrChar)wordA);
  1013. } else {
  1014. // The given range was reserved and therefore using 32 bits
  1015. i += 2;
  1016. uint16_t wordB = read16bits<LittleEndian>(buffer, i);
  1017. uint32_t higher10Bits = wordA & (uint32_t)0b1111111111;
  1018. uint32_t lower10Bits = wordB & (uint32_t)0b1111111111;
  1019. DsrChar finalChar = (DsrChar)(((higher10Bits << 10) | lower10Bits) + (uint32_t)0x10000);
  1020. feedCharacter(receiver, finalChar);
  1021. }
  1022. }
  1023. }
  1024. // Sends the decoded UTF-32 characters from the encoded buffer into target.
  1025. // The text encoding should be specified using a BOM at the start of buffer, otherwise Latin-1 is assumed.
  1026. static void feedStringFromFileBuffer(const UTF32WriterFunction &receiver, const uint8_t* buffer, intptr_t fileLength) {
  1027. // After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
  1028. if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { // UTF-8
  1029. feedStringFromFileBuffer_UTF8<false>(receiver, buffer + 3, fileLength - 3);
  1030. } else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) { // UTF-16 BE
  1031. feedStringFromFileBuffer_UTF16<false, false>(receiver, buffer + 2, fileLength - 2);
  1032. } else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) { // UTF-16 LE
  1033. feedStringFromFileBuffer_UTF16<true, false>(receiver, buffer + 2, fileLength - 2);
  1034. } else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) { // UTF-32 BE
  1035. //feedStringFromFileBuffer_UTF32BE(receiver, buffer + 4, fileLength - 4);
  1036. throwError(U"UTF-32 BE format is not yet supported!\n");
  1037. } else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) { // UTF-32 LE
  1038. //feedStringFromFileBuffer_UTF32BE(receiver, buffer + 4, fileLength - 4);
  1039. throwError(U"UTF-32 LE format is not yet supported!\n");
  1040. } else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) { // UTF-1
  1041. //feedStringFromFileBuffer_UTF1(receiver, buffer + 3, fileLength - 3);
  1042. throwError(U"UTF-1 format is not yet supported!\n");
  1043. } else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) { // SCSU
  1044. //feedStringFromFileBuffer_SCSU(receiver, buffer + 3, fileLength - 3);
  1045. throwError(U"SCSU format is not yet supported!\n");
  1046. } else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) { // BOCU
  1047. //feedStringFromFileBuffer_BOCU-1(receiver, buffer + 3, fileLength - 3);
  1048. throwError(U"BOCU-1 format is not yet supported!\n");
  1049. } else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) { // UTF-7
  1050. // Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
  1051. throwError(U"UTF-7 format is not yet supported!\n");
  1052. } else {
  1053. // No BOM detected, assuming Latin-1 (because it directly corresponds to a unicode sub-set)
  1054. feedStringFromFileBuffer_Latin1<false>(receiver, buffer, fileLength);
  1055. }
  1056. }
  1057. // Sends the decoded UTF-32 characters from the encoded null terminated buffer into target.
  1058. // buffer may not contain any BOM, and must be null terminated in the specified encoding.
  1059. static void feedStringFromRawData(const UTF32WriterFunction &receiver, const uint8_t* buffer, CharacterEncoding encoding) {
  1060. if (encoding == CharacterEncoding::Raw_Latin1) {
  1061. feedStringFromFileBuffer_Latin1<true>(receiver, buffer);
  1062. } else if (encoding == CharacterEncoding::BOM_UTF8) {
  1063. feedStringFromFileBuffer_UTF8<true>(receiver, buffer);
  1064. } else if (encoding == CharacterEncoding::BOM_UTF16BE) {
  1065. feedStringFromFileBuffer_UTF16<false, true>(receiver, buffer);
  1066. } else if (encoding == CharacterEncoding::BOM_UTF16LE) {
  1067. feedStringFromFileBuffer_UTF16<true, true>(receiver, buffer);
  1068. } else {
  1069. throwError(U"Unhandled encoding in feedStringFromRawData!\n");
  1070. }
  1071. }
  1072. String dsr::string_dangerous_decodeFromData(const void* data, CharacterEncoding encoding) {
  1073. String result;
  1074. // Measure the size of the result by scanning the content in advance
  1075. intptr_t characterCount = 0;
  1076. UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
  1077. characterCount++;
  1078. };
  1079. feedStringFromRawData(measurer, (const uint8_t*)data, encoding);
  1080. // Pre-allocate the correct amount of memory based on the simulation
  1081. string_reserve(result, characterCount);
  1082. // Stream output to the result string
  1083. UTF32WriterFunction receiver = [&result](DsrChar character) {
  1084. string_appendChar(result, character);
  1085. };
  1086. feedStringFromRawData(receiver, (const uint8_t*)data, encoding);
  1087. return result;
  1088. }
  1089. String dsr::string_loadFromMemory(Buffer fileContent) {
  1090. String result;
  1091. // Measure the size of the result by scanning the content in advance
  1092. intptr_t characterCount = 0;
  1093. UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
  1094. characterCount++;
  1095. };
  1096. feedStringFromFileBuffer(measurer, fileContent.getUnsafe(), fileContent.getUsedSize());
  1097. // Pre-allocate the correct amount of memory based on the simulation
  1098. string_reserve(result, characterCount);
  1099. // Stream output to the result string
  1100. UTF32WriterFunction receiver = [&result](DsrChar character) {
  1101. string_appendChar(result, character);
  1102. };
  1103. feedStringFromFileBuffer(receiver, fileContent.getUnsafe(), fileContent.getUsedSize());
  1104. return result;
  1105. }
  1106. // Loads a text file of unknown format
  1107. // Removes carriage-return characters to make processing easy with only line-feed for breaking lines
  1108. String dsr::string_load(const ReadableString& filename, bool mustExist) {
  1109. Buffer encoded = file_loadBuffer(filename, mustExist);
  1110. if (!buffer_exists(encoded)) {
  1111. return String();
  1112. } else {
  1113. return string_loadFromMemory(encoded);
  1114. }
  1115. }
  1116. template <CharacterEncoding characterEncoding>
  1117. static void encodeCharacter(const ByteWriterFunction &receiver, DsrChar character) {
  1118. if (characterEncoding == CharacterEncoding::Raw_Latin1) {
  1119. // Replace any illegal characters with questionmarks
  1120. if (character > 255) { character = U'?'; }
  1121. receiver(character);
  1122. } else if (characterEncoding == CharacterEncoding::BOM_UTF8) {
  1123. // Replace any illegal characters with questionmarks
  1124. if (character > 0x10FFFF) { character = U'?'; }
  1125. if (character < (1 << 7)) {
  1126. // 0xxxxxxx
  1127. receiver(character);
  1128. } else if (character < (1 << 11)) {
  1129. // 110xxxxx 10xxxxxx
  1130. receiver((uint32_t)0b11000000 | ((character & ((uint32_t)0b11111 << 6)) >> 6));
  1131. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  1132. } else if (character < (1 << 16)) {
  1133. // 1110xxxx 10xxxxxx 10xxxxxx
  1134. receiver((uint32_t)0b11100000 | ((character & ((uint32_t)0b1111 << 12)) >> 12));
  1135. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 6)) >> 6));
  1136. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  1137. } else if (character < (1 << 21)) {
  1138. // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  1139. receiver((uint32_t)0b11110000 | ((character & ((uint32_t)0b111 << 18)) >> 18));
  1140. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 12)) >> 12));
  1141. receiver((uint32_t)0b10000000 | ((character & ((uint32_t)0b111111 << 6)) >> 6));
  1142. receiver((uint32_t)0b10000000 | (character & (uint32_t)0b111111));
  1143. }
  1144. } else { // Assuming UTF-16
  1145. if (character > 0x10FFFF) { character = U'?'; }
  1146. if (character <= 0xD7FF || (character >= 0xE000 && character <= 0xFFFF)) {
  1147. // xxxxxxxx xxxxxxxx (Limited range)
  1148. uint32_t higher8Bits = (character & (uint32_t)0b1111111100000000) >> 8;
  1149. uint32_t lower8Bits = character & (uint32_t)0b0000000011111111;
  1150. if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  1151. receiver(higher8Bits);
  1152. receiver(lower8Bits);
  1153. } else { // Assuming UTF-16 LE
  1154. receiver(lower8Bits);
  1155. receiver(higher8Bits);
  1156. }
  1157. } else if (character >= 0x010000 && character <= 0x10FFFF) {
  1158. // 110110xxxxxxxxxx 110111xxxxxxxxxx
  1159. uint32_t code = character - (uint32_t)0x10000;
  1160. uint32_t byteA = ((code & (uint32_t)0b11000000000000000000) >> 18) | (uint32_t)0b11011000;
  1161. uint32_t byteB = (code & (uint32_t)0b00111111110000000000) >> 10;
  1162. uint32_t byteC = ((code & (uint32_t)0b00000000001100000000) >> 8) | (uint32_t)0b11011100;
  1163. uint32_t byteD = code & (uint32_t)0b00000000000011111111;
  1164. if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  1165. receiver(byteA);
  1166. receiver(byteB);
  1167. receiver(byteC);
  1168. receiver(byteD);
  1169. } else { // Assuming UTF-16 LE
  1170. receiver(byteB);
  1171. receiver(byteA);
  1172. receiver(byteD);
  1173. receiver(byteC);
  1174. }
  1175. }
  1176. }
  1177. }
  1178. // Template for encoding a whole string
  1179. template <CharacterEncoding characterEncoding, LineEncoding lineEncoding>
  1180. static void encodeText(const ByteWriterFunction &receiver, String content, bool writeBOM, bool writeNullTerminator) {
  1181. if (writeBOM) {
  1182. // Write byte order marks
  1183. if (characterEncoding == CharacterEncoding::BOM_UTF8) {
  1184. receiver(0xEF);
  1185. receiver(0xBB);
  1186. receiver(0xBF);
  1187. } else if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
  1188. receiver(0xFE);
  1189. receiver(0xFF);
  1190. } else if (characterEncoding == CharacterEncoding::BOM_UTF16LE) {
  1191. receiver(0xFF);
  1192. receiver(0xFE);
  1193. }
  1194. }
  1195. // Write encoded content
  1196. for (intptr_t i = 0; i < string_length(content); i++) {
  1197. DsrChar character = content[i];
  1198. if (character == U'\n') {
  1199. if (lineEncoding == LineEncoding::CrLf) {
  1200. encodeCharacter<characterEncoding>(receiver, U'\r');
  1201. encodeCharacter<characterEncoding>(receiver, U'\n');
  1202. } else { // Assuming that lineEncoding == LineEncoding::Lf
  1203. encodeCharacter<characterEncoding>(receiver, U'\n');
  1204. }
  1205. } else {
  1206. encodeCharacter<characterEncoding>(receiver, character);
  1207. }
  1208. }
  1209. if (writeNullTerminator) {
  1210. // Terminate internal strings with \0 to prevent getting garbage data after unpadded buffers
  1211. if (characterEncoding == CharacterEncoding::BOM_UTF16BE || characterEncoding == CharacterEncoding::BOM_UTF16LE) {
  1212. receiver(0);
  1213. receiver(0);
  1214. } else {
  1215. receiver(0);
  1216. }
  1217. }
  1218. }
  1219. // Macro for converting run-time arguments into template arguments for encodeText
  1220. #define ENCODE_TEXT(RECEIVER, CONTENT, CHAR_ENCODING, LINE_ENCODING, WRITE_BOM, WRITE_NULL_TERMINATOR) \
  1221. if (CHAR_ENCODING == CharacterEncoding::Raw_Latin1) { \
  1222. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1223. encodeText<CharacterEncoding::Raw_Latin1, LineEncoding::CrLf>(RECEIVER, CONTENT, false, WRITE_NULL_TERMINATOR); \
  1224. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1225. encodeText<CharacterEncoding::Raw_Latin1, LineEncoding::Lf>(RECEIVER, CONTENT, false, WRITE_NULL_TERMINATOR); \
  1226. } \
  1227. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF8) { \
  1228. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1229. encodeText<CharacterEncoding::BOM_UTF8, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1230. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1231. encodeText<CharacterEncoding::BOM_UTF8, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1232. } \
  1233. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF16BE) { \
  1234. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1235. encodeText<CharacterEncoding::BOM_UTF16BE, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1236. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1237. encodeText<CharacterEncoding::BOM_UTF16BE, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1238. } \
  1239. } else if (CHAR_ENCODING == CharacterEncoding::BOM_UTF16LE) { \
  1240. if (LINE_ENCODING == LineEncoding::CrLf) { \
  1241. encodeText<CharacterEncoding::BOM_UTF16LE, LineEncoding::CrLf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1242. } else if (LINE_ENCODING == LineEncoding::Lf) { \
  1243. encodeText<CharacterEncoding::BOM_UTF16LE, LineEncoding::Lf>(RECEIVER, CONTENT, WRITE_BOM, WRITE_NULL_TERMINATOR); \
  1244. } \
  1245. }
  1246. // Encoding to a buffer before saving all at once as a binary file.
  1247. // This tells the operating system how big the file is in advance and prevent the worst case of stalling for minutes!
  1248. bool dsr::string_save(const ReadableString& filename, const ReadableString& content, CharacterEncoding characterEncoding, LineEncoding lineEncoding) {
  1249. Buffer buffer = string_saveToMemory(content, characterEncoding, lineEncoding);
  1250. if (buffer_exists(buffer)) {
  1251. return file_saveBuffer(filename, buffer);
  1252. } else {
  1253. return false;
  1254. }
  1255. }
  1256. Buffer dsr::string_saveToMemory(const ReadableString& content, CharacterEncoding characterEncoding, LineEncoding lineEncoding, bool writeByteOrderMark, bool writeNullTerminator) {
  1257. intptr_t byteCount = 0;
  1258. ByteWriterFunction counter = [&byteCount](uint8_t value) {
  1259. byteCount++;
  1260. };
  1261. ENCODE_TEXT(counter, content, characterEncoding, lineEncoding, writeByteOrderMark, writeNullTerminator);
  1262. Buffer result = buffer_create(byteCount).setName("Buffer holding an encoded string");
  1263. SafePointer<uint8_t> byteWriter = buffer_getSafeData<uint8_t>(result, "Buffer for string encoding");
  1264. ByteWriterFunction receiver = [&byteWriter](uint8_t value) {
  1265. *byteWriter = value;
  1266. byteWriter += 1;
  1267. };
  1268. ENCODE_TEXT(receiver, content, characterEncoding, lineEncoding, writeByteOrderMark, writeNullTerminator);
  1269. return result;
  1270. }
  1271. static uintptr_t getStartOffset(const ReadableString &source) {
  1272. // Get the allocation
  1273. const uint8_t* origin = (uint8_t*)(source.characters.getUnsafe());
  1274. const uint8_t* start = (uint8_t*)(source.view.getUnchecked());
  1275. assert(start <= origin);
  1276. // Get the offset from the parent
  1277. return (start - origin) / sizeof(DsrChar);
  1278. }
  1279. #ifdef SAFE_POINTER_CHECKS
  1280. static void serializeCharacterBuffer(PrintCharacter target, void const * const allocation, uintptr_t maxLength) {
  1281. uintptr_t characterCount = heap_getUsedSize(allocation) / sizeof(DsrChar);
  1282. target(U'\"');
  1283. for (uintptr_t c = 0; c < characterCount; c++) {
  1284. if (c == maxLength) {
  1285. target(U'\"');
  1286. target(U'.');
  1287. target(U'.');
  1288. target(U'.');
  1289. return;
  1290. }
  1291. target(((DsrChar *)allocation)[c]);
  1292. }
  1293. target(U'\"');
  1294. }
  1295. #endif
  1296. static Handle<DsrChar> allocateCharacters(intptr_t minimumLength) {
  1297. // Allocate memory.
  1298. Handle<DsrChar> result = handle_createArray<DsrChar>(AllocationInitialization::Uninitialized, minimumLength).setName("String characters");
  1299. #ifdef SAFE_POINTER_CHECKS
  1300. setAllocationSerialization(result.getUnsafe(), &serializeCharacterBuffer);
  1301. #endif
  1302. // Check how much space we got.
  1303. uintptr_t availableSpace = heap_getAllocationSize(result.getUnsafe());
  1304. // Expand to use all available memory in the allocation.
  1305. uintptr_t newSize = heap_setUsedSize(result.getUnsafe(), availableSpace);
  1306. // Clear the memory to zeroes, just to be safe against non-deterministic bugs.
  1307. safeMemorySet(result.getSafe("Cleared String pointer"), 0, newSize);
  1308. return result;
  1309. }
  1310. // Replaces the buffer with a new buffer holding at least minimumLength characters
  1311. // Guarantees that the new buffer is not shared by other strings, so that it may be written to freely
  1312. static void reallocateBuffer(String &target, intptr_t minimumLength, bool preserve) {
  1313. // Holding oldData alive while copying to the new buffer
  1314. Handle<DsrChar> oldBuffer = target.characters; // Kept for reference counting only, do not remove.
  1315. Impl_CharacterView oldData = target.view;
  1316. target.characters = allocateCharacters(minimumLength);
  1317. target.view = Impl_CharacterView(target.characters.getUnsafe(), oldData.length);
  1318. if (preserve && oldData.length > 0) {
  1319. safeMemoryCopy(target.view.getSafe("New characters being copied from an old buffer"), oldData.getSafe("Old characters being copied to a new buffer"), oldData.length * sizeof(DsrChar));
  1320. }
  1321. }
  1322. // Call before writing to the buffer.
  1323. // This hides that Strings share buffers when assigning by value or taking partial strings.
  1324. static void cloneIfNeeded(String &target) {
  1325. // If there is no buffer or the buffer is shared, it needs to allocate its own buffer.
  1326. if (target.characters.isNull() || target.characters.getUseCount() > 1) {
  1327. reallocateBuffer(target, target.view.length, true);
  1328. }
  1329. }
  1330. void dsr::string_clear(String& target) {
  1331. // We we start writing from the beginning, then we must have our own allocation to avoid overwriting the characters in other strings.
  1332. cloneIfNeeded(target);
  1333. target.view.length = 0;
  1334. }
  1335. // The number of DsrChar characters that can be contained in the allocation before reaching the buffer's end
  1336. // This doesn't imply that it's always okay to write to the remaining space, because the buffer may be shared
  1337. static intptr_t getCapacity(const ReadableString &source) {
  1338. if (source.characters.isNotNull()) {
  1339. uintptr_t bufferElements = source.characters.getElementCount();
  1340. // Subtract offset from the buffer size to get the remaining space
  1341. return bufferElements - getStartOffset(source);
  1342. } else {
  1343. return 0;
  1344. }
  1345. }
  1346. static void expand(String &target, intptr_t newLength, bool affectUsedLength) {
  1347. cloneIfNeeded(target);
  1348. if (newLength > target.view.length) {
  1349. if (newLength > getCapacity(target)) {
  1350. reallocateBuffer(target, newLength, true);
  1351. }
  1352. if (affectUsedLength) {
  1353. target.view.length = newLength;
  1354. }
  1355. }
  1356. }
  1357. void dsr::string_reserve(String& target, intptr_t minimumLength) {
  1358. expand(target, minimumLength, false);
  1359. }
  1360. // This macro has to be used because a static template wouldn't be able to inherit access to private methods from the target class.
  1361. // Better to use a macro without type safety in the implementation than to expose yet another template in a global header.
  1362. // Proof that appending to one string doesn't affect another:
  1363. // If it has to reallocate
  1364. // * Then it will have its own buffer without conflicts
  1365. // If it doesn't have to reallocate
  1366. // If it shares the buffer
  1367. // If source is empty
  1368. // * Then no risk of overwriting neighbor strings if we don't write
  1369. // If source isn't empty
  1370. // * Then the buffer will be cloned when the first character is written
  1371. // If it doesn't share the buffer
  1372. // * Then no risk of writing
  1373. #define APPEND(TARGET, SOURCE, LENGTH, MASK) { \
  1374. intptr_t oldLength = (TARGET).view.length; \
  1375. expand((TARGET), oldLength + (intptr_t)(LENGTH), true); \
  1376. for (intptr_t i = 0; i < (intptr_t)(LENGTH); i++) { \
  1377. (TARGET).view.writeCharacter(oldLength + i, ((SOURCE)[i]) & MASK); \
  1378. } \
  1379. }
  1380. // TODO: See if ascii litterals can be checked for values above 127 in compile-time
  1381. static void atomic_append_ascii(String &target, const char* source) { APPEND(target, source, strlen(source), 0xFF); }
  1382. // TODO: Use memcpy when appending input of the same format
  1383. static void atomic_append_readable(String &target, const ReadableString& source) { APPEND(target, source, source.view.length, 0xFFFFFFFF); }
  1384. static void atomic_append_utf32(String &target, const DsrChar* source) { APPEND(target, source, strlen_utf32(source), 0xFFFFFFFF); }
  1385. void dsr::string_appendChar(String& target, DsrChar value) { APPEND(target, &value, 1, 0xFFFFFFFF); }
  1386. String& dsr::impl_toStreamIndented_ascii(String& target, const char *value, const ReadableString& indentation) {
  1387. atomic_append_readable(target, indentation);
  1388. atomic_append_ascii(target, value);
  1389. return target;
  1390. }
  1391. String& dsr::impl_toStreamIndented_utf32(String& target, const char32_t *value, const ReadableString& indentation) {
  1392. atomic_append_readable(target, indentation);
  1393. atomic_append_utf32(target, value);
  1394. return target;
  1395. }
  1396. String& dsr::impl_toStreamIndented_readable(String& target, const ReadableString& value, const ReadableString& indentation) {
  1397. atomic_append_readable(target, indentation);
  1398. atomic_append_readable(target, value);
  1399. return target;
  1400. }
  1401. String& dsr::impl_toStreamIndented_double(String& target, const double &value, const ReadableString& indentation) {
  1402. atomic_append_readable(target, indentation);
  1403. string_fromDouble(target, (double)value);
  1404. return target;
  1405. }
  1406. String& dsr::impl_toStreamIndented_int64(String& target, const int64_t &value, const ReadableString& indentation) {
  1407. atomic_append_readable(target, indentation);
  1408. string_fromSigned(target, value);
  1409. return target;
  1410. }
  1411. String& dsr::impl_toStreamIndented_uint64(String& target, const uint64_t &value, const ReadableString& indentation) {
  1412. atomic_append_readable(target, indentation);
  1413. string_fromUnsigned(target, value);
  1414. return target;
  1415. }
  1416. // The print mutex makes sure that messages from multiple threads don't get mixed up.
  1417. static std::mutex printMutex;
  1418. static std::ostream& toStream(std::ostream& out, const ReadableString &source) {
  1419. for (intptr_t i = 0; i < source.view.length; i++) {
  1420. out.put(toAscii(source.view[i]));
  1421. }
  1422. return out;
  1423. }
  1424. static const std::function<void(const ReadableString &message, MessageType type)> defaultMessageAction = [](const ReadableString &message, MessageType type) {
  1425. if (type == MessageType::Error) {
  1426. #ifdef DSR_HARD_EXIT_ON_ERROR
  1427. // Print the error.
  1428. toStream(std::cerr, message);
  1429. // Free all heap allocations.
  1430. heap_hardExitCleaning();
  1431. // Terminate with a non-zero value to indicate failure.
  1432. std::exit(1);
  1433. #else
  1434. Buffer ascii = string_saveToMemory(message, CharacterEncoding::Raw_Latin1, LineEncoding::CrLf, false, true);
  1435. throw std::runtime_error((char*)ascii.getUnsafe());
  1436. #endif
  1437. } else {
  1438. printMutex.lock();
  1439. toStream(std::cout, message);
  1440. printMutex.unlock();
  1441. }
  1442. };
  1443. static std::function<void(const ReadableString &message, MessageType type)> globalMessageAction = defaultMessageAction;
  1444. void dsr::string_sendMessage(const ReadableString &message, MessageType type) {
  1445. globalMessageAction(message, type);
  1446. }
  1447. void dsr::string_sendMessage_default(const ReadableString &message, MessageType type) {
  1448. defaultMessageAction(message, type);
  1449. }
  1450. void dsr::string_assignMessageHandler(std::function<void(const ReadableString &message, MessageType type)> newHandler) {
  1451. globalMessageAction = newHandler;
  1452. }
  1453. void dsr::string_unassignMessageHandler() {
  1454. globalMessageAction = defaultMessageAction;
  1455. }
  1456. void dsr::string_split_callback(std::function<void(ReadableString separatedText)> action, const ReadableString& source, DsrChar separator, bool removeWhiteSpace) {
  1457. intptr_t sectionStart = 0;
  1458. for (intptr_t i = 0; i < source.view.length; i++) {
  1459. DsrChar c = source[i];
  1460. if (c == separator) {
  1461. ReadableString element = string_exclusiveRange(source, sectionStart, i);
  1462. if (removeWhiteSpace) {
  1463. action(string_removeOuterWhiteSpace(element));
  1464. } else {
  1465. action(element);
  1466. }
  1467. sectionStart = i + 1;
  1468. }
  1469. }
  1470. if (source.view.length > sectionStart) {
  1471. if (removeWhiteSpace) {
  1472. action(string_removeOuterWhiteSpace(string_exclusiveRange(source, sectionStart, source.view.length)));
  1473. } else {
  1474. action(string_exclusiveRange(source, sectionStart, source.view.length));
  1475. }
  1476. }
  1477. }
  1478. static String createSubString(const Handle<DsrChar> &characters, const Impl_CharacterView &view) {
  1479. String result;
  1480. result.characters = characters;
  1481. result.view = view;
  1482. return result;
  1483. }
  1484. List<String> dsr::string_split(const ReadableString& source, DsrChar separator, bool removeWhiteSpace) {
  1485. List<String> result;
  1486. if (source.view.length > 0) {
  1487. // Re-use the existing buffer
  1488. String commonBuffer = createSubString(source.characters, source.view);
  1489. // Source is allocated as String
  1490. string_split_callback([&result, removeWhiteSpace](String element) {
  1491. if (removeWhiteSpace) {
  1492. result.push(string_removeOuterWhiteSpace(element));
  1493. } else {
  1494. result.push(element);
  1495. }
  1496. }, commonBuffer, separator, removeWhiteSpace);
  1497. }
  1498. return result;
  1499. }
  1500. intptr_t dsr::string_splitCount(const ReadableString& source, DsrChar separator) {
  1501. intptr_t result = 0;
  1502. string_split_callback([&result](ReadableString element) {
  1503. result++;
  1504. }, source, separator);
  1505. return result;
  1506. }
  1507. int64_t dsr::string_toInteger(const ReadableString& source) {
  1508. int64_t result;
  1509. bool negated;
  1510. result = 0;
  1511. negated = false;
  1512. for (intptr_t i = 0; i < source.view.length; i++) {
  1513. DsrChar c = source[i];
  1514. if (c == '-' || c == '~') {
  1515. negated = !negated;
  1516. } else if (c >= '0' && c <= '9') {
  1517. result = (result * 10) + (int)(c - '0');
  1518. } else if (c == ',' || c == '.') {
  1519. // Truncate any decimals by ignoring them
  1520. break;
  1521. }
  1522. }
  1523. if (negated) {
  1524. return -result;
  1525. } else {
  1526. return result;
  1527. }
  1528. }
  1529. double dsr::string_toDouble(const ReadableString& source) {
  1530. double result;
  1531. bool negated;
  1532. bool reachedDecimal;
  1533. int64_t digitDivider;
  1534. result = 0.0;
  1535. negated = false;
  1536. reachedDecimal = false;
  1537. digitDivider = 1;
  1538. for (intptr_t i = 0; i < source.view.length; i++) {
  1539. DsrChar c = source[i];
  1540. if (c == '-' || c == '~') {
  1541. negated = !negated;
  1542. } else if (c >= '0' && c <= '9') {
  1543. if (reachedDecimal) {
  1544. digitDivider = digitDivider * 10;
  1545. result = result + ((double)(c - '0') / (double)digitDivider);
  1546. } else {
  1547. result = (result * 10) + (double)(c - '0');
  1548. }
  1549. } else if (c == ',' || c == '.') {
  1550. reachedDecimal = true;
  1551. } else if (c == 'e' || c == 'E') {
  1552. // Apply the exponent after 'e'.
  1553. result *= std::pow(10.0, string_toInteger(string_after(source, i)));
  1554. // Skip remaining characters.
  1555. i = source.view.length;
  1556. }
  1557. }
  1558. if (negated) {
  1559. return -result;
  1560. } else {
  1561. return result;
  1562. }
  1563. }
  1564. intptr_t dsr::string_length(const ReadableString& source) {
  1565. return source.view.length;
  1566. }
  1567. intptr_t dsr::string_findFirst(const ReadableString& source, DsrChar toFind, intptr_t startIndex) {
  1568. for (intptr_t i = startIndex; i < source.view.length; i++) {
  1569. if (source[i] == toFind) {
  1570. return i;
  1571. }
  1572. }
  1573. return -1;
  1574. }
  1575. intptr_t dsr::string_findLast(const ReadableString& source, DsrChar toFind) {
  1576. for (intptr_t i = source.view.length - 1; i >= 0; i--) {
  1577. if (source[i] == toFind) {
  1578. return i;
  1579. }
  1580. }
  1581. return -1;
  1582. }
  1583. ReadableString dsr::string_exclusiveRange(const ReadableString& source, intptr_t inclusiveStart, intptr_t exclusiveEnd) {
  1584. // Return empty string for each complete miss
  1585. if (inclusiveStart >= source.view.length || exclusiveEnd <= 0) { return ReadableString(); }
  1586. // Automatically clamping to valid range
  1587. if (inclusiveStart < 0) { inclusiveStart = 0; }
  1588. if (exclusiveEnd > source.view.length) { exclusiveEnd = source.view.length; }
  1589. // Return the overlapping interval
  1590. return createSubString(source.characters, Impl_CharacterView(source.view.getUnchecked() + inclusiveStart, exclusiveEnd - inclusiveStart));
  1591. }
  1592. ReadableString dsr::string_inclusiveRange(const ReadableString& source, intptr_t inclusiveStart, intptr_t inclusiveEnd) {
  1593. return string_exclusiveRange(source, inclusiveStart, inclusiveEnd + 1);
  1594. }
  1595. ReadableString dsr::string_before(const ReadableString& source, intptr_t exclusiveEnd) {
  1596. return string_exclusiveRange(source, 0, exclusiveEnd);
  1597. }
  1598. ReadableString dsr::string_until(const ReadableString& source, intptr_t inclusiveEnd) {
  1599. return string_inclusiveRange(source, 0, inclusiveEnd);
  1600. }
  1601. ReadableString dsr::string_from(const ReadableString& source, intptr_t inclusiveStart) {
  1602. return string_exclusiveRange(source, inclusiveStart, source.view.length);
  1603. }
  1604. ReadableString dsr::string_after(const ReadableString& source, intptr_t exclusiveStart) {
  1605. return string_from(source, exclusiveStart + 1);
  1606. }
  1607. bool dsr::character_isDigit(DsrChar c) {
  1608. return c >= U'0' && c <= U'9';
  1609. }
  1610. bool dsr::character_isIntegerCharacter(DsrChar c) {
  1611. return c == U'-' || character_isDigit(c);
  1612. }
  1613. bool dsr::character_isValueCharacter(DsrChar c) {
  1614. return c == U'.' || character_isIntegerCharacter(c);
  1615. }
  1616. bool dsr::character_isWhiteSpace(DsrChar c) {
  1617. return c == U' ' || c == U'\t' || c == U'\v' || c == U'\f' || c == U'\n' || c == U'\r';
  1618. }
  1619. // Macros for implementing regular expressions with a greedy approach consuming the first match
  1620. // Optional accepts 0 or 1 occurence
  1621. // Forced accepts 1 occurence
  1622. // Star accepts 0..N occurence
  1623. // Plus accepts 1..N occurence
  1624. #define CHARACTER_OPTIONAL(CHARACTER) if (source[readIndex] == CHARACTER) { readIndex++; }
  1625. #define CHARACTER_FORCED(CHARACTER) if (source[readIndex] == CHARACTER) { readIndex++; } else { return false; }
  1626. #define CHARACTER_STAR(CHARACTER) while (source[readIndex] == CHARACTER) { readIndex++; }
  1627. #define CHARACTER_PLUS(CHARACTER) CHARACTER_FORCED(CHARACTER) CHARACTER_STAR(CHARACTER)
  1628. #define PATTERN_OPTIONAL(PATTERN) if (character_is##PATTERN(source[readIndex])) { readIndex++; }
  1629. #define PATTERN_FORCED(PATTERN) if (character_is##PATTERN(source[readIndex])) { readIndex++; } else { return false; }
  1630. #define PATTERN_STAR(PATTERN) while (character_is##PATTERN(source[readIndex])) { readIndex++; }
  1631. #define PATTERN_PLUS(PATTERN) PATTERN_FORCED(PATTERN) PATTERN_STAR(PATTERN)
  1632. // The greedy approach works here, because there's no ambiguity
  1633. bool dsr::string_isInteger(const ReadableString& source, bool allowWhiteSpace) {
  1634. intptr_t readIndex = 0;
  1635. if (allowWhiteSpace) {
  1636. PATTERN_STAR(WhiteSpace);
  1637. }
  1638. CHARACTER_OPTIONAL(U'-');
  1639. // At least one digit required
  1640. PATTERN_PLUS(IntegerCharacter);
  1641. if (allowWhiteSpace) {
  1642. PATTERN_STAR(WhiteSpace);
  1643. }
  1644. return readIndex == source.view.length;
  1645. }
  1646. // To avoid consuming the all digits on Digit* before reaching Digit+ when there is no decimal, whole integers are judged by string_isInteger
  1647. bool dsr::string_isDouble(const ReadableString& source, bool allowWhiteSpace) {
  1648. // Solving the UnsignedDouble <- Digit+ | Digit* '.' Digit+ ambiguity is done easiest by checking if there's a decimal before handling the white-space and negation
  1649. if (string_findFirst(source, U'.') == -1) {
  1650. // No decimal detected
  1651. return string_isInteger(source, allowWhiteSpace);
  1652. } else {
  1653. intptr_t readIndex = 0;
  1654. if (allowWhiteSpace) {
  1655. PATTERN_STAR(WhiteSpace);
  1656. }
  1657. // Double <- UnsignedDouble | '-' UnsignedDouble
  1658. CHARACTER_OPTIONAL(U'-');
  1659. // UnsignedDouble <- Digit* '.' Digit+
  1660. // Any number of integer digits
  1661. PATTERN_STAR(IntegerCharacter);
  1662. // Only dot for decimal
  1663. CHARACTER_FORCED(U'.')
  1664. // At least one decimal digit
  1665. PATTERN_PLUS(IntegerCharacter);
  1666. if (allowWhiteSpace) {
  1667. PATTERN_STAR(WhiteSpace);
  1668. }
  1669. return readIndex == source.view.length;
  1670. }
  1671. }
  1672. uintptr_t dsr::string_getBufferUseCount(const ReadableString& text) {
  1673. return text.characters.getUseCount();
  1674. }