stringAPI.h 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2020 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. #ifndef DFPSR_API_STRING
  24. #define DFPSR_API_STRING
#include <cstdint>
#include <iostream>
#include <sstream>
#include <functional>
#include <utility>
#include "bufferAPI.h"
#include "../collection/List.h"
  31. // Define DFPSR_INTERNAL_ACCESS before any include to get internal access to exposed types
  32. #ifdef DFPSR_INTERNAL_ACCESS
  33. #define IMPL_ACCESS public
  34. #else
  35. #define IMPL_ACCESS protected
  36. #endif
  37. namespace dsr {
  38. using DsrChar = char32_t;
  39. // Text files support loading UTF-8/16 BE/LE with BOM or Latin-1 without BOM
  40. enum class CharacterEncoding {
  41. Raw_Latin1, // U+00 to U+FF
  42. BOM_UTF8, // U+00000000 to U+0010FFFF
  43. BOM_UTF16BE, // U+00000000 to U+0000D7FF, U+0000E000 to U+0010FFFF
  44. BOM_UTF16LE // U+00000000 to U+0000D7FF, U+0000E000 to U+0010FFFF
  45. };
  46. // Carriage-return is removed when loading text files to prevent getting double lines
  47. // A line-feed without a line-feed character is nonsense
  48. // LineEncoding allow re-adding carriage-return before or after each line-break when saving
  49. enum class LineEncoding {
  50. CrLf, // Microsoft Windows compatible (Can also be read on other platforms by ignoring carriage return)
  51. Lf // Linux and Macintosh compatible (Might not work on non-portable text editors on Microsoft Windows)
  52. };
  53. class String;
  54. // Replacing String with a ReadableString reference for input arguments can make passing of U"" literals faster.
  55. // Unlike String, it cannot be constructed from a "" literal,
  56. // because it's not allowed to create a new allocation for the UTF-32 conversion.
  57. class ReadableString {
  58. IMPL_ACCESS:
  59. // A reference counted pointer to the buffer to allow passing strings around without having to clone the buffer each time
  60. // ReadableString only uses it for reference counting but String use it for reallocating
  61. Buffer buffer;
  62. const char32_t* readSection = nullptr;
  63. int64_t length = 0;
  64. public:
  65. // Returning the character by value prevents writing to memory that might be a constant literal or shared with other strings
  66. DsrChar operator[] (int64_t index) const;
  67. public:
  68. // Empty string U""
  69. ReadableString();
  70. // Implicit casting from U"text"
  71. ReadableString(const DsrChar *content);
  72. // Create from String by sharing the buffer
  73. ReadableString(const String& source);
  74. // Destructor
  75. virtual ~ReadableString();
  76. public:
  77. // Converting to unknown character encoding using only the ascii character subset
  78. // A bug in GCC linking forces these to be virtual
  79. virtual std::ostream& toStream(std::ostream& out) const;
  80. virtual std::string toStdString() const;
  81. };
  82. // Used as format tags around numbers passed to string_append or string_combine
  83. // New types can implement printing to String by making wrappers from this class
  84. class Printable {
  85. public:
  86. // The method for appending the printable object into the target string
  87. virtual String& toStreamIndented(String& target, const ReadableString& indentation) const = 0;
  88. String& toStream(String& target) const;
  89. String toStringIndented(const ReadableString& indentation) const;
  90. String toString() const;
  91. std::ostream& toStreamIndented(std::ostream& out, const ReadableString& indentation) const;
  92. std::ostream& toStream(std::ostream& out) const;
  93. std::string toStdString() const;
  94. virtual ~Printable();
  95. };
  96. // A safe and simple string type
  97. // Can be constructed from ascii literals "", but U"" will preserve unicode characters.
  98. // Can be used without ReadableString, but ReadableString can be wrapped over U"" literals without allocation
  99. // UTF-32
  100. // Endianness is native
  101. // No combined characters allowed, use precomposed instead, so that the strings can guarantee a fixed character size
  102. class String : public ReadableString {
  103. IMPL_ACCESS:
  104. // Same as readSection, but with write access for appending more text
  105. char32_t* writeSection = nullptr;
  106. public:
  107. // Constructors
  108. String();
  109. String(const char* source);
  110. String(const char32_t* source);
  111. String(const std::string& source);
  112. String(const ReadableString& source);
  113. String(const String& source);
  114. };
  115. // Define this overload for non-virtual source types that cannot inherit from Printable
  116. String& string_toStreamIndented(String& target, const Printable& source, const ReadableString& indentation);
  117. String& string_toStreamIndented(String& target, const char* value, const ReadableString& indentation);
  118. String& string_toStreamIndented(String& target, const ReadableString& value, const ReadableString& indentation);
  119. String& string_toStreamIndented(String& target, const char32_t* value, const ReadableString& indentation);
  120. String& string_toStreamIndented(String& target, const float& value, const ReadableString& indentation);
  121. String& string_toStreamIndented(String& target, const double& value, const ReadableString& indentation);
  122. String& string_toStreamIndented(String& target, const int64_t& value, const ReadableString& indentation);
  123. String& string_toStreamIndented(String& target, const uint64_t& value, const ReadableString& indentation);
  124. String& string_toStreamIndented(String& target, const int32_t& value, const ReadableString& indentation);
  125. String& string_toStreamIndented(String& target, const uint32_t& value, const ReadableString& indentation);
  126. String& string_toStreamIndented(String& target, const int16_t& value, const ReadableString& indentation);
  127. String& string_toStreamIndented(String& target, const uint16_t& value, const ReadableString& indentation);
  128. String& string_toStreamIndented(String& target, const int8_t& value, const ReadableString& indentation);
  129. String& string_toStreamIndented(String& target, const uint8_t& value, const ReadableString& indentation);
  130. // Templates reused for all types
  131. // The source must inherit from Printable or have its own string_toStreamIndented overload
  132. template<typename T>
  133. String& string_toStream(String& target, const T& source) {
  134. return string_toStreamIndented(target, source, U"");
  135. }
  136. template<typename T>
  137. String string_toStringIndented(const T& source, const ReadableString& indentation) {
  138. String result;
  139. string_toStreamIndented(result, source, indentation);
  140. return result;
  141. }
  142. template<typename T>
  143. String string_toString(const T& source) {
  144. return string_toStringIndented(source, U"");
  145. }
  146. template<typename T>
  147. std::ostream& string_toStreamIndented(std::ostream& target, const T& source, const ReadableString& indentation) {
  148. return target << string_toStringIndented(source, indentation);
  149. }
  150. template<typename T>
  151. std::ostream& string_toStream(std::ostream& target, const T& source) {
  152. return target << string_toString(source);
  153. }
  154. // ---------------- Procedural API ----------------
  155. // Sets the target string's length to zero.
  156. // Because this opens up to appending new text where sub-string may already share the buffer,
  157. // this operation will reallocate the buffer if shared with other strings.
  158. void string_clear(String& target);
  159. // Post-condition: Returns the length of source.
  160. // Example: string_length(U"ABC") == 3
  161. int64_t string_length(const ReadableString& source);
  162. // Post-condition: Returns the base-zero index of source's first occurence of toFind, starting from startIndex. Returns -1 if not found.
  163. // Example: string_findFirst(U"ABCABCABC", U'A') == 0
  164. // Example: string_findFirst(U"ABCABCABC", U'B') == 1
  165. // Example: string_findFirst(U"ABCABCABC", U'C') == 2
  166. // Example: string_findFirst(U"ABCABCABC", U'D') == -1
  167. int64_t string_findFirst(const ReadableString& source, DsrChar toFind, int64_t startIndex = 0);
  168. // Post-condition: Returns the base-zero index of source's last occurence of toFind. Returns -1 if not found.
  169. // Example: string_findLast(U"ABCABCABC", U'A') == 6
  170. // Example: string_findLast(U"ABCABCABC", U'B') == 7
  171. // Example: string_findLast(U"ABCABCABC", U'C') == 8
  172. // Example: string_findLast(U"ABCABCABC", U'D') == -1
  173. int64_t string_findLast(const ReadableString& source, DsrChar toFind);
  174. // Post-condition: Returns a sub-string of source from before the character at inclusiveStart to before the character at exclusiveEnd
  175. // Example: string_exclusiveRange(U"0123456789", 2, 4) == U"23"
  176. ReadableString string_exclusiveRange(const ReadableString& source, int64_t inclusiveStart, int64_t exclusiveEnd);
  177. // Post-condition: Returns a sub-string of source from before the character at inclusiveStart to after the character at inclusiveEnd
  178. // Example: string_inclusiveRange(U"0123456789", 2, 4) == U"234"
  179. ReadableString string_inclusiveRange(const ReadableString& source, int64_t inclusiveStart, int64_t inclusiveEnd);
  180. // Post-condition: Returns a sub-string of source from the start to before the character at exclusiveEnd
  181. // Example: string_before(U"0123456789", 5) == U"01234"
  182. ReadableString string_before(const ReadableString& source, int64_t exclusiveEnd);
  183. // Post-condition: Returns a sub-string of source from the start to after the character at inclusiveEnd
  184. // Example: string_until(U"0123456789", 5) == U"012345"
  185. ReadableString string_until(const ReadableString& source, int64_t inclusiveEnd);
  186. // Post-condition: Returns a sub-string of source from before the character at inclusiveStart to the end
  187. // Example: string_from(U"0123456789", 5) == U"56789"
  188. ReadableString string_from(const ReadableString& source, int64_t inclusiveStart);
  189. // Post-condition: Returns a sub-string of source from after the character at exclusiveStart to the end
  190. // Example: string_after(U"0123456789", 5) == U"6789"
  191. ReadableString string_after(const ReadableString& source, int64_t exclusiveStart);
  192. // Split source into a list of strings.
  193. // Post-condition:
  194. // Returns a list of strings from source by splitting along separator.
  195. // If removeWhiteSpace is true then surrounding white-space will be removed, otherwise all white-space is kept.
  196. // The separating characters are excluded from the resulting strings.
  197. // The number of strings returned in the list will equal the number of separating characters plus one, so the result may contain empty strings.
  198. // Each string in the list clones content to its own dynamic buffer. Use string_split_callback if you don't need long term storage.
  199. List<String> string_split(const ReadableString& source, DsrChar separator, bool removeWhiteSpace = false);
  200. // Split a string without needing a list to store the result.
  201. // Use string_splitCount on the same source and separator if you need to know the element count in advance.
  202. // Side-effects:
  203. // Calls action for each sub-string divided by separator in source given as the separatedText argument.
  204. void string_split_callback(std::function<void(ReadableString separatedText)> action, const ReadableString& source, DsrChar separator, bool removeWhiteSpace = false);
  205. // An alternative overload for having a very long lambda at the end.
  206. inline void string_split_callback(const ReadableString& source, DsrChar separator, bool removeWhiteSpace, std::function<void(ReadableString separatedText)> action) {
  207. string_split_callback(action, source, separator, removeWhiteSpace);
  208. }
  209. // Split source using separator, only to return the number of splits.
  210. // Useful for pre-allocation.
  211. int64_t string_splitCount(const ReadableString& source, DsrChar separator);
  212. // Post-condition: Returns true iff c is a digit.
  213. // Digit <- '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
  214. bool character_isDigit(DsrChar c);
  215. // Post-condition: Returns true iff c is an integer character.
  216. // IntegerCharacter <- '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
  217. bool character_isIntegerCharacter(DsrChar c);
  218. // Post-condition: Returns true iff c is a value character.
  219. // ValueCharacter <- '.' | '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
  220. bool character_isValueCharacter(DsrChar c);
  221. // Post-condition: Returns true iff c is a white-space character.
  222. // WhiteSpace <- ' ' | '\t' | '\v' | '\f' | '\n' | '\r'
  223. // Null terminators are excluded, because it's reserved for out of bound results.
  224. bool character_isWhiteSpace(DsrChar c);
  225. // Post-condition: Returns true iff source is a valid integer. IntegerAllowingWhiteSpace is also allowed iff allowWhiteSpace is true.
  226. // UnsignedInteger <- Digit+
  227. // Integer <- UnsignedInteger | '-' UnsignedInteger
  228. // IntegerAllowingWhiteSpace <- WhiteSpace* Integer WhiteSpace*
  229. bool string_isInteger(const ReadableString& source, bool allowWhiteSpace = true);
  230. // Post-condition: Returns true iff source is a valid integer or decimal number. DoubleAllowingWhiteSpace is also allowed iff allowWhiteSpace is true.
  231. // UnsignedDouble <- Digit+ | Digit* '.' Digit+
  232. // Double <- UnsignedDouble | '-' UnsignedDouble
  233. // DoubleAllowingWhiteSpace <- WhiteSpace* Double WhiteSpace*
  234. // Only dots are allowed as decimals.
  235. // Because being able to read files from another country without crashes is a lot more important than a detail that most people don't even notice.
  236. // Automatic nationalization made sense when most applications were written in-house before the internet existed.
  237. bool string_isDouble(const ReadableString& source, bool allowWhiteSpace = true);
  238. // Pre-condition: source must be a valid integer according to string_isInteger. Otherwise unexpected characters are simply ignored.
  239. // Post-condition: Returns the integer representation of source.
  240. // The result is signed, because the input might unexpectedly have a negation sign.
  241. // The result is large, so that one can easily check the range before assigning to a smaller integer type.
  242. int64_t string_toInteger(const ReadableString& source);
  243. // Pre-condition: source must be a valid double according to string_isDouble. Otherwise unexpected characters are simply ignored.
  244. // Post-condition: Returns the double precision floating-point representation of source.
  245. double string_toDouble(const ReadableString& source);
  246. // Loading will try to find a byte order mark and can handle UTF-8 and UTF-16.
  247. // Failure to find a byte order mark will assume that the file's content is raw Latin-1,
  248. // because automatic detection would cause random behaviour.
  249. // For portability, carriage return characters are removed,
  250. // but will be generated again using the default CrLf line encoding of string_save.
  251. // Post-condition:
  252. // Returns the content of the file referred to be filename.
  253. // If mustExist is true, then failure to load will throw an exception.
  254. // If mustExist is false, then failure to load will return an empty string.
  255. // If you want to handle files that are not found in a different way,
  256. // it is easy to use buffer_load and string_loadFromMemory separatelly.
  257. String string_load(const ReadableString& filename, bool mustExist = true);
  258. // Decode a text file from a buffer, which can be loaded using buffer_load.
  259. String string_loadFromMemory(Buffer fileContent);
  260. // Decode a null terminated string without BOM, by specifying which format it was encoded in.
  261. // Pre-conditions:
  262. // data does not start with any byte-order-mark (BOM).
  263. // data must be null terminated with '\0' in whatever format is being used. Otherwise you may have random crashes
  264. // Post-condition:
  265. // Returns a string decoded from the raw data.
  266. String string_dangerous_decodeFromData(const void* data, CharacterEncoding encoding);
  267. // Side-effect: Saves content to filename using the selected character and line encodings.
  268. // Post-condition: Returns true on success and false on failure.
  269. // Do not add carriage return characters yourself into strings, for these will be added automatically in the CrLf mode.
  270. // The internal String type should only use UTF-32 with single line feeds for breaking lines.
  271. // This makes text processing algorithms a lot cleaner when a character or line break is always one element.
  272. // UTF-8 with BOM is default by being both compact and capable of storing 21 bits of unicode.
  273. bool string_save(const ReadableString& filename, const ReadableString& content,
  274. CharacterEncoding characterEncoding = CharacterEncoding::BOM_UTF8,
  275. LineEncoding lineEncoding = LineEncoding::CrLf
  276. );
  277. // Encode the string and keep the raw buffer instead of saving it to a file.
  278. // Disabling writeByteOrderMark can be done when the result is casted to a native string for platform specific APIs, where a BOM is not allowed.
  279. // Enabling writeNullTerminator should be done when using the result as a pointer, so that the length is known when the buffer does not have padding.
  280. Buffer string_saveToMemory(const ReadableString& content,
  281. CharacterEncoding characterEncoding = CharacterEncoding::BOM_UTF8,
  282. LineEncoding lineEncoding = LineEncoding::CrLf,
  283. bool writeByteOrderMark = true,
  284. bool writeNullTerminator = false
  285. );
  286. // Post-condition: Returns true iff strings a and b are exactly equal.
  287. bool string_match(const ReadableString& a, const ReadableString& b);
  288. // Post-condition: Returns true iff strings a and b are roughly equal using a case insensitive match.
  289. bool string_caseInsensitiveMatch(const ReadableString& a, const ReadableString& b);
  290. // While string_match should be preferred over == for code readability and consistency with string_caseInsensitiveMatch,
  291. // the equality operator might be called automatically from template methods when a template type is a string.
  292. inline bool operator==(const ReadableString& a, const ReadableString& b) { return string_match(a, b); }
  293. inline bool operator!=(const ReadableString& a, const ReadableString& b) { return !string_match(a, b); }
  294. // Post-condition: Returns text converted to upper case.
  295. String string_upperCase(const ReadableString &text);
  296. // Post-condition: Returns text converted to lower case.
  297. String string_lowerCase(const ReadableString &text);
  298. // Post-condition: Returns a sub-set of text without surrounding white-space (space, tab and carriage-return).
  299. ReadableString string_removeOuterWhiteSpace(const ReadableString &text);
  300. // Post-condition: Returns rawText wrapped in a quote.
  301. // Special characters are included using escape characters, so that one can quote multiple lines but store it easily.
  302. String string_mangleQuote(const ReadableString &rawText);
  303. // Pre-condition: mangledText must be enclosed in double quotes and special characters must use escape characters (tabs in quotes are okay though).
  304. // Post-condition: Returns mangledText with quotes removed and excape tokens interpreted.
  305. String string_unmangleQuote(const ReadableString& mangledText);
  306. // Post-condition: Returns the number of strings using the same buffer, including itself.
  307. int64_t string_getBufferUseCount(const ReadableString& text);
  308. // Ensures safely that at least minimumLength characters can he held in the buffer
  309. void string_reserve(String& target, int64_t minimumLength);
  310. // Append/push one character (to avoid integer to string conversion)
  311. void string_appendChar(String& target, DsrChar value);
  312. // Append one element
  313. template<typename TYPE>
  314. inline void string_append(String& target, const TYPE &value) {
  315. string_toStream(target, value);
  316. }
  317. // Append multiple elements
  318. template<typename HEAD, typename... TAIL>
  319. inline void string_append(String& target, HEAD head, TAIL&&... tail) {
  320. string_append(target, head);
  321. string_append(target, tail...);
  322. }
  323. // Combine a number of strings, characters and numbers
  324. // If an input type is rejected, create a Printable object to wrap around it
  325. template<typename... ARGS>
  326. inline String string_combine(ARGS&&... args) {
  327. String result;
  328. string_append(result, args...);
  329. return result;
  330. }
  331. // ---------------- Infix syntax ----------------
  332. // Operations
  333. inline String operator+ (const ReadableString& a, const ReadableString& b) { return string_combine(a, b); }
  334. inline String operator+ (const char32_t* a, const ReadableString& b) { return string_combine(a, b); }
  335. inline String operator+ (const ReadableString& a, const char32_t* b) { return string_combine(a, b); }
  336. inline String operator+ (const String& a, const String& b) { return string_combine(a, b); }
  337. inline String operator+ (const char32_t* a, const String& b) { return string_combine(a, b); }
  338. inline String operator+ (const String& a, const char32_t* b) { return string_combine(a, b); }
  339. inline String operator+ (const String& a, const ReadableString& b) { return string_combine(a, b); }
  340. inline String operator+ (const ReadableString& a, const String& b) { return string_combine(a, b); }
  341. // ---------------- Message handling ----------------
  342. enum class MessageType {
  343. Error, // Terminate as quickly as possible after saving and informing the user.
  344. Warning, // Inform the user but let the caller continue.
  345. StandardPrinting, // Print text to the terminal.
  346. DebugPrinting // Print debug information to the terminal, if debug mode is active.
  347. };
  348. // Send a message
  349. void string_sendMessage(const ReadableString &message, MessageType type);
  350. // Send a message directly to the default message handler, ignoring string_assignMessageHandler.
  351. void string_sendMessage_default(const ReadableString &message, MessageType type);
  352. // Get a message
  353. // Pre-condition:
  354. // The action function must throw an exception or terminate the program when given an error, otherwise string_sendMessage will throw an exception about failing to do so.
  355. // Do not call string_sendMessage directly or indirectly from within action, use string_sendMessage_default instead to avoid infinite recursion.
  356. // Terminating the program as soon as possible is ideal, but one might want to save a backup or show what went wrong in a graphical interface before terminating.
  357. // Do not throw and catch errors as if they were warnings, because throwing and catching creates a partial transaction, potentially violating type invariants.
  358. // Better to use warnings and let the sender of the warning figure out how to abort the action safely.
  359. void string_assignMessageHandler(std::function<void(const ReadableString &message, MessageType type)> action);
  360. // Undo string_assignMessageHandler, so that any messages will be handled the default way again.
  361. void string_unassignMessageHandler();
  362. // Throw an error, which must terminate the application or throw an error
  363. template<typename... ARGS>
  364. void throwError(ARGS... args) {
  365. String result = string_combine(args...);
  366. string_sendMessage(result, MessageType::Error);
  367. }
  368. // Send a warning, which might throw an exception, terminate the application or anything else that the application requests using string_handleMessages
  369. template<typename... ARGS>
  370. void sendWarning(ARGS... args) {
  371. String result = string_combine(args...);
  372. string_sendMessage(result, MessageType::Warning);
  373. }
  374. // Print information to the terminal or something else listening for messages using string_handleMessages
  375. template<typename... ARGS>
  376. void printText(ARGS... args) {
  377. String result = string_combine(args...);
  378. string_sendMessage(result, MessageType::StandardPrinting);
  379. }
  380. // Debug messages are automatically disabled in release mode, so that you don't have to worry about accidentally releasing a program with poor performance from constantly printing to the terminal
  381. // Useful for selectively printing the most important information accumulated over time
  382. // Less useful for profiling, because the debug mode is slower than the release mode
  383. #ifdef NDEBUG
  384. // Supress debugText in release mode
  385. template<typename... ARGS>
  386. void debugText(ARGS... args) {}
  387. #else
  388. // Print debugText in debug mode
  389. template<typename... ARGS>
  390. void debugText(ARGS... args) {
  391. String result = string_combine(args...);
  392. string_sendMessage(result, MessageType::DebugPrinting);
  393. }
  394. #endif
  395. }
  396. #endif