text.h 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2020 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. #ifndef DFPSR_BASE_TEXT
  24. #define DFPSR_BASE_TEXT
  25. #include <stdint.h>
  26. #include <string>
  27. // TODO: Try to hide in the implementation
  28. #include <iostream>
  29. #include <sstream>
  30. #include "../api/bufferAPI.h"
  31. #include "../collection/List.h"
  32. namespace dsr {
  // The character element used by the string types in this header: one UTF-32 value per character.
  using DsrChar = char32_t;
  // Character encodings for text files.
  // Loading supports UTF-8 and UTF-16 (big-endian or little-endian) with a byte order mark (BOM),
  // or Latin-1 without a byte order mark.
  enum class CharacterEncoding {
    // Latin-1 without BOM: U+00 to U+FF
    Raw_Latin1,
    // UTF-8 with BOM: U+00000000 to U+0010FFFF
    BOM_UTF8,
    // UTF-16 big-endian with BOM: U+00000000 to U+0000D7FF, U+0000E000 to U+0010FFFF
    BOM_UTF16BE,
    // UTF-16 little-endian with BOM: U+00000000 to U+0000D7FF, U+0000E000 to U+0010FFFF
    BOM_UTF16LE
  };
  // Line-break style used when saving text.
  // Carriage-return characters are removed when loading text files, to prevent getting double lines
  // (a carriage-return without a line-feed character is nonsense).
  // LineEncoding allows re-adding the carriage-return before or after each line-break when saving.
  enum class LineEncoding {
    // Microsoft Windows compatible (can also be read on other platforms by ignoring carriage return)
    CrLf,
    // Linux and Macintosh compatible (might not work on non-portable text editors on Microsoft Windows)
    Lf
  };
  48. class ReadableString {
  49. protected:
  50. // A local pointer to the sub-allocation
  51. const char32_t* readSection = nullptr;
  52. // The length of the current string in characters
  53. int sectionLength = 0;
  54. public:
  55. int length() const;
  56. DsrChar read(int index) const;
  57. // Returning the character by value prevents writing to memory that might be a constant literal or shared with other strings
  58. DsrChar operator[] (int index) const;
  59. public:
  60. // Empty string
  61. ReadableString();
  62. // Destructor
  63. virtual ~ReadableString();
  64. // UTF-32 litteral from U""
  65. // WARNING! May crash if content is freed, even if ReadableString is freed before
  66. // ReadableString may share its buffer with sub-strings of the same type
  67. ReadableString(const DsrChar *content);
  68. protected:
  69. // Returns true iff the range is safely inside of the string
  70. bool checkBound(int start, int length, bool warning = true) const;
  71. // Internal constructor
  72. ReadableString(const DsrChar *content, int sectionLength);
  73. public:
  74. // Create a string from an existing string
  75. // When there's no reference counter, it's important that the memory remains allocated until the application terminates
  76. // Just like when reading elements in a for loop, out-of-range only causes an exception if length > 0
  77. // Length lesser than 1 will always return an empty string
  78. virtual ReadableString getRange(int start, int length) const;
  79. // Converting to unknown character encoding using only the ascii character subset
  80. // A bug in GCC linking forces these to be virtual
  81. virtual std::ostream& toStream(std::ostream& out) const;
  82. virtual std::string toStdString() const;
  83. };
  84. class String;
  85. // Used as format tags around numbers passed to string_append or string_combine
  86. // New types can implement printing to String by making wrappers from this class
  87. class Printable {
  88. public:
  89. // The method for appending the printable object into the target string
  90. virtual String& toStreamIndented(String& target, const ReadableString& indentation) const = 0;
  91. String& toStream(String& target) const;
  92. String toStringIndented(const ReadableString& indentation) const;
  93. String toString() const;
  94. std::ostream& toStreamIndented(std::ostream& out, const ReadableString& indentation) const;
  95. std::ostream& toStream(std::ostream& out) const;
  96. std::string toStdString() const;
  97. virtual ~Printable();
  98. };
  99. // A safe and simple string type
  100. // Can be constructed from ascii litterals "", but U"" is more universal
  101. // Can be used without ReadableString, but ReadableString can be wrapped over U"" litterals without allocation
  102. // UTF-32
  103. // Endianness is native
  104. // No combined characters allowed, use precomposed instead, so that the strings can guarantee a fixed character size
  105. class String : public ReadableString {
  106. protected:
  107. // A reference counted pointer to the buffer, just to keep the allocation
  108. Buffer buffer;
  109. // Same as readSection, but with write access
  110. char32_t* writeSection = nullptr;
  111. // Internal constructor
  112. String(Buffer buffer, DsrChar *content, int sectionLength);
  113. public:
  114. // The number of DsrChar characters that can be contained in the allocation before reaching the buffer's end
  115. // This doesn't imply that it's always okay to write to the remaining space, because the buffer may be shared
  116. int capacity();
  117. // Create a string from the existing buffer without allocating any heap memory
  118. ReadableString getRange(int start, int length) const override;
  119. private:
  120. // Replaces the buffer with a new buffer holding at least newLength characters
  121. // Guarantees that the new buffer is not shared by other strings, so that it may be written to freely
  122. void reallocateBuffer(int32_t newLength, bool preserve);
  123. // Call before writing to the buffer
  124. // This hides that Strings share buffers when assigning by value or taking partial strings
  125. void cloneIfShared();
  126. void expand(int32_t newLength, bool affectUsedLength);
  127. public:
  128. // Constructors
  129. String();
  130. String(const char* source);
  131. String(const char32_t* source);
  132. String(const std::string& source);
  133. String(const ReadableString& source);
  134. String(const String& source);
  135. public:
  136. // Ensures safely that at least minimumLength characters can he held in the buffer
  137. void reserve(int32_t minimumLength);
  138. // Extend the String using more text
  139. void append(const char* source);
  140. void append(const ReadableString& source);
  141. void append(const char32_t* source);
  142. void append(const std::string& source);
  143. // Extend the String using another character
  144. void appendChar(DsrChar source);
  145. public:
  146. // Access
  147. void write(int index, DsrChar value);
  148. void clear();
  149. };
  150. // Define this overload for non-virtual source types that cannot inherit from Printable
  151. String& string_toStreamIndented(String& target, const Printable& source, const ReadableString& indentation);
  152. String& string_toStreamIndented(String& target, const char* value, const ReadableString& indentation);
  153. String& string_toStreamIndented(String& target, const ReadableString& value, const ReadableString& indentation);
  154. String& string_toStreamIndented(String& target, const char32_t* value, const ReadableString& indentation);
  155. String& string_toStreamIndented(String& target, const std::string& value, const ReadableString& indentation);
  156. String& string_toStreamIndented(String& target, const float& value, const ReadableString& indentation);
  157. String& string_toStreamIndented(String& target, const double& value, const ReadableString& indentation);
  158. String& string_toStreamIndented(String& target, const int64_t& value, const ReadableString& indentation);
  159. String& string_toStreamIndented(String& target, const uint64_t& value, const ReadableString& indentation);
  160. String& string_toStreamIndented(String& target, const int32_t& value, const ReadableString& indentation);
  161. String& string_toStreamIndented(String& target, const uint32_t& value, const ReadableString& indentation);
  162. String& string_toStreamIndented(String& target, const int16_t& value, const ReadableString& indentation);
  163. String& string_toStreamIndented(String& target, const uint16_t& value, const ReadableString& indentation);
  164. String& string_toStreamIndented(String& target, const int8_t& value, const ReadableString& indentation);
  165. String& string_toStreamIndented(String& target, const uint8_t& value, const ReadableString& indentation);
  166. // Templates reused for all types
  167. // The source must inherit from Printable or have its own string_feedIndented overload
  168. template<typename T>
  169. String& string_toStream(String& target, const T& source) {
  170. return string_toStreamIndented(target, source, U"");
  171. }
  172. template<typename T>
  173. String string_toStringIndented(const T& source, const ReadableString& indentation) {
  174. String result;
  175. string_toStreamIndented(result, source, indentation);
  176. return result;
  177. }
  178. template<typename T>
  179. String string_toString(const T& source) {
  180. return string_toStringIndented(source, U"");
  181. }
  182. template<typename T>
  183. std::ostream& string_toStreamIndented(std::ostream& target, const T& source, const ReadableString& indentation) {
  184. return target << string_toStringIndented(source, indentation);
  185. }
  186. template<typename T>
  187. std::ostream& string_toStream(std::ostream& target, const T& source) {
  188. return target << string_toString(source);
  189. }
  190. // ---------------- Procedural API ----------------
  191. // Post-condition: Returns the length of source.
  192. // Example: string_length(U"ABC") == 3
  193. int string_length(const ReadableString& source);
  194. // Post-condition: Returns the base-zero index of source's first occurence of toFind, starting from startIndex. Returns -1 if not found.
  195. // Example: string_findFirst(U"ABCABCABC", U'A') == 0
  196. // Example: string_findFirst(U"ABCABCABC", U'B') == 1
  197. // Example: string_findFirst(U"ABCABCABC", U'C') == 2
  198. // Example: string_findFirst(U"ABCABCABC", U'D') == -1
  199. int string_findFirst(const ReadableString& source, DsrChar toFind, int startIndex = 0);
  200. // Post-condition: Returns the base-zero index of source's last occurence of toFind. Returns -1 if not found.
  201. // Example: string_findLast(U"ABCABCABC", U'A') == 6
  202. // Example: string_findLast(U"ABCABCABC", U'B') == 7
  203. // Example: string_findLast(U"ABCABCABC", U'C') == 8
  204. // Example: string_findLast(U"ABCABCABC", U'D') == -1
  205. int string_findLast(const ReadableString& source, DsrChar toFind);
  206. // Post-condition: Returns a sub-string of source from before the character at inclusiveStart to before the character at exclusiveEnd
  207. // Example: string_exclusiveRange(U"0123456789", 2, 4) == U"23"
  208. ReadableString string_exclusiveRange(const ReadableString& source, int inclusiveStart, int exclusiveEnd);
  209. // Post-condition: Returns a sub-string of source from before the character at inclusiveStart to after the character at inclusiveEnd
  210. // Example: string_inclusiveRange(U"0123456789", 2, 4) == U"234"
  211. ReadableString string_inclusiveRange(const ReadableString& source, int inclusiveStart, int inclusiveEnd);
  212. // Post-condition: Returns a sub-string of source from the start to before the character at exclusiveEnd
  213. // Example: string_before(U"0123456789", 5) == U"01234"
  214. ReadableString string_before(const ReadableString& source, int exclusiveEnd);
  215. // Post-condition: Returns a sub-string of source from the start to after the character at inclusiveEnd
  216. // Example: string_until(U"0123456789", 5) == U"012345"
  217. ReadableString string_until(const ReadableString& source, int inclusiveEnd);
  218. // Post-condition: Returns a sub-string of source from before the character at inclusiveStart to the end
  219. // Example: string_from(U"0123456789", 5) == U"56789"
  220. ReadableString string_from(const ReadableString& source, int inclusiveStart);
  221. // Post-condition: Returns a sub-string of source from after the character at exclusiveStart to the end
  222. // Example: string_after(U"0123456789", 5) == U"6789"
  223. ReadableString string_after(const ReadableString& source, int exclusiveStart);
  224. // Post-condition:
  225. // Returns a list of strings from source by splitting along separator.
  226. // The separating characters are excluded from the resulting strings.
  227. // The number of strings returned in the list will equal the number of separating characters plus one, so the result may contain empty strings.
  228. // Each string in the list reuses memory from the input string using reference counting, but the list itself will be allocated.
  229. List<ReadableString> string_split(const ReadableString& source, DsrChar separator);
  230. // Use string_split_inPlace instead of string_split if you want to reuse the memory of an existing list.
  231. // It will then only allocate when running out of buffer space.
  232. // Side-effects:
  233. // Fills the target list with strings from source by splitting along separator.
  234. // If appendResult is false (default), any pre-existing elements in the target list will be cleared before writing the result.
  235. // If appendResult is true, the result is appended to the existing target list.
  236. void string_split_inPlace(List<ReadableString> &target, const ReadableString& source, DsrChar separator, bool appendResult = false);
  237. // Post-condition: Returns true iff c is a digit.
  238. // Digit <- '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
  239. bool character_isDigit(DsrChar c);
  240. // Post-condition: Returns true iff c is an integer character.
  241. // IntegerCharacter <- '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
  242. bool character_isIntegerCharacter(DsrChar c);
  243. // Post-condition: Returns true iff c is a value character.
  244. // ValueCharacter <- '.' | '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
  245. bool character_isValueCharacter(DsrChar c);
  246. // Post-condition: Returns true iff c is a white-space character.
  247. // WhiteSpace <- ' ' | '\t' | '\v' | '\f' | '\n' | '\r'
  248. // Null terminators are excluded, because it's reserved for out of bound results.
  249. bool character_isWhiteSpace(DsrChar c);
  250. // Post-condition: Returns true iff source is a valid integer. IntegerAllowingWhiteSpace is also allowed iff allowWhiteSpace is true.
  251. // UnsignedInteger <- Digit+
  252. // Integer <- UnsignedInteger | '-' UnsignedInteger
  253. // IntegerAllowingWhiteSpace <- WhiteSpace* Integer WhiteSpace*
  254. bool string_isInteger(const ReadableString& source, bool allowWhiteSpace = true);
  255. // Post-condition: Returns true iff source is a valid integer or decimal number. DoubleAllowingWhiteSpace is also allowed iff allowWhiteSpace is true.
  256. // UnsignedDouble <- Digit+ | Digit* '.' Digit+
  257. // Double <- UnsignedDouble | '-' UnsignedDouble
  258. // DoubleAllowingWhiteSpace <- WhiteSpace* Double WhiteSpace*
  259. // Only dots are allowed as decimals.
  260. // Because being able to read files from another country without crashes is a lot more important than a detail that most people don't even notice.
  261. // Automatic nationalization made sense when most applications were written in-house before the internet existed.
  262. bool string_isDouble(const ReadableString& source, bool allowWhiteSpace = true);
  263. // Pre-condition: source must be a valid integer according to string_isInteger. Otherwise unexpected characters are simply ignored.
  264. // Post-condition: Returns the integer representation of source.
  265. // The result is signed, because the input might unexpectedly have a negation sign.
  266. // The result is large, so that one can easily check the range before assigning to a smaller integer type.
  267. int64_t string_toInteger(const ReadableString& source);
  268. // Pre-condition: source must be a valid double according to string_isDouble. Otherwise unexpected characters are simply ignored.
  269. // Post-condition: Returns the double precision floating-point representation of source.
  270. double string_toDouble(const ReadableString& source);
  271. // Loading will try to find a byte order mark and can handle UTF-8 and UTF-16.
  272. // Failure to find a byte order mark will assume that the file's content is raw Latin-1,
  273. // because automatic detection would cause random behaviour.
  274. // For portability, carriage return characters are removed,
  275. // but will be generated again using the default CrLf line encoding of string_save.
  276. // Post-condition:
  277. // Returns the content of the file referred to be filename.
  278. // If mustExist is true, then failure to load will throw an exception.
  279. // If mustExist is false, then failure to load will return an empty string.
  280. String string_load(const ReadableString& filename, bool mustExist = true);
  281. // A version loading the text from a binary representation of the file's content instead of the filename.
  282. // Makes it easier to test character encoding and load arbitrary files from archives.
  283. String string_loadFromMemory(Buffer fileContent);
  284. // Side-effect: Saves content to filename using the selected character and line encodings.
  285. // Do not add carriage return characters yourself into strings, for these will be added automatically in the CrLf mode.
  286. // The internal String type should only use UTF-32 with single line feeds for breaking lines.
  287. // This makes text processing algorithms a lot cleaner when a character or line break is always one element.
  288. // UTF-8 with BOM is default by being both compact and capable of storing 21 bits of unicode
  289. void string_save(const ReadableString& filename, const ReadableString& content,
  290. CharacterEncoding characterEncoding = CharacterEncoding::BOM_UTF8,
  291. LineEncoding lineEncoding = LineEncoding::CrLf
  292. );
  293. // A version encoding the text to a new buffer
  294. Buffer string_saveToMemory(const ReadableString& content,
  295. CharacterEncoding characterEncoding = CharacterEncoding::BOM_UTF8,
  296. LineEncoding lineEncoding = LineEncoding::CrLf
  297. );
  298. // Post-condition: Returns true iff strings a and b are exactly equal.
  299. bool string_match(const ReadableString& a, const ReadableString& b);
  300. // Post-condition: Returns true iff strings a and b are roughly equal using a case insensitive match.
  301. bool string_caseInsensitiveMatch(const ReadableString& a, const ReadableString& b);
  302. // Post-condition: Returns text converted to upper case.
  303. String string_upperCase(const ReadableString &text);
  304. // Post-condition: Returns text converted to lower case.
  305. String string_lowerCase(const ReadableString &text);
  306. // Post-condition: Returns a clone of text without any white-space (space, tab, carriage-return, null terminator, et cetera).
  307. String string_removeAllWhiteSpace(const ReadableString &text);
  308. // Post-condition: Returns a sub-set of text without surrounding white-space (space, tab and carriage-return).
  309. // Unlike string_removeAllWhiteSpace, string_removeOuterWhiteSpace does not require allocating a new buffer.
  310. ReadableString string_removeOuterWhiteSpace(const ReadableString &text);
  311. // Post-condition: Returns rawText wrapped in a quote.
  312. // Special characters are included using escape characters, so that one can quote multiple lines but store it easily.
  313. String string_mangleQuote(const ReadableString &rawText);
  314. // Pre-condition: mangledText must be enclosed in double quotes and special characters must use escape characters (tabs in quotes are okay though).
  315. // Post-condition: Returns mangledText with quotes removed and excape tokens interpreted.
  316. String string_unmangleQuote(const ReadableString& mangledText);
  317. // Ensures safely that at least minimumLength characters can he held in the buffer
  318. inline void string_reserve(String& target, int32_t minimumLength) {
  319. target.reserve(minimumLength);
  320. }
  321. // Append/push one character (to avoid integer to string conversion)
  322. inline void string_appendChar(String& target, DsrChar value) {
  323. target.appendChar(value);
  324. }
  325. // Append one element
  326. template<typename TYPE>
  327. inline void string_append(String& target, TYPE value) {
  328. string_toStream(target, value);
  329. }
  330. // Append multiple elements
  331. template<typename HEAD, typename... TAIL>
  332. inline void string_append(String& target, HEAD head, TAIL... tail) {
  333. string_append(target, head);
  334. string_append(target, tail...);
  335. }
  336. // Combine a number of strings, characters and numbers
  337. // If an input type is rejected, create a Printable object to wrap around it
  338. template<typename... ARGS>
  339. inline String string_combine(ARGS... args) {
  340. String result;
  341. string_append(result, args...);
  342. return result;
  343. }
  344. // ---------------- Infix syntax ----------------
  345. // Operations
  346. inline String operator+ (const ReadableString& a, const ReadableString& b) { return string_combine(a, b); }
  347. inline String operator+ (const char32_t* a, const ReadableString& b) { return string_combine(a, b); }
  348. inline String operator+ (const ReadableString& a, const char32_t* b) { return string_combine(a, b); }
  349. inline String operator+ (const String& a, const String& b) { return string_combine(a, b); }
  350. inline String operator+ (const char32_t* a, const String& b) { return string_combine(a, b); }
  351. inline String operator+ (const String& a, const char32_t* b) { return string_combine(a, b); }
  352. inline String operator+ (const String& a, const ReadableString& b) { return string_combine(a, b); }
  353. inline String operator+ (const ReadableString& a, const String& b) { return string_combine(a, b); }
  // Methods used so often that they don't need to use the string_ prefix

  // Prints information: combines the arguments into one String and writes it to std::cout.
  template<typename... ARGS>
  void printText(ARGS... args) {
    string_combine(args...).toStream(std::cout);
  }
  // Use for text printing that is useful when debugging but should not be given out in a release.
  //   Debug mode (NDEBUG not defined): prints the arguments using printText.
  //   Release mode (NDEBUG defined): the call is suppressed and does nothing.
  template<typename... ARGS>
  void debugText(ARGS... args) {
  #ifndef NDEBUG
    printText(args...);
  #endif
  }
  371. // Raise an exception
  372. // Only catch errors to display useful error messages, emergency backups or crash logs before terminating
  373. // Further execution after a partial transaction will break object invariants
  374. void throwErrorMessage(const String& message);
  // Combines the arguments into one message using string_combine and raises it with throwErrorMessage.
  template<typename... ARGS>
  void throwError(ARGS... args) {
    throwErrorMessage(string_combine(args...));
  }
  // ---------------- Hard-coded portability for specific operating systems ----------------
  // TODO: Try to find a place for this outside of the library, similar to how window managers were implemented

  // Returns a path separator for the target operating system, as a UTF-32 C string.
  const char32_t* file_separator();
  384. }
  385. #endif