Browse Source

Pre-allocating strings when reading text files using lambda streams.

David Piuva 5 years ago
parent
commit
d3a439b690
2 changed files with 69 additions and 37 deletions
  1. 59 35
      Source/DFPSR/base/text.cpp
  2. 10 2
      Source/DFPSR/base/text.h

+ 59 - 35
Source/DFPSR/base/text.cpp

@@ -333,30 +333,37 @@ static void doubleToString_arabic(String& target, double value) {
 	} \
 	TARGET[SOURCE.length()] = '\0';
 
-// TODO: Give as a lambda with target captured, so that pre-allocation can measure the
-//       needed space exactly using a lambda that increases a character counter instead.
-// Interpreting a character's value and appends it to the string.
-static void feedCharacterFromFile(String &target, DsrChar character) {
+static inline void byteToStream(std::ostream &target, uint8_t value) {
+	target.write((const char*)&value, 1);
+}
+
+// A function definition for receiving a stream of UTF-32 characters
+//   Instead of using std's messy inheritance
+using UTF32WriterFunction = std::function<void(DsrChar character)>;
+
+// Filter out unwanted characters for improved portability
+static void feedCharacter(const UTF32WriterFunction &reciever, DsrChar character) {
 	if (character != U'\r') {
-		target.appendChar(character);
+		reciever(character);
 	}
 }
 
 // Appends the content of buffer as a BOM-free Latin-1 file into target
-static void AppendStringFromFileBuffer_Latin1(String &target, const uint8_t* buffer, int64_t fileLength) {
+static void feedStringFromFileBuffer_Latin1(const UTF32WriterFunction &reciever, const uint8_t* buffer, int64_t fileLength) {
 	for (int64_t i = 0; i < fileLength; i++) {
-		feedCharacterFromFile(target, (DsrChar)(buffer[i]));
+		DsrChar character = (DsrChar)(buffer[i]);
+		if (character != U'\r') {
+			feedCharacter(reciever, character);
+		}
 	}
 }
 // Appends the content of buffer as a BOM-free UTF-8 file into target
-static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffer, int64_t fileLength) {
-	// We know that the result will be at most one character per given byte for UTF-8
-	target.reserve(string_length(target) + fileLength);
+static void feedStringFromFileBuffer_UTF8(const UTF32WriterFunction &reciever, const uint8_t* buffer, int64_t fileLength) {
 	for (int64_t i = 0; i < fileLength; i++) {
 		uint8_t byteA = buffer[i];
 		if (byteA < 0b10000000) {
 			// Single byte (1xxxxxxx)
-			feedCharacterFromFile(target, (DsrChar)byteA);
+			feedCharacter(reciever, (DsrChar)byteA);
 		} else {
 			uint32_t character = 0;
 			int extraBytes = 0;
@@ -383,7 +390,7 @@ static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffe
 				character = (character << 6) | (nextByte & 0b00111111);
 				extraBytes--;
 			}
-			feedCharacterFromFile(target, (DsrChar)character);
+			feedCharacter(reciever, (DsrChar)character);
 		}
 	}
 }
@@ -401,9 +408,7 @@ uint16_t read16bits(const uint8_t* buffer, int startOffset) {
 
 // Appends the content of buffer as a BOM-free UTF-16 file into target
 template <bool LittleEndian>
-static void AppendStringFromFileBuffer_UTF16(String &target, const uint8_t* buffer, int64_t fileLength) {
-	// We know that the result will be at most one character per two given bytes for UTF-16
-	target.reserve(string_length(target) + (fileLength / 2));
+static void feedStringFromFileBuffer_UTF16(const UTF32WriterFunction &reciever, const uint8_t* buffer, int64_t fileLength) {
 	for (int64_t i = 0; i < fileLength; i += 2) {
 		// Read the first 16-bit word
 		uint16_t wordA = read16bits<LittleEndian>(buffer, i);
@@ -412,58 +417,70 @@ static void AppendStringFromFileBuffer_UTF16(String &target, const uint8_t* buff
 		//   we can just check if it's within the range reserved for 32-bit encoding
 		if (wordA <= 0xD7FF || wordA >= 0xE000) {
 			// Not in the reserved range, just a single 16-bit character
-			feedCharacterFromFile(target, (DsrChar)wordA);
+			feedCharacter(reciever, (DsrChar)wordA);
 		} else {
 			// The given range was reserved and therefore using 32 bits
 			i += 2;
 			uint16_t wordB = read16bits<LittleEndian>(buffer, i);
 			uint32_t higher10Bits = wordA & 0b1111111111;
 			uint32_t lower10Bits = wordB & 0b1111111111;
-			feedCharacterFromFile(target, (DsrChar)(((higher10Bits << 10) | lower10Bits) + 0x10000));
+			feedCharacter(reciever, (DsrChar)(((higher10Bits << 10) | lower10Bits) + 0x10000));
 		}
 	}
 }
 // Appends the content of buffer as a text file of unknown format into target
-static void AppendStringFromFileBuffer(String &target, const uint8_t* buffer, int64_t fileLength) {
+static void feedStringFromFileBuffer(const UTF32WriterFunction &reciever, const uint8_t* buffer, int64_t fileLength) {
 	// After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
 	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { // UTF-8
-		AppendStringFromFileBuffer_UTF8(target, buffer + 3, fileLength - 3);
+		feedStringFromFileBuffer_UTF8(reciever, buffer + 3, fileLength - 3);
 	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) { // UTF-16 BE
-		AppendStringFromFileBuffer_UTF16<false>(target, buffer + 2, fileLength - 2);
+		feedStringFromFileBuffer_UTF16<false>(reciever, buffer + 2, fileLength - 2);
 	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) { // UTF-16 LE
-		AppendStringFromFileBuffer_UTF16<true>(target, buffer + 2, fileLength - 2);
+		feedStringFromFileBuffer_UTF16<true>(reciever, buffer + 2, fileLength - 2);
 	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) { // UTF-32 BE
-		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
+		//feedStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
 		throwError(U"UTF-32 BE format is not yet supported!\n");
 	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) { // UTF-32 LE
-		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
+		//feedStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
 		throwError(U"UTF-32 LE format is not yet supported!\n");
 	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) { // UTF-1
-		//AppendStringFromFileBuffer_UTF1(target, buffer + 3, fileLength - 3);
+		//feedStringFromFileBuffer_UTF1(target, buffer + 3, fileLength - 3);
 		throwError(U"UTF-1 format is not yet supported!\n");
 	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) { // SCSU
-		//AppendStringFromFileBuffer_SCSU(target, buffer + 3, fileLength - 3);
+		//feedStringFromFileBuffer_SCSU(target, buffer + 3, fileLength - 3);
 		throwError(U"SCSU format is not yet supported!\n");
 	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) { // BOCU
-		//AppendStringFromFileBuffer_BOCU-1(target, buffer + 3, fileLength - 3);
+		//feedStringFromFileBuffer_BOCU-1(target, buffer + 3, fileLength - 3);
 		throwError(U"BOCU-1 format is not yet supported!\n");
 	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) { // UTF-7
 		// Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
 		throwError(U"UTF-7 format is not yet supported!\n");
 	} else {
 		// No BOM detected, assuming Latin-1 (because it directly corresponds to a unicode sub-set)
-		AppendStringFromFileBuffer_Latin1(target, buffer, fileLength);
+		feedStringFromFileBuffer_Latin1(reciever, buffer, fileLength);
 	}
 }
 
 String dsr::string_loadFromMemory(const Buffer &fileContent) {
 	String result;
-	AppendStringFromFileBuffer(result, fileContent.getUnsafeData(), fileContent.size);
+	// Measure the size of the result by scanning the content in advance
+	int64_t characterCount = 0;
+	UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
+		characterCount++;
+	};
+	feedStringFromFileBuffer(measurer, fileContent.getUnsafeData(), fileContent.size);
+	// Pre-allocate the correct amount of memory based on the simulation
+	result.reserve(characterCount);
+	// Stream output to the result string
+	UTF32WriterFunction reciever = [&result](DsrChar character) {
+		result.appendChar(character);
+	};
+	feedStringFromFileBuffer(reciever, fileContent.getUnsafeData(), fileContent.size);
 	return result;
 }
 
 // Loads a text file of unknown format
-//   Removes carriage-return characters to make processing easy with only line-feed for breaking lines.
+//   Removes carriage-return characters to make processing easy with only line-feed for breaking lines
 String dsr::string_load(const ReadableString& filename, bool mustExist) {
 	// TODO: Load files using Unicode filenames when available
 	TO_RAW_ASCII(asciiFilename, filename);
@@ -476,7 +493,19 @@ String dsr::string_load(const ReadableString& filename, bool mustExist) {
 		fileStream.seekg (0, fileStream.beg);
 		uint8_t* buffer = (uint8_t*)malloc(fileLength);
 		fileStream.read((char*)buffer, fileLength);
-		AppendStringFromFileBuffer(result, buffer, fileLength);
+		// Measure the size of the result by scanning the content in advance
+		int64_t characterCount = 0;
+		UTF32WriterFunction measurer = [&characterCount](DsrChar character) {
+			characterCount++;
+		};
+		feedStringFromFileBuffer(measurer, buffer, fileLength);
+		// Pre-allocate the correct amount of memory based on the simulation
+		result.reserve(characterCount);
+		// Stream output to the result string
+		UTF32WriterFunction reciever = [&result](DsrChar character) {
+			result.appendChar(character);
+		};
+		feedStringFromFileBuffer(reciever, buffer, fileLength);
 		free(buffer);
 		return result;
 	} else {
@@ -488,11 +517,6 @@ String dsr::string_load(const ReadableString& filename, bool mustExist) {
 	}
 }
 
-static inline void byteToStream(std::ostream &target, int value) {
-	uint8_t byte = value;
-	target.write((char*)&byte, 1);
-}
-
 #define AT_MOST_BITS(BIT_COUNT) if (character >= 1 << BIT_COUNT) { character = U'?'; }
 
 template <CharacterEncoding characterEncoding>

+ 10 - 2
Source/DFPSR/base/text.h

@@ -289,16 +289,24 @@ int64_t string_toInteger(const ReadableString& source);
 // Post-condition: Returns the double precision floating-point representation of source.
 double string_toDouble(const ReadableString& source);
 
+// Loading will try to find a byte order mark and can handle UTF-8 and UTF-16.
+//   Failure to find a byte order mark will assume that the file's content is raw Latin-1,
+//   because automatic detection would cause random behaviour.
+// For portability, carriage return characters are removed,
+//   but will be generated again using the default CrLf line encoding of string_save.
 // Post-condition:
 //   Returns the content of the file referred to be filename.
 //   If mustExist is true, then failure to load will throw an exception.
 //   If mustExist is false, then failure to load will return an empty string.
 String string_load(const ReadableString& filename, bool mustExist = true);
-// A version loading the text from a binary representation of the file's content instead of the filename
+// A version loading the text from a binary representation of the file's content instead of the filename.
 //   Makes it easier to test character encoding and load arbitrary files from archives.
 String string_loadFromMemory(const Buffer &fileContent);
 
-// Side-effect: Saves content to filename.
+// Side-effect: Saves content to filename using the selected character and line encodings.
+// Do not add carriage return characters yourself into strings, for these will be added automatically in the CrLf mode.
+// The internal String type should only use UTF-32 with single line feeds for breaking lines.
+//   This makes text processing algorithms a lot cleaner when a character or line break is always one element.
 // UTF-8 with BOM is default by being both compact and capable of storing 21 bits of unicode
 void string_save(const ReadableString& filename, const ReadableString& content,
   CharacterEncoding characterEncoding = CharacterEncoding::BOM_UTF8,