5 years ago · b8c3235ee4
--- a/Source/DFPSR/base/Buffer.h
+++ b/Source/DFPSR/base/Buffer.h
@@ -44,16 +44,20 @@ public:
 
															 	~Buffer();
														
 
															 public:
														
 
															 	void replaceDestructor(const std::function<void(uint8_t *)>& newDestructor);
														
 
															+	// Set all bytes to the same value
														
 
															 	void set(uint8_t value);
														
 
															+	// Get a dangerous pointer to the raw data: returns this->data directly, without the SafePointer wrapper that getSafeData creates
														
 
															 	uint8_t *getUnsafeData() {
														
 
															 		return this->data;
														
 
															 	}
														
 
															-	// Get the buffer
														
 
															+	const uint8_t *getUnsafeData() const { // Read-only overload for const buffers, returning the same this->data pointer as const
														
 
															+		return this->data;
														
 
															+	}
														
 
															+	// Get a safe pointer to the raw data, as a SafePointer<T> labelled with name and constructed from this->data reinterpreted as T* together with this->bufferSize
														
 
															 	template <typename T>
														
 
															 	SafePointer<T> getSafeData(const char *name) {
														
 
															 		return SafePointer<T>(name, (T*)this->data, this->bufferSize, (T*)this->data); // NOTE(review): presumably (name, region start, region size in bytes, initial pointer) - confirm against SafePointer's constructor
														
 
															 	}
														
 
															-	// Get the buffer
														
 
															 	template <typename T>
														
 
															 	const SafePointer<T> getSafeData(const char *name) const {
														
 
															 		return SafePointer<T>(name, (T*)this->data, this->bufferSize, (T*)this->data);
														
--- a/Source/DFPSR/base/text.cpp
+++ b/Source/DFPSR/base/text.cpp
@@ -333,6 +333,7 @@ static void doubleToString_arabic(String& target, double value) {
 
															 	} \
														
 
															 	TARGET[SOURCE.length()] = '\0';
														
 
															+/*
														
 
															 String dsr::string_load(const ReadableString& filename, bool mustExist) {
														
 
															 	// TODO: Load files using Unicode filenames
														
 
															 	TO_RAW_ASCII(asciiFilename, filename);
														
@@ -357,6 +358,128 @@ String dsr::string_load(const ReadableString& filename, bool mustExist) {
 
															 		return String();
														
 
															 	}
														
 
															 }
														
 
															+*/
														
 
															+
														
 
															+// TODO: Give as a lambda with target captured, so that pre-allocation can measure the
														
 
															+//       needed space exactly using a lambda that increases a character counter instead.
														
 
															+// Appends one decoded character from a file to target.
															+//   Carriage-return is dropped, so that only line-feed remains for breaking lines.
															+static void feedCharacterFromFile(String &target, DsrChar character) {
															+	if (character == U'\r') {
															+		// Skipped instead of stored
															+		return;
															+	}
															+	target.appendChar(character);
															+}
														
 
															+
														
 
															+// Appends the content of buffer as a BOM-free Latin-1 file into target
															+//   Every byte is fed as one character, using the byte's value as the character code.
															+static void AppendStringFromFileBuffer_Latin1(String &target, const uint8_t* buffer, int64_t fileLength) {
															+	int64_t readIndex = 0;
															+	while (readIndex < fileLength) {
															+		const uint8_t rawByte = buffer[readIndex];
															+		feedCharacterFromFile(target, (DsrChar)(rawByte));
															+		readIndex++;
															+	}
															+}
														
 
															+// Appends the content of buffer as a BOM-free UTF-8 file into target
															+//   buffer points to fileLength bytes of UTF-8 encoded text.
															+//   Throws an error for a leading byte that cannot begin a character,
															+//   and for a multi-byte character that is cut off by the end of the buffer.
															+static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffer, int64_t fileLength) {
															+	// We know that the result will be at most one character per given byte for UTF-8
															+	target.reserve(string_length(target) + fileLength);
															+	for (int64_t i = 0; i < fileLength; i++) {
															+		uint8_t byteA = buffer[i];
															+		if (byteA < 0b10000000) {
															+			// Single byte (0xxxxxxx)
															+			feedCharacterFromFile(target, (DsrChar)byteA);
															+		} else {
															+			uint32_t character = 0;
															+			int extraBytes = 0;
															+			if (byteA >= 0b11000000) { // At least two leading ones
															+				if (byteA < 0b11100000) { // Less than three leading ones (110xxxxx)
															+					character = byteA & 0b00011111;
															+					extraBytes = 1;
															+				} else if (byteA < 0b11110000) { // Less than four leading ones (1110xxxx)
															+					character = byteA & 0b00001111;
															+					extraBytes = 2;
															+				} else if (byteA < 0b11111000) { // Less than five leading ones (11110xxx)
															+					// The mask must exclude the last leading one, or it ends up as a high bit in the character code
															+					character = byteA & 0b00000111;
															+					extraBytes = 3;
															+				} else {
															+					// Invalid UTF-8 format
															+					throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b11111xxx!");
															+				}
															+			} else {
															+				// Invalid UTF-8 format
															+				throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b10xxxxxx!");
															+			}
															+			// The continuation bytes are at i + 1 to i + extraBytes, which must all be inside of the buffer
															+			if (i + extraBytes >= fileLength) {
															+				throwError(U"Invalid UTF-8 multi-byte character cut off by the end of the file!");
															+			}
															+			while (extraBytes > 0) {
															+				i += 1; uint32_t nextByte = buffer[i];
															+				// Each continuation byte (10xxxxxx) contributes six more bits
															+				character = (character << 6) | (nextByte & 0b00111111);
															+				extraBytes--;
															+			}
															+			feedCharacterFromFile(target, (DsrChar)character);
															+		}
															+	}
															+}
														
 
															+// Appends the content of buffer as a text file of unknown format into target
															+//   The format is selected from the byte order mark (BOM) in the beginning of the buffer.
															+//   UTF-8 with a BOM is decoded, content without any known BOM is read as Latin-1,
															+//   and every other recognized BOM throws an error because the format is not yet supported.
															+//   The four byte UTF-32 marks are tested before the two byte UTF-16 marks,
															+//   because the UTF-32 LE mark (FF FE 00 00) begins with the UTF-16 LE mark (FF FE).
															+static void AppendStringFromFileBuffer(String &target, const uint8_t* buffer, int64_t fileLength) {
															+	// After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
															+	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
															+		AppendStringFromFileBuffer_UTF8(target, buffer + 3, fileLength - 3);
															+	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) {
															+		//AppendStringFromFileBuffer_UTF1(target, buffer + 3, fileLength - 3);
															+		throwError(U"UTF-1 format is not yet supported!");
															+	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) {
															+		//AppendStringFromFileBuffer_SCSU(target, buffer + 3, fileLength - 3);
															+		throwError(U"SCSU format is not yet supported!");
															+	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) {
															+		//AppendStringFromFileBuffer_BOCU-1(target, buffer + 3, fileLength - 3);
															+		throwError(U"BOCU-1 format is not yet supported!");
															+	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) {
															+		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
															+		throwError(U"UTF-32 BE format is not yet supported!");
															+	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) {
															+		// Must be tested before UTF-16 LE, which would otherwise match the first two bytes
															+		//AppendStringFromFileBuffer_UTF32LE(target, buffer + 4, fileLength - 4);
															+		throwError(U"UTF-32 LE format is not yet supported!");
															+	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) {
															+		//AppendStringFromFileBuffer_UTF16BE(target, buffer + 2, fileLength - 2);
															+		throwError(U"UTF-16 BE format is not yet supported!");
															+	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) {
															+		//AppendStringFromFileBuffer_UTF16LE(target, buffer + 2, fileLength - 2);
															+		throwError(U"UTF-16 LE format is not yet supported!");
															+	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) {
															+		// Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
															+		throwError(U"UTF-7 format is not yet supported!");
															+	} else {
															+		// No BOM detected, assuming Latin-1 (because it directly corresponds to a unicode sub-set)
															+		AppendStringFromFileBuffer_Latin1(target, buffer, fileLength);
															+	}
															+}
														
 
															+
														
 
															+// Decodes the file content stored in fileContent as a text file of unknown format and returns the text.
															+String dsr::string_loadFromMemory(const Buffer &fileContent) {
															+	const uint8_t *rawContent = fileContent.getUnsafeData();
															+	int64_t contentLength = fileContent.size;
															+	String decodedText;
															+	AppendStringFromFileBuffer(decodedText, rawContent, contentLength);
															+	return decodedText;
															+}
														
 
															+
														
 
															+// Loads a text file of unknown format
															+//   Removes carriage-return characters to make processing easy with only line-feed for breaking lines.
															+//   If mustExist is true, a file that cannot be opened or measured throws an error.
															+//   If mustExist is false, a file that cannot be opened or measured gives an empty string.
															+//   Errors from decoding the content are thrown in both cases, after releasing the temporary buffer.
															+String dsr::string_load(const ReadableString& filename, bool mustExist) {
															+	// TODO: Load files using Unicode filenames when available
															+	TO_RAW_ASCII(asciiFilename, filename);
															+	std::ifstream fileStream(asciiFilename, std::ios_base::in | std::ios_base::binary);
															+	if (fileStream.is_open()) {
															+		// Get the file's length, which is negative if the stream failed to tell the position
															+		fileStream.seekg (0, fileStream.end);
															+		int64_t fileLength = fileStream.tellg();
															+		fileStream.seekg (0, fileStream.beg);
															+		if (fileLength >= 0) {
															+			String result;
															+			if (fileLength > 0) {
															+				// Allocate an array for the raw encoding
															+				uint8_t* buffer = (uint8_t*)malloc(fileLength);
															+				if (buffer == nullptr) {
															+					throwError(U"Not enough memory to load the text file ", filename, U".\n");
															+				}
															+				try {
															+					fileStream.read((char*)buffer, fileLength);
															+					// Only decode the bytes that were actually read, in case that the read ended early
															+					AppendStringFromFileBuffer(result, buffer, fileStream.gcount());
															+				} catch (...) {
															+					// Decoding throws for invalid or unsupported content, which must not leak the buffer
															+					free(buffer);
															+					throw;
															+				}
															+				free(buffer);
															+			}
															+			return result;
															+		}
															+	}
															+	if (mustExist) {
															+		throwError(U"The text file ", filename, U" could not be opened for reading.\n");
															+	}
															+	// If the file could not be found and opened, a null string is returned
															+	return String();
															+}
														
 
															 void dsr::string_save(const ReadableString& filename, const ReadableString& content) {
														
 
															 	// TODO: Load files using Unicode filenames
														
@@ -371,6 +494,27 @@ void dsr::string_save(const ReadableString& filename, const ReadableString& cont
 
															 		throwError("Failed to save ", filename, "\n");
														
 
															 	}
														
 
															 }
														
 
															+/*
														
 
															+// TODO: Choose how to encode characters and line endings using enums
														
 
															+enum class textEncoding {
														
 
															+	UTF1, UTF7, UTF8, UTF16BE, UTF16LE, UTF32BE, UTF32LE, UTF-EBCDIC, SCSU, BOCU1, GB18030
														
 
															+};
														
 
															+enum class lineEncoding {
														
 
															+	UTF1, UTF7, UTF8, UTF16BE, UTF16LE, UTF32BE, UTF32LE, UTF-EBCDIC, SCSU, BOCU1, GB18030
														
 
															+};
														
 
															+void dsr::string_save(const ReadableString& filename, const ReadableString& content) {
														
 
															+	// TODO: Load files using Unicode filenames
														
 
															+	TO_RAW_ASCII(asciiFilename, filename);
														
 
															+	TO_RAW_ASCII(asciiContent, content);
														
 
															+	std::ofstream fileStream(asciiFilename, std::ios_base::out | std::ios_base::binary);
														
 
															+	if (fileStream.is_open()) {
														
 
															+		fileStream << asciiContent;
														
 
															+		fileStream.close();
														
 
															+	} else {
														
 
															+		throwError("Failed to save ", filename, "\n");
														
 
															+	}
														
 
															+}
														
 
															+*/
														
 
															 const char32_t* dsr::file_separator() {
														
 
															 	#ifdef _WIN32
														
--- a/Source/DFPSR/base/text.h
+++ b/Source/DFPSR/base/text.h
@@ -278,6 +278,10 @@ double string_toDouble(const ReadableString& source);
 
															 //   If mustExist is true, then failure to load will throw an exception.
														
 
															 //   If mustExist is false, then failure to load will return an empty string.
														
 
															 String string_load(const ReadableString& filename, bool mustExist = true);
														
 
															+// A version loading the text from a binary representation of the file's content instead of the filename
														
 
															+//   Makes it easier to test character encoding and load arbitrary files from archives.
														
 
															+String string_loadFromMemory(const Buffer &fileContent);
														
 
															+
														
 
															 // Side-effect: Saves content to filename.
														
 
															 void string_save(const ReadableString& filename, const ReadableString& content);
														
--- a/Source/test/tests/TextEncodingTest.cpp
+++ b/Source/test/tests/TextEncodingTest.cpp
@@ -0,0 +1,103 @@
 
															+
														
 
															+#include "../testTools.h"
														
 
															+
														
 
															+String expected_latin1 =
														
 
															+R"QUOTE(Hello my friend.
														
 
															+Hej min vän
														
 
															+Halló, vinur minn
														
 
															+Hei ystäväni
														
 
															+Hola mi amigo
														
 
															+Ciao amico
														
 
															+
														
 
															+This is Latin-1)QUOTE";
														
 
															+
														
 
															+String unicodeContent =
														
 
															+R"QUOTE(Hello my friend.
														
 
															+Hej min vän
														
 
															+Halló, vinur minn
														
 
															+Hei ystäväni
														
 
															+Hola mi amigo
														
 
															+Ciao amico
														
 
															+你好我的朋友
														
 
															+こんにちは、友よ
														
 
															+नमस्ते मेरो साथी
														
 
															+Talofa laʻu uo
														
 
															+Xin chào bạn của tôi
														
 
															+העלא מיין פרייַנד
														
 
															+안녕 내 친구
														
 
															+سلام دوست من
														
 
															+ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
														
 
															+ওহে, বন্ধু আমার
														
 
															+សួស្តីសម្លាញ់
														
 
															+Γεια σου φίλε μου
														
 
															+Привет, мой друг
														
 
															+здраво пријатељу
														
 
															+Բարեւ իմ ընկեր
														
 
															+ආයුබෝවන් මාගේ යාළුවා
														
 
															+ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
														
 
															+Silav hevalê min
														
 
															+اهلا صديقي
														
 
															+)QUOTE";
														
 
															+String expected_utf8 = unicodeContent + U"\nThis is UTF-8";
														
 
															+String expected_utf16le = unicodeContent + U"\nThis is UTF-16 Little Endian";
														
 
															+String expected_utf16be = unicodeContent + U"\nThis is UTF-8 Big Endian"; // NOTE(review): says UTF-8 in the expectation for UTF-16 BE, which looks like a copy-paste slip - confirm against the last line of BomUtf16Be.txt before enabling the test
														
 
															+
														
 
															+// Prints the 32 bits of value as ones and zeroes, beginning with the most significant bit.
															+void printCharacterCode(uint32_t value) {
															+	// Move a single bit mask from the highest bit down to the lowest instead of shifting the value
															+	for (uint32_t mask = 0x80000000u; mask != 0u; mask >>= 1) {
															+		if ((value & mask) != 0u) {
															+			printText(U"1");
															+		} else {
															+			printText(U"0");
															+		}
															+	}
															+}
														
 
															+
														
 
															+// Debug helper printing the binary character codes of two strings side by side.
															+//   One row is printed per index that exists in both strings, saying if the two codes are equal.
															+void compareCharacterCodes(String textA, String textB) {
															+	int lengthA = string_length(textA);
															+	int lengthB = string_length(textB);
															+	printText("Character codes for strings of length ", lengthA, U" and ", lengthB, U":\n");
															+	// Only the part that both strings have can be compared
															+	int sharedLength = lengthA;
															+	if (lengthB < sharedLength) {
															+		sharedLength = lengthB;
															+	}
															+	int index = 0;
															+	while (index < sharedLength) {
															+		uint32_t codeA = (uint32_t)textA[index];
															+		uint32_t codeB = (uint32_t)textB[index];
															+		printCharacterCode(codeA);
															+		if (codeA != codeB) {
															+			printText(U" != ");
															+		} else {
															+			printText(U" == ");
															+		}
															+		printCharacterCode(codeB);
															+		printText(U" (", textA[index], U") (", textB[index], U")\n");
															+		index++;
															+	}
															+}
														
 
															+
														
 
															+START_TEST(TextEncoding) // Loads text files stored in different encodings and compares them with the expected strings declared above
														
 
															+	{ // Text encodings stored in memory
														
 
															+		// TODO: Test string_loadFromMemory
														
 
															+		
														
 
															+		
														
 
															+	}
														
 
															+	{ // Loading strings of different encodings
														
 
															+		String folderPath = string_combine(U"test", file_separator(), U"tests", file_separator(), U"resources", file_separator()); // Relative path to the resource folder - presumably relative to the folder that the tests are started from, confirm
														
 
															+
														
 
															+		String fileLatin1 = string_load(folderPath + U"Latin1.txt", true); // mustExist is true, so a missing file throws instead of giving an empty string
														
 
															+		printText("Latin1.txt contains:\n", fileLatin1, "\n");
														
 
															+		compareCharacterCodes(fileLatin1, expected_latin1); // Prints the character codes of both strings side by side, to make a mismatch easier to debug
														
 
															+		ASSERT_MATCH(fileLatin1, expected_latin1);
														
 
															+
														
 
															+		String fileUTF8 = string_load(folderPath + U"BomUtf8.txt", true); // The file name suggests a byte order mark, which is what makes string_load decode the content as UTF-8 instead of Latin-1
														
 
															+		printText("BomUtf8.txt contains:\n", fileUTF8, "\n");
														
 
															+		compareCharacterCodes(fileUTF8, expected_utf8);
														
 
															+		ASSERT_MATCH(fileUTF8, expected_utf8);
														
 
															+		// The UTF-16 cases below are disabled, because string_load currently throws a "not yet supported" error for UTF-16 byte order marks
														
 
															+		//String fileUTF16LE = string_load(folderPath + U"BomUtf16Le.txt", true);
														
 
															+		//printText("BomUtf16Le.txt contains:\n", fileUTF16LE, "\n");
														
 
															+		//ASSERT_MATCH(fileUTF16LE, expected_utf16le);
														
 
															+
														
 
															+		//String fileUTF16BE = string_load(folderPath + U"BomUtf16Be.txt", true);
														
 
															+		//printText("BomUtf16Be.txt contains:\n", fileUTF16BE, "\n");
														
 
															+		//ASSERT_MATCH(fileUTF16BE, expected_utf16be);
														
 
															+	}
														
 
															+END_TEST
														
--- a/Source/test/tests/resources/BomUtf16Be.txt
+++ b/Source/test/tests/resources/BomUtf16Be.txt
--- a/Source/test/tests/resources/BomUtf16Le.txt
+++ b/Source/test/tests/resources/BomUtf16Le.txt
--- a/Source/test/tests/resources/BomUtf8.txt
+++ b/Source/test/tests/resources/BomUtf8.txt
@@ -0,0 +1,27 @@
 
															+Hello my friend.
														
 
															+Hej min vän
														
 
															+Halló, vinur minn
														
 
															+Hei ystäväni
														
 
															+Hola mi amigo
														
 
															+Ciao amico
														
 
															+你好我的朋友
														
 
															+こんにちは、友よ
														
 
															+नमस्ते मेरो साथी
														
 
															+Talofa laʻu uo
														
 
															+Xin chào bạn của tôi
														
 
															+העלא מיין פרייַנד
														
 
															+안녕 내 친구
														
 
															+سلام دوست من
														
 
															+ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
														
 
															+ওহে, বন্ধু আমার
														
 
															+សួស្តីសម្លាញ់
														
 
															+Γεια σου φίλε μου
														
 
															+Привет, мой друг
														
 
															+здраво пријатељу
														
 
															+Բարեւ իմ ընկեր
														
 
															+ආයුබෝවන් මාගේ යාළුවා
														
 
															+ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
														
 
															+Silav hevalê min
														
 
															+اهلا صديقي
														
 
															+
														
 
															+This is UTF-8
														
--- a/Source/test/tests/resources/Latin1.txt
+++ b/Source/test/tests/resources/Latin1.txt
@@ -0,0 +1,8 @@
 
															+Hello my friend.
														
 
															+Hej min vän
														
 
															+Halló, vinur minn
														
 
															+Hei ystäväni
														
 
															+Hola mi amigo
														
 
															+Ciao amico
														
 
															+
														
 
															+This is Latin-1