Browse Source

Almost loading Latin-1 encoding.

David Piuva 5 years ago
parent
commit
b8c3235ee4

+ 6 - 2
Source/DFPSR/base/Buffer.h

@@ -44,16 +44,20 @@ public:
 	~Buffer();
 	~Buffer();
 public:
 public:
 	void replaceDestructor(const std::function<void(uint8_t *)>& newDestructor);
 	void replaceDestructor(const std::function<void(uint8_t *)>& newDestructor);
+	// Set all bytes to the same value
 	void set(uint8_t value);
 	void set(uint8_t value);
+	// Get a dangerous pointer to the raw data
 	uint8_t *getUnsafeData() {
 	uint8_t *getUnsafeData() {
 		return this->data;
 		return this->data;
 	}
 	}
-	// Get the buffer
+	// Read-only overload for const buffers, returning the same raw data pointer as pointer-to-const
+	const uint8_t *getUnsafeData() const {
+		return this->data;
+	}
+	// Get a safe pointer to the raw data
 	template <typename T>
 	template <typename T>
 	SafePointer<T> getSafeData(const char *name) {
 	SafePointer<T> getSafeData(const char *name) {
 		return SafePointer<T>(name, (T*)this->data, this->bufferSize, (T*)this->data);
 		return SafePointer<T>(name, (T*)this->data, this->bufferSize, (T*)this->data);
 	}
 	}
-	// Get the buffer
 	template <typename T>
 	template <typename T>
 	const SafePointer<T> getSafeData(const char *name) const {
 	const SafePointer<T> getSafeData(const char *name) const {
 		return SafePointer<T>(name, (T*)this->data, this->bufferSize, (T*)this->data);
 		return SafePointer<T>(name, (T*)this->data, this->bufferSize, (T*)this->data);

+ 144 - 0
Source/DFPSR/base/text.cpp

@@ -333,6 +333,7 @@ static void doubleToString_arabic(String& target, double value) {
 	} \
 	} \
 	TARGET[SOURCE.length()] = '\0';
 	TARGET[SOURCE.length()] = '\0';
 
 
+/*
 String dsr::string_load(const ReadableString& filename, bool mustExist) {
 String dsr::string_load(const ReadableString& filename, bool mustExist) {
 	// TODO: Load files using Unicode filenames
 	// TODO: Load files using Unicode filenames
 	TO_RAW_ASCII(asciiFilename, filename);
 	TO_RAW_ASCII(asciiFilename, filename);
@@ -357,6 +358,128 @@ String dsr::string_load(const ReadableString& filename, bool mustExist) {
 		return String();
 		return String();
 	}
 	}
 }
 }
+*/
+
+// TODO: Give as a lambda with target captured, so that pre-allocation can measure the
+//       needed space exactly using a lambda that increases a character counter instead.
+// Receives one decoded character from a text file and appends it to target.
+//   Carriage-return characters are dropped instead of being appended.
+static void feedCharacterFromFile(String &target, DsrChar character) {
+	if (character == U'\r') {
+		// Skipped on purpose
+		return;
+	}
+	target.appendChar(character);
+}
+
+// Appends the content of buffer as a BOM-free Latin-1 file into target
+//   buffer must point to fileLength readable bytes.
+//   Each byte is used directly as the character code given to feedCharacterFromFile.
+static void AppendStringFromFileBuffer_Latin1(String &target, const uint8_t* buffer, int64_t fileLength) {
+	// Latin-1 gives at most one character per byte, so the needed space can be reserved once
+	//   in the same way as the UTF-8 decoder does, instead of growing the string repeatedly
+	target.reserve(string_length(target) + fileLength);
+	for (int64_t i = 0; i < fileLength; i++) {
+		feedCharacterFromFile(target, (DsrChar)(buffer[i]));
+	}
+}
+// Appends the content of buffer as a BOM-free UTF-8 file into target
+//   buffer must point to fileLength readable bytes.
+//   Decodes sequences of one to four bytes and gives each character to feedCharacterFromFile.
+//   Calls throwError when a sequence begins with an invalid byte or is cut off by the end of the buffer.
+static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffer, int64_t fileLength) {
+	// Every character takes at least one byte in UTF-8, so fileLength is an upper bound for the number of new characters
+	target.reserve(string_length(target) + fileLength);
+	for (int64_t i = 0; i < fileLength; i++) {
+		uint8_t byteA = buffer[i];
+		if (byteA < 0b10000000) {
+			// Single byte (0xxxxxxx)
+			feedCharacterFromFile(target, (DsrChar)byteA);
+		} else {
+			uint32_t character = 0;
+			int extraBytes = 0;
+			if (byteA >= 0b11000000) { // At least two leading ones
+				if (byteA < 0b11100000) { // Less than three leading ones (110xxxxx)
+					character = byteA & 0b00011111;
+					extraBytes = 1;
+				} else if (byteA < 0b11110000) { // Less than four leading ones (1110xxxx)
+					character = byteA & 0b00001111;
+					extraBytes = 2;
+				} else if (byteA < 0b11111000) { // Less than five leading ones (11110xxx)
+					// Only three payload bits remain, the fourth leading one belongs to the length prefix
+					character = byteA & 0b00000111;
+					extraBytes = 3;
+				} else {
+					// Invalid UTF-8 format
+					throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b111111xx!");
+				}
+			} else {
+				// Invalid UTF-8 format
+				throwError(U"Invalid UTF-8 multi-chatacter beginning with 0b10xxxxxx!");
+			}
+			// Refuse to read continuation bytes from outside of the buffer
+			if (extraBytes > fileLength - 1 - i) {
+				throwError(U"Invalid UTF-8 multi-byte character cut off by the end of the file!");
+				// Leave instead of reading out of bound, in case throwError would return
+				return;
+			}
+			while (extraBytes > 0) {
+				// Each continuation byte (10xxxxxx) contributes six more bits
+				i += 1; uint32_t nextByte = buffer[i];
+				character = (character << 6) | (nextByte & 0b00111111);
+				extraBytes--;
+			}
+			feedCharacterFromFile(target, (DsrChar)character);
+		}
+	}
+}
+// Appends the content of buffer as a text file of unknown format into target
+//   The format is selected from the byte order mark (BOM) in the first bytes of buffer.
+//   UTF-8 with BOM is decoded as UTF-8 and content without a known BOM is decoded as Latin-1.
+//   Any other detected format calls throwError, because it is not yet supported.
+static void AppendStringFromFileBuffer(String &target, const uint8_t* buffer, int64_t fileLength) {
+	// After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
+	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
+		AppendStringFromFileBuffer_UTF8(target, buffer + 3, fileLength - 3);
+	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) {
+		//AppendStringFromFileBuffer_UTF1(target, buffer + 3, fileLength - 3);
+		throwError(U"UTF-1 format is not yet supported!");
+	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) {
+		//AppendStringFromFileBuffer_SCSU(target, buffer + 3, fileLength - 3);
+		throwError(U"SCSU format is not yet supported!");
+	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) {
+		//AppendStringFromFileBuffer_BOCU-1(target, buffer + 3, fileLength - 3);
+		throwError(U"BOCU-1 format is not yet supported!");
+	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) {
+		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
+		throwError(U"UTF-32 BE format is not yet supported!");
+	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) {
+		// Must be tested before UTF-16 LE, because the UTF-32 LE mark begins with the two bytes of the UTF-16 LE mark
+		//AppendStringFromFileBuffer_UTF32LE(target, buffer + 4, fileLength - 4);
+		throwError(U"UTF-32 LE format is not yet supported!");
+	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) {
+		//AppendStringFromFileBuffer_UTF16BE(target, buffer + 2, fileLength - 2);
+		throwError(U"UTF-16 BE format is not yet supported!");
+	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) {
+		//AppendStringFromFileBuffer_UTF16LE(target, buffer + 2, fileLength - 2);
+		throwError(U"UTF-16 LE format is not yet supported!");
+	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) {
+		// Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
+		throwError(U"UTF-7 format is not yet supported!");
+	} else {
+		// No BOM detected, assuming Latin-1 (because it directly corresponds to a unicode sub-set)
+		AppendStringFromFileBuffer_Latin1(target, buffer, fileLength);
+	}
+}
+
+// Decodes the bytes held by fileContent as a text file of unknown format and returns the text.
+String dsr::string_loadFromMemory(const Buffer &fileContent) {
+	const uint8_t* rawBytes = fileContent.getUnsafeData();
+	String decoded;
+	AppendStringFromFileBuffer(decoded, rawBytes, fileContent.size);
+	return decoded;
+}
+
+// Loads a text file of unknown format
+//   Removes carriage-return characters to make processing easy with only line-feed for breaking lines.
+//   If the file cannot be opened or measured, throwError is called when mustExist is true
+//   and an empty string is returned when mustExist is false.
+String dsr::string_load(const ReadableString& filename, bool mustExist) {
+	// TODO: Load files using Unicode filenames when available
+	TO_RAW_ASCII(asciiFilename, filename);
+	std::ifstream fileStream(asciiFilename, std::ios_base::in | std::ios_base::binary);
+	if (fileStream.is_open()) {
+		String result;
+		// Get the file's length
+		fileStream.seekg (0, fileStream.end);
+		int64_t fileLength = fileStream.tellg();
+		fileStream.seekg (0, fileStream.beg);
+		if (fileLength < 0) {
+			// tellg gives -1 on failure, which must not be used as an allocation size
+			if (mustExist) {
+				throwError(U"The text file ", filename, U" could not be measured for reading.\n");
+			}
+			return String();
+		}
+		if (fileLength > 0) {
+			// Allocate an array for the raw encoding
+			uint8_t* buffer = (uint8_t*)malloc(fileLength);
+			if (buffer == nullptr) {
+				throwError(U"Failed to allocate memory for loading the text file ", filename, U".\n");
+				return String();
+			}
+			fileStream.read((char*)buffer, fileLength);
+			// Only decode the bytes that were actually read, so that uninitialized memory is never interpreted as text
+			int64_t bytesRead = fileStream.gcount();
+			try {
+				AppendStringFromFileBuffer(result, buffer, bytesRead);
+			} catch (...) {
+				// Decoding may throw for unsupported or invalid encodings, so release the buffer before passing the error on
+				free(buffer);
+				throw;
+			}
+			free(buffer);
+		}
+		return result;
+	} else {
+		if (mustExist) {
+			throwError(U"The text file ", filename, U" could not be opened for reading.\n");
+		}
+		// If the file could not be found and opened, a null string is returned
+		return String();
+	}
+}
 
 
 void dsr::string_save(const ReadableString& filename, const ReadableString& content) {
 void dsr::string_save(const ReadableString& filename, const ReadableString& content) {
 	// TODO: Load files using Unicode filenames
 	// TODO: Load files using Unicode filenames
@@ -371,6 +494,27 @@ void dsr::string_save(const ReadableString& filename, const ReadableString& cont
 		throwError("Failed to save ", filename, "\n");
 		throwError("Failed to save ", filename, "\n");
 	}
 	}
 }
 }
+/*
+// TODO: Choose how to encode characters and line endings using enums
+class enum textEncoding {
+	UTF1, UTF7, UTF8, UTF16BE, UTF16LE, UTF32BE, UTF32LE, UTF-EBCDIC, SCSU, BOCU1, GB18030
+};
+* class enum lineEncoding {
+	UTF1, UTF7, UTF8, UTF16BE, UTF16LE, UTF32BE, UTF32LE, UTF-EBCDIC, SCSU, BOCU1, GB18030
+};
+void dsr::string_save(const ReadableString& filename, const ReadableString& content) {
+	// TODO: Load files using Unicode filenames
+	TO_RAW_ASCII(asciiFilename, filename);
+	TO_RAW_ASCII(asciiContent, content);
+	std::ofstream fileStream(asciiFilename, std::ios_base::out | std::ios_base::binary);
+	if (fileStream.is_open()) {
+		fileStream << asciiContent;
+		fileStream.close();
+	} else {
+		throwError("Failed to save ", filename, "\n");
+	}
+}
+*/
 
 
 const char32_t* dsr::file_separator() {
 const char32_t* dsr::file_separator() {
 	#ifdef _WIN32
 	#ifdef _WIN32

+ 4 - 0
Source/DFPSR/base/text.h

@@ -278,6 +278,10 @@ double string_toDouble(const ReadableString& source);
 //   If mustExist is true, then failure to load will throw an exception.
 //   If mustExist is true, then failure to load will throw an exception.
 //   If mustExist is false, then failure to load will return an empty string.
 //   If mustExist is false, then failure to load will return an empty string.
 String string_load(const ReadableString& filename, bool mustExist = true);
 String string_load(const ReadableString& filename, bool mustExist = true);
+// A version loading the text from a binary representation of the file's content instead of the filename
+//   Makes it easier to test character encoding and load arbitrary files from archives.
+String string_loadFromMemory(const Buffer &fileContent);
+
 // Side-effect: Saves content to filename.
 // Side-effect: Saves content to filename.
 void string_save(const ReadableString& filename, const ReadableString& content);
 void string_save(const ReadableString& filename, const ReadableString& content);
 
 

+ 103 - 0
Source/test/tests/TextEncodingTest.cpp

@@ -0,0 +1,103 @@
+
+#include "../testTools.h"
+
+String expected_latin1 =
+R"QUOTE(Hello my friend.
+Hej min vän
+Halló, vinur minn
+Hei ystäväni
+Hola mi amigo
+Ciao amico
+
+This is Latin-1)QUOTE";
+
+String unicodeContent =
+R"QUOTE(Hello my friend.
+Hej min vän
+Halló, vinur minn
+Hei ystäväni
+Hola mi amigo
+Ciao amico
+你好我的朋友
+こんにちは、友よ
+नमस्ते मेरो साथी
+Talofa laʻu uo
+Xin chào bạn của tôi
+העלא מיין פרייַנד
+안녕 내 친구
+سلام دوست من
+ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
+ওহে, বন্ধু আমার
+សួស្តី​សម្លាញ់
+Γεια σου φίλε μου
+Привет, мой друг
+здраво пријатељу
+Բարեւ իմ ընկեր
+ආයුබෝවන් මාගේ යාළුවා
+ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
+Silav hevalê min
+اهلا صديقي
+)QUOTE";
+// Expected results for the encoded test files: the shared Unicode text followed by a line naming the encoding
+String expected_utf8 = unicodeContent + U"\nThis is UTF-8";
+String expected_utf16le = unicodeContent + U"\nThis is UTF-16 Little Endian";
+// NOTE(review): the text says "UTF-8 Big Endian" while the variable is for UTF-16 BE, presumably a copy-paste slip; confirm against the content of BomUtf16Be.txt
+String expected_utf16be = unicodeContent + U"\nThis is UTF-8 Big Endian";
+
+// Prints the 32 bits of value as ones and zeroes, starting with the most significant bit
+void printCharacterCode(uint32_t value) {
+	for (int bit = 31; bit >= 0; bit--) {
+		if (((value >> bit) & 1u) == 0u) {
+			printText(U"0");
+		} else {
+			printText(U"1");
+		}
+	}
+}
+
+// Debug helper printing the character codes of two strings side by side in binary
+//   Only the characters up to the length of the shorter string are compared.
+void compareCharacterCodes(String textA, String textB) {
+	int lengthA = string_length(textA);
+	int lengthB = string_length(textB);
+	printText("Character codes for strings of length ", lengthA, U" and ", lengthB, U":\n");
+	// Stop at the end of the shortest string
+	int sharedLength = lengthB;
+	if (lengthA < lengthB) {
+		sharedLength = lengthA;
+	}
+	for (int index = 0; index < sharedLength; index++) {
+		uint32_t left = (uint32_t)textA[index];
+		uint32_t right = (uint32_t)textB[index];
+		printCharacterCode(left);
+		if (left != right) {
+			printText(U" != ");
+		} else {
+			printText(U" == ");
+		}
+		printCharacterCode(right);
+		printText(U" (", textA[index], U") (", textB[index], U")\n");
+	}
+}
+
+// Regression test loading text files of different encodings and comparing them with the expected strings
+START_TEST(TextEncoding)
+	{ // Text encodings stored in memory
+		// TODO: Test string_loadFromMemory
+		
+		
+	}
+	{ // Loading strings of different encodings
+		// Path to the folder with test files, relative to the working directory
+		String folderPath = string_combine(U"test", file_separator(), U"tests", file_separator(), U"resources", file_separator());

+		// Each loaded file is printed and compared character by character before asserting, to aid debugging
+		String fileLatin1 = string_load(folderPath + U"Latin1.txt", true);
+		printText("Latin1.txt contains:\n", fileLatin1, "\n");
+		compareCharacterCodes(fileLatin1, expected_latin1);
+		ASSERT_MATCH(fileLatin1, expected_latin1);

+		String fileUTF8 = string_load(folderPath + U"BomUtf8.txt", true);
+		printText("BomUtf8.txt contains:\n", fileUTF8, "\n");
+		compareCharacterCodes(fileUTF8, expected_utf8);
+		ASSERT_MATCH(fileUTF8, expected_utf8);

+		// The UTF-16 cases below are commented out
+		//String fileUTF16LE = string_load(folderPath + U"BomUtf16Le.txt", true);
+		//printText("BomUtf16Le.txt contains:\n", fileUTF16LE, "\n");
+		//ASSERT_MATCH(fileUTF16LE, expected_utf16le);

+		//String fileUTF16BE = string_load(folderPath + U"BomUtf16Be.txt", true);
+		//printText("BomUtf16Be.txt contains:\n", fileUTF16BE, "\n");
+		//ASSERT_MATCH(fileUTF16BE, expected_utf16be);
+	}
+END_TEST

BIN
Source/test/tests/resources/BomUtf16Be.txt


BIN
Source/test/tests/resources/BomUtf16Le.txt


+ 27 - 0
Source/test/tests/resources/BomUtf8.txt

@@ -0,0 +1,27 @@
+Hello my friend.
+Hej min vän
+Halló, vinur minn
+Hei ystäväni
+Hola mi amigo
+Ciao amico
+你好我的朋友
+こんにちは、友よ
+नमस्ते मेरो साथी
+Talofa laʻu uo
+Xin chào bạn của tôi
+העלא מיין פרייַנד
+안녕 내 친구
+سلام دوست من
+ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
+ওহে, বন্ধু আমার
+សួស្តី​សម្លាញ់
+Γεια σου φίλε μου
+Привет, мой друг
+здраво пријатељу
+Բարեւ իմ ընկեր
+ආයුබෝවන් මාගේ යාළුවා
+ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
+Silav hevalê min
+اهلا صديقي
+
+This is UTF-8

+ 8 - 0
Source/test/tests/resources/Latin1.txt

@@ -0,0 +1,8 @@
+Hello my friend.
+Hej min vän
+Halló, vinur minn
+Hei ystäväni
+Hola mi amigo
+Ciao amico
+
+This is Latin-1