5 years ago · db2b210a36
--- a/Source/DFPSR/base/text.cpp
+++ b/Source/DFPSR/base/text.cpp
@@ -350,7 +350,7 @@ static void AppendStringFromFileBuffer_Latin1(String &target, const uint8_t* buf
 
															 }
														
 
															 // Appends the content of buffer as a BOM-free UTF-8 file into target
														
 
															 static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffer, int64_t fileLength) {
														
 
															-	// We know that the result will be at least one character per given byte for UTF-8
														
 
															+	// We know that the result will be at most one character per given byte for UTF-8
														
 
															 	target.reserve(string_length(target) + fileLength);
														
 
															 	for (int64_t i = 0; i < fileLength; i++) {
														
 
															 		uint8_t byteA = buffer[i];
														
@@ -387,33 +387,67 @@ static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffe
 
															 		}
														
 
															 	}
														
 
															 }
														
 
															+
														
 
															+// Reads the two bytes at buffer[startOffset] and buffer[startOffset + 1] as one 16-bit word.
															+//   LittleEndian = true:  The first byte holds the least significant 8 bits.
															+//   LittleEndian = false: The first byte holds the most significant 8 bits.
															+//   No bound checking is done here, so the caller must make sure that both bytes are inside of buffer.
															+//   The offset is 64 bits wide so that an int64_t file index can be passed without being truncated.
															+template <bool LittleEndian>
															+uint16_t read16bits(const uint8_t* buffer, int64_t startOffset) {
															+	uint16_t byteA = buffer[startOffset];
															+	uint16_t byteB = buffer[startOffset + 1];
															+	if (LittleEndian) {
															+		// Second byte is the most significant
															+		return static_cast<uint16_t>((byteB << 8) | byteA);
															+	} else {
															+		// First byte is the most significant
															+		return static_cast<uint16_t>((byteA << 8) | byteB);
															+	}
															+}
														
 
															+
														
 
															+// Appends the content of buffer as a BOM-free UTF-16 file into target
															+//   LittleEndian selects the byte order of each 16-bit word in buffer.
															+//   fileLength is the number of bytes in buffer. A trailing odd byte is ignored.
															+//   Surrogates that are not part of a valid high + low pair are replaced with a question mark,
															+//   so that malformed files can never cause a read outside of the given fileLength.
															+template <bool LittleEndian>
															+static void AppendStringFromFileBuffer_UTF16(String &target, const uint8_t* buffer, int64_t fileLength) {
															+	// We know that the result will be at most one character per two given bytes for UTF-16
															+	target.reserve(string_length(target) + (fileLength / 2));
															+	// Only complete 16-bit words are consumed, so that the second byte of a word is always inside of the buffer
															+	int64_t i = 0;
															+	while (i + 1 < fileLength) {
															+		// Read the first 16-bit word
															+		uint16_t wordA = read16bits<LittleEndian>(buffer, i);
															+		i += 2;
															+		if (wordA <= 0xD7FF || wordA >= 0xE000) {
															+			// Not in the range reserved for surrogates, just a single 16-bit character
															+			feedCharacterFromFile(target, (DsrChar)wordA);
															+		} else if (wordA <= 0xDBFF && i + 1 < fileLength) {
															+			// 110110xxxxxxxxxx is a high surrogate, which must be followed by a low surrogate 110111xxxxxxxxxx
															+			uint16_t wordB = read16bits<LittleEndian>(buffer, i);
															+			if (wordB >= 0xDC00 && wordB <= 0xDFFF) {
															+				i += 2;
															+				uint32_t higher10Bits = wordA & 0b1111111111;
															+				uint32_t lower10Bits = wordB & 0b1111111111;
															+				// The 20 payload bits store the code point minus 0x10000, because pairs begin where the 16-bit range ends
															+				// NOTE(review): The UTF-16 branch of encodeCharacterToStream splits the character without subtracting 0x10000,
															+				//               which has to be corrected as well for saved supplementary characters to load correctly.
															+				feedCharacterFromFile(target, (DsrChar)(0x10000 + ((higher10Bits << 10) | lower10Bits)));
															+			} else {
															+				// High surrogate without a low surrogate, wordB is decoded on its own in the next iteration
															+				feedCharacterFromFile(target, (DsrChar)U'?');
															+			}
															+		} else {
															+			// Low surrogate without a preceding high surrogate, or a high surrogate at the very end of the file
															+			feedCharacterFromFile(target, (DsrChar)U'?');
															+		}
															+	}
															+}
														
 
															 // Appends the content of buffer as a text file of unknown format into target
														
 
															 static void AppendStringFromFileBuffer(String &target, const uint8_t* buffer, int64_t fileLength) {
														
 
															 	// After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
														
 
															-	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
														
 
															+	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { // UTF-8
														
 
															 		AppendStringFromFileBuffer_UTF8(target, buffer + 3, fileLength - 3);
														
 
															-	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) {
														
 
															+	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) { // UTF-16 BE
														
 
															+		AppendStringFromFileBuffer_UTF16<false>(target, buffer + 2, fileLength - 2);
														
 
															+	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) { // UTF-16 LE
														
 
															+		AppendStringFromFileBuffer_UTF16<true>(target, buffer + 2, fileLength - 2);
														
 
															+	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) { // UTF-32 BE
														
 
															+		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
														
 
															+		throwError(U"UTF-32 BE format is not yet supported!\n");
														
 
															+	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) { // UTF-32 LE
														
 
															+		//AppendStringFromFileBuffer_UTF32LE(target, buffer + 4, fileLength - 4);
														
 
															+		throwError(U"UTF-32 LE format is not yet supported!\n");
														
 
															+	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) { // UTF-1
														
 
															 		//AppendStringFromFileBuffer_UTF1(target, buffer + 3, fileLength - 3);
														
 
															 		throwError(U"UTF-1 format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) {
														
 
															+	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) { // SCSU
														
 
															 		//AppendStringFromFileBuffer_SCSU(target, buffer + 3, fileLength - 3);
														
 
															 		throwError(U"SCSU format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) {
														
 
															+	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) { // BOCU
														
 
															 		//AppendStringFromFileBuffer_BOCU-1(target, buffer + 3, fileLength - 3);
														
 
															 		throwError(U"BOCU-1 format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) {
														
 
															-		//AppendStringFromFileBuffer_UTF16BE(target, buffer + 2, fileLength - 2);
														
 
															-		throwError(U"UTF-16 BE format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) {
														
 
															-		//AppendStringFromFileBuffer_UTF16LE(target, buffer + 2, fileLength - 2);
														
 
															-		throwError(U"UTF-16 LE format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) {
														
 
															-		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
														
 
															-		throwError(U"UTF-32 BE format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) {
														
 
															-		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
														
 
															-		throwError(U"UTF-32 LE format is not yet supported!\n");
														
 
															-	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) {
														
 
															+	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) { // UTF-7
														
 
															 		// Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
														
 
															 		throwError(U"UTF-7 format is not yet supported!\n");
														
 
															 	} else {
														
@@ -459,13 +493,17 @@ static inline void byteToStream(std::ostream &target, int value) {
 
															 	target.write((char*)&byte, 1);
														
 
															 }
														
 
															+// Replaces the variable named character in the calling scope with U'?' when its value does not fit within BIT_COUNT bits.
															+//   Wrapped in do-while so that "AT_MOST_BITS(x);" behaves as one statement, also directly inside of if-else.
															+#define AT_MOST_BITS(BIT_COUNT) do { if (character >= (1u << (BIT_COUNT))) { character = U'?'; } } while (false)
														
 
															+
														
 
															 template <CharacterEncoding characterEncoding>
														
 
															 static void encodeCharacterToStream(std::ostream &target, DsrChar character) {
														
 
															 	if (characterEncoding == CharacterEncoding::Raw_Latin1) {
														
 
															 		// Replace any illegal characters with questionmarks
														
 
															-		if (character > 255) { character = U'?'; }
														
 
															+		AT_MOST_BITS(8);
														
 
															 		byteToStream(target, character);
														
 
															 	} else if (characterEncoding == CharacterEncoding::BOM_UTF8) {
														
 
															+		// Replace any illegal characters with questionmarks
														
 
															+		AT_MOST_BITS(21);
														
 
															 		if (character < (1 << 7)) {
														
 
															 			// 0xxxxxxx
														
 
															 			byteToStream(target, character);
														
@@ -485,10 +523,39 @@ static void encodeCharacterToStream(std::ostream &target, DsrChar character) {
 
															 			byteToStream(target, 0b10000000 | ((character & (0b111111 << 6)) >> 6));
														
 
															 			byteToStream(target, 0b10000000 | (character & 0b111111));
														
 
															 		}
														
 
															-	} else if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
														
 
															-		throwError(U"Saving text files in UTF-16 BE is not yet implemented.\n");
														
 
															-	} else { // Assuming that characterEncoding == CharacterEncoding::BOM_UTF16LE
														
 
															-		throwError(U"Saving text files in UTF-16 LE is not yet implemented.\n");
														
 
															+	} else { // Assuming UTF-16
														
 
															+		AT_MOST_BITS(20);
														
 
															+		if (character <= 0xD7FF || (character >= 0xE000 && character <= 0xFFFF)) {
														
 
															+			// xxxxxxxx xxxxxxxx (Limited range)
														
 
															+			uint32_t higher8Bits = (character & 0b1111111100000000) >> 8;
														
 
															+			uint32_t lower8Bits  =  character & 0b0000000011111111;
														
 
															+			if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
														
 
															+				byteToStream(target, higher8Bits);
														
 
															+				byteToStream(target, lower8Bits);
														
 
															+			} else { // Assuming UTF-16 LE
														
 
															+				byteToStream(target, lower8Bits);
														
 
															+				byteToStream(target, higher8Bits);
														
 
															+			}
														
 
															+		} else if (character >= 0x010000 && character <= 0x10FFFF) {
														
 
															+			// 110110xxxxxxxxxx 110111xxxxxxxxxx
														
 
															+			uint32_t higher10Bits = (character & 0b11111111110000000000) >> 10;
														
 
															+			uint32_t lower10Bits  =  character & 0b00000000001111111111;
														
 
															+			uint32_t byteA = (0b110110 << 2) | ((higher10Bits & (0b11 << 8)) >> 8);
														
 
															+			uint32_t byteB = higher10Bits & 0b11111111;
														
 
															+			uint32_t byteC = (0b110111 << 2) | ((lower10Bits & (0b11 << 8)) >> 8);
														
 
															+			uint32_t byteD = lower10Bits & 0b11111111;
														
 
															+			if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
														
 
															+				byteToStream(target, byteA);
														
 
															+				byteToStream(target, byteB);
														
 
															+				byteToStream(target, byteC);
														
 
															+				byteToStream(target, byteD);
														
 
															+			} else { // Assuming UTF-16 LE
														
 
															+				byteToStream(target, byteB);
														
 
															+				byteToStream(target, byteA);
														
 
															+				byteToStream(target, byteD);
														
 
															+				byteToStream(target, byteC);
														
 
															+			}
														
 
															+		}
														
 
															 	}
														
 
															 }
														
--- a/Source/test/tests/TextEncodingTest.cpp
+++ b/Source/test/tests/TextEncodingTest.cpp
@@ -45,7 +45,7 @@ Silav hevalê min
 
															 )QUOTE";
														
 
															 String expected_utf8 = unicodeContent + U"\nThis is UTF-8";
														
 
															 String expected_utf16le = unicodeContent + U"\nThis is UTF-16 Little Endian";
														
 
															-String expected_utf16be = unicodeContent + U"\nThis is UTF-8 Big Endian";
														
 
															+String expected_utf16be = unicodeContent + U"\nThis is UTF-16 Big Endian";
														
 
															 void printCharacterCode(uint32_t value) {
														
 
															 	for (int i = 0; i < 32; i++) {
														
@@ -108,24 +108,35 @@ START_TEST(TextEncoding)
 
															 		//compareCharacterCodes(fileUTF8, expected_utf8);
														
 
															 		ASSERT_MATCH(fileUTF8, expected_utf8);
														
 
															-		//String fileUTF16LE = string_load(folderPath + U"BomUtf16Le.txt", true);
														
 
															-		//printText("BomUtf16Le.txt contains:\n", fileUTF16LE, "\n");
														
 
															-		//ASSERT_MATCH(fileUTF16LE, expected_utf16le);
														
 
															+		String fileUTF16LE = string_load(folderPath + U"BomUtf16Le.txt", true);
														
 
															+		//compareCharacterCodes(fileUTF16LE, expected_utf16le);
														
 
															+		ASSERT_MATCH(fileUTF16LE, expected_utf16le);
														
 
															-		//String fileUTF16BE = string_load(folderPath + U"BomUtf16Be.txt", true);
														
 
															-		//printText("BomUtf16Be.txt contains:\n", fileUTF16BE, "\n");
														
 
															-		//ASSERT_MATCH(fileUTF16BE, expected_utf16be);
														
 
															+		String fileUTF16BE = string_load(folderPath + U"BomUtf16Be.txt", true);
														
 
															+		//compareCharacterCodes(fileUTF16BE, expected_utf16be);
														
 
															+		ASSERT_MATCH(fileUTF16BE, expected_utf16be);
														
 
															 	}
														
 
															-	{ // Saving text to files
														
 
															+	{ // Saving and loading text to files using every combination of character and line encoding
														
 
															 		String originalContent = U"Hello my friend\n你好我的朋友";
														
 
															+		String latin1Content = U"Hello my friend\n??????";
														
 
															 		String tempPath = folderPath + U"Temporary.txt";
														
 
															-		
														
 
															-		// Latin-1 should write ? for complex characters
														
 
															-		string_save(tempPath, originalContent, CharacterEncoding::Raw_Latin1, LineEncoding::CrLf);
														
 
															-		ASSERT_MATCH(string_load(tempPath, true), U"Hello my friend\n??????");
														
 
															-		
														
 
															-		// UFT-8 should store the Chinese characters correctly
														
 
															-		string_save(tempPath, originalContent, CharacterEncoding::BOM_UTF8, LineEncoding::CrLf);
														
 
															-		ASSERT_MATCH(string_load(tempPath, true), originalContent);
														
 
															+		for (int i = 0; i < 2; i++) {
														
 
															+			LineEncoding lineEncoding = (i == 0) ? LineEncoding::CrLf : LineEncoding::Lf;
														
 
															+
														
 
															+			// Latin-1 should store up to 8 bits correctly, and write ? for complex characters
														
 
															+			string_save(tempPath, originalContent, CharacterEncoding::Raw_Latin1, lineEncoding);
														
 
															+			ASSERT_MATCH(string_load(tempPath, true), U"Hello my friend\n??????");
														
 
															+
														
 
															+			// UTF-8 should store up to 21 bits correctly
														
 
															+			string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF8, lineEncoding);
														
 
															+			ASSERT_MATCH(string_load(tempPath, true), unicodeContent);
														
 
															+
														
 
															+			// UTF-16 should store up to 20 bits correctly
														
 
															+			string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF16BE, lineEncoding);
														
 
															+			ASSERT_MATCH(string_load(tempPath, true), unicodeContent);
														
 
															+			string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF16LE, lineEncoding);
														
 
															+			ASSERT_MATCH(string_load(tempPath, true), unicodeContent);
														
 
															+			string_save(tempPath, U"This file is used when testing text encoding.");
														
 
															+		}
														
 
															 	}
														
 
															 END_TEST
														
--- a/Source/test/tests/resources/BomUtf16Be.txt
+++ b/Source/test/tests/resources/BomUtf16Be.txt
--- a/Source/test/tests/resources/BomUtf16Le.txt
+++ b/Source/test/tests/resources/BomUtf16Le.txt
--- a/Source/test/tests/resources/BomUtf8.txt
+++ b/Source/test/tests/resources/BomUtf8.txt
@@ -1,27 +1,27 @@
 
															-Hello my friend
														
 
															-Hej min vän
														
 
															-Halló, vinur minn
														
 
															-Hei ystäväni
														
 
															-Hola mi amigo
														
 
															-Ciao amico
														
 
															-你好我的朋友
														
 
															-こんにちは、友よ
														
 
															-नमस्ते मेरो साथी
														
 
															-Talofa laʻu uo
														
 
															-Xin chào bạn của tôi
														
 
															-העלא מיין פרייַנד
														
 
															-안녕 내 친구
														
 
															-سلام دوست من
														
 
															-ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
														
 
															-ওহে, বন্ধু আমার
														
 
															-សួស្តីសម្លាញ់
														
 
															-Γεια σου φίλε μου
														
 
															-Привет, мой друг
														
 
															-здраво пријатељу
														
 
															-Բարեւ իմ ընկեր
														
 
															-ආයුබෝවන් මාගේ යාළුවා
														
 
															-ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
														
 
															-Silav hevalê min
														
 
															-اهلا صديقي
														
 
															-
														
 
															+Hello my friend

														
 
															+Hej min vän

														
 
															+Halló, vinur minn

														
 
															+Hei ystäväni

														
 
															+Hola mi amigo

														
 
															+Ciao amico

														
 
															+你好我的朋友

														
 
															+こんにちは、友よ

														
 
															+नमस्ते मेरो साथी

														
 
															+Talofa laʻu uo

														
 
															+Xin chào bạn của tôi

														
 
															+העלא מיין פרייַנד

														
 
															+안녕 내 친구

														
 
															+سلام دوست من

														
 
															+ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ

														
 
															+ওহে, বন্ধু আমার

														
 
															+សួស្តីសម្លាញ់

														
 
															+Γεια σου φίλε μου

														
 
															+Привет, мой друг

														
 
															+здраво пријатељу

														
 
															+Բարեւ իմ ընկեր

														
 
															+ආයුබෝවන් මාගේ යාළුවා

														
 
															+ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ

														
 
															+Silav hevalê min

														
 
															+اهلا صديقي

														
 
															+

														
 
															 This is UTF-8
														
--- a/Source/test/tests/resources/Latin1.txt
+++ b/Source/test/tests/resources/Latin1.txt
@@ -1,6 +1,6 @@
 
															-Hello my friend
														
 
															-Hej min vän
														
 
															-Halló, vinur minn
														
 
															-Hei ystäväni
														
 
															-Hola mi amigo
														
 
															-Ciao amico
														
 
															+Hello my friend

														
 
															+Hej min vän

														
 
															+Halló, vinur minn

														
 
															+Hei ystäväni

														
 
															+Hola mi amigo

														
 
															+Ciao amico

														
--- a/Source/test/tests/resources/Temporary.txt
+++ b/Source/test/tests/resources/Temporary.txt
@@ -0,0 +1 @@
 
															+This file is used when testing text encoding.
	`@@ -0,0 +1 @@`
			`+This file is used when testing text encoding.`