5 years ago · db2b210a36
--- a/Source/DFPSR/base/text.cpp
+++ b/Source/DFPSR/base/text.cpp
@@ -350,7 +350,7 @@ static void AppendStringFromFileBuffer_Latin1(String &target, const uint8_t* buf
 
				 }
			
 
				 // Appends the content of buffer as a BOM-free UTF-8 file into target
			
 
				 static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffer, int64_t fileLength) {
			
 
				-	// We know that the result will be at least one character per given byte for UTF-8
			
 
				+	// We know that the result will be at most one character per given byte for UTF-8
			
 
				 	target.reserve(string_length(target) + fileLength);
			
 
				 	for (int64_t i = 0; i < fileLength; i++) {
			
 
				 		uint8_t byteA = buffer[i];
			
@@ -387,33 +387,67 @@ static void AppendStringFromFileBuffer_UTF8(String &target, const uint8_t* buffe
 
				 		}
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+template <bool LittleEndian>
			
 
				+uint16_t read16bits(const uint8_t* buffer, int startOffset) {
			
 
				+	uint16_t byteA = buffer[startOffset];
			
 
				+	uint16_t byteB = buffer[startOffset + 1];
			
 
				+	if (LittleEndian) {
			
 
				+		return (byteB << 8) | byteA;
			
 
				+	} else {
			
 
				+		return (byteA << 8) | byteB;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+// Appends the content of buffer as a BOM-free UTF-16 file into target
			
 
				+template <bool LittleEndian>
			
 
				+static void AppendStringFromFileBuffer_UTF16(String &target, const uint8_t* buffer, int64_t fileLength) {
			
 
				+	// We know that the result will be at most one character per two given bytes for UTF-16
			
 
				+	target.reserve(string_length(target) + (fileLength / 2));
			
 
				+	for (int64_t i = 0; i < fileLength; i += 2) {
			
 
				+		// Read the first 16-bit word
			
 
				+		uint16_t wordA = read16bits<LittleEndian>(buffer, i);
			
 
				+		// Check if another word is needed
			
 
				+		//   Assuming that wordA >= 0x0000 and wordA <= 0xFFFF as uint16_t,
			
 
				+		//   we can just check if it's within the range reserved for 32-bit encoding
			
 
				+		if (wordA <= 0xD7FF || wordA >= 0xE000) {
			
 
				+			// Not in the reserved range, just a single 16-bit character
			
 
				+			feedCharacterFromFile(target, (DsrChar)wordA);
			
 
				+		} else {
			
 
				+			// The given range was reserved and therefore using 32 bits
			
 
				+			i += 2;
			
 
				+			uint16_t wordB = read16bits<LittleEndian>(buffer, i);
			
 
				+			uint32_t higher10Bits = wordA & 0b1111111111;
			
 
				+			uint32_t lower10Bits = wordB & 0b1111111111;
			
 
				+			feedCharacterFromFile(target, (DsrChar)((higher10Bits << 10) | lower10Bits));
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				 // Appends the content of buffer as a text file of unknown format into target
			
 
				 static void AppendStringFromFileBuffer(String &target, const uint8_t* buffer, int64_t fileLength) {
			
 
				 	// After removing the BOM bytes, the rest can be seen as a BOM-free text file with a known format
			
 
				-	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
			
 
				+	if (fileLength >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { // UTF-8
			
 
				 		AppendStringFromFileBuffer_UTF8(target, buffer + 3, fileLength - 3);
			
 
				-	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) {
			
 
				+	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) { // UTF-16 BE
			
 
				+		AppendStringFromFileBuffer_UTF16<false>(target, buffer + 2, fileLength - 2);
			
 
				+	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) { // UTF-16 LE
			
 
				+		AppendStringFromFileBuffer_UTF16<true>(target, buffer + 2, fileLength - 2);
			
 
				+	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) { // UTF-32 BE
			
 
				+		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
			
 
				+		throwError(U"UTF-32 BE format is not yet supported!\n");
			
 
				+	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) { // UTF-32 LE
			
 
				+		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
			
 
				+		throwError(U"UTF-32 LE format is not yet supported!\n");
			
 
				+	} else if (fileLength >= 3 && buffer[0] == 0xF7 && buffer[1] == 0x64 && buffer[2] == 0x4C) { // UTF-1
			
 
				 		//AppendStringFromFileBuffer_UTF1(target, buffer + 3, fileLength - 3);
			
 
				 		throwError(U"UTF-1 format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) {
			
 
				+	} else if (fileLength >= 3 && buffer[0] == 0x0E && buffer[1] == 0xFE && buffer[2] == 0xFF) { // SCSU
			
 
				 		//AppendStringFromFileBuffer_SCSU(target, buffer + 3, fileLength - 3);
			
 
				 		throwError(U"SCSU format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) {
			
 
				+	} else if (fileLength >= 3 && buffer[0] == 0xFB && buffer[1] == 0xEE && buffer[2] == 0x28) { // BOCU
			
 
				 		//AppendStringFromFileBuffer_BOCU-1(target, buffer + 3, fileLength - 3);
			
 
				 		throwError(U"BOCU-1 format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) {
			
 
				-		//AppendStringFromFileBuffer_UTF16BE(target, buffer + 2, fileLength - 2);
			
 
				-		throwError(U"UTF-16 BE format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) {
			
 
				-		//AppendStringFromFileBuffer_UTF16LE(target, buffer + 2, fileLength - 2);
			
 
				-		throwError(U"UTF-16 LE format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF) {
			
 
				-		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
			
 
				-		throwError(U"UTF-32 BE format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00) {
			
 
				-		//AppendStringFromFileBuffer_UTF32BE(target, buffer + 4, fileLength - 4);
			
 
				-		throwError(U"UTF-32 LE format is not yet supported!\n");
			
 
				-	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) {
			
 
				+	} else if (fileLength >= 4 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76) { // UTF-7
			
 
				 		// Ignoring fourth byte with the dialect of UTF-7 when just showing the error message
			
 
				 		throwError(U"UTF-7 format is not yet supported!\n");
			
 
				 	} else {
			
@@ -459,13 +493,17 @@ static inline void byteToStream(std::ostream &target, int value) {
 
				 	target.write((char*)&byte, 1);
			
 
				 }
			
 
				 
			
 
				+#define AT_MOST_BITS(BIT_COUNT) if (character >= 1 << BIT_COUNT) { character = U'?'; }
			
 
				+
			
 
				 template <CharacterEncoding characterEncoding>
			
 
				 static void encodeCharacterToStream(std::ostream &target, DsrChar character) {
			
 
				 	if (characterEncoding == CharacterEncoding::Raw_Latin1) {
			
 
				 		// Replace any illegal characters with questionmarks
			
 
				-		if (character > 255) { character = U'?'; }
			
 
				+		AT_MOST_BITS(8);
			
 
				 		byteToStream(target, character);
			
 
				 	} else if (characterEncoding == CharacterEncoding::BOM_UTF8) {
			
 
				+		// Replace any illegal characters with questionmarks
			
 
				+		AT_MOST_BITS(21);
			
 
				 		if (character < (1 << 7)) {
			
 
				 			// 0xxxxxxx
			
 
				 			byteToStream(target, character);
			
@@ -485,10 +523,39 @@ static void encodeCharacterToStream(std::ostream &target, DsrChar character) {
 
				 			byteToStream(target, 0b10000000 | ((character & (0b111111 << 6)) >> 6));
			
 
				 			byteToStream(target, 0b10000000 | (character & 0b111111));
			
 
				 		}
			
 
				-	} else if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
			
 
				-		throwError(U"Saving text files in UTF-16 BE is not yet implemented.\n");
			
 
				-	} else { // Assuming that characterEncoding == CharacterEncoding::BOM_UTF16LE
			
 
				-		throwError(U"Saving text files in UTF-16 LE is not yet implemented.\n");
			
 
				+	} else { // Assuming UTF-16
			
 
				+		AT_MOST_BITS(20);
			
 
				+		if (character <= 0xD7FF || (character >= 0xE000 && character <= 0xFFFF)) {
			
 
				+			// xxxxxxxx xxxxxxxx (Limited range)
			
 
				+			uint32_t higher8Bits = (character & 0b1111111100000000) >> 8;
			
 
				+			uint32_t lower8Bits  =  character & 0b0000000011111111;
			
 
				+			if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
			
 
				+				byteToStream(target, higher8Bits);
			
 
				+				byteToStream(target, lower8Bits);
			
 
				+			} else { // Assuming UTF-16 LE
			
 
				+				byteToStream(target, lower8Bits);
			
 
				+				byteToStream(target, higher8Bits);
			
 
				+			}
			
 
				+		} else if (character >= 0x010000 && character <= 0x10FFFF) {
			
 
				+			// 110110xxxxxxxxxx 110111xxxxxxxxxx
			
 
				+			uint32_t higher10Bits = (character & 0b11111111110000000000) >> 10;
			
 
				+			uint32_t lower10Bits  =  character & 0b00000000001111111111;
			
 
				+			uint32_t byteA = (0b110110 << 2) | ((higher10Bits & (0b11 << 8)) >> 8);
			
 
				+			uint32_t byteB = higher10Bits & 0b11111111;
			
 
				+			uint32_t byteC = (0b110111 << 2) | ((lower10Bits & (0b11 << 8)) >> 8);
			
 
				+			uint32_t byteD = lower10Bits & 0b11111111;
			
 
				+			if (characterEncoding == CharacterEncoding::BOM_UTF16BE) {
			
 
				+				byteToStream(target, byteA);
			
 
				+				byteToStream(target, byteB);
			
 
				+				byteToStream(target, byteC);
			
 
				+				byteToStream(target, byteD);
			
 
				+			} else { // Assuming UTF-16 LE
			
 
				+				byteToStream(target, byteB);
			
 
				+				byteToStream(target, byteA);
			
 
				+				byteToStream(target, byteD);
			
 
				+				byteToStream(target, byteC);
			
 
				+			}
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/Source/test/tests/TextEncodingTest.cpp
+++ b/Source/test/tests/TextEncodingTest.cpp
@@ -45,7 +45,7 @@ Silav hevalê min
 
				 )QUOTE";
			
 
				 String expected_utf8 = unicodeContent + U"\nThis is UTF-8";
			
 
				 String expected_utf16le = unicodeContent + U"\nThis is UTF-16 Little Endian";
			
 
				-String expected_utf16be = unicodeContent + U"\nThis is UTF-8 Big Endian";
			
 
				+String expected_utf16be = unicodeContent + U"\nThis is UTF-16 Big Endian";
			
 
				 
			
 
				 void printCharacterCode(uint32_t value) {
			
 
				 	for (int i = 0; i < 32; i++) {
			
@@ -108,24 +108,35 @@ START_TEST(TextEncoding)
 
				 		//compareCharacterCodes(fileUTF8, expected_utf8);
			
 
				 		ASSERT_MATCH(fileUTF8, expected_utf8);
			
 
				 
			
 
				-		//String fileUTF16LE = string_load(folderPath + U"BomUtf16Le.txt", true);
			
 
				-		//printText("BomUtf16Le.txt contains:\n", fileUTF16LE, "\n");
			
 
				-		//ASSERT_MATCH(fileUTF16LE, expected_utf16le);
			
 
				+		String fileUTF16LE = string_load(folderPath + U"BomUtf16Le.txt", true);
			
 
				+		//compareCharacterCodes(fileUTF16LE, expected_utf16le);
			
 
				+		ASSERT_MATCH(fileUTF16LE, expected_utf16le);
			
 
				 
			
 
				-		//String fileUTF16BE = string_load(folderPath + U"BomUtf16Be.txt", true);
			
 
				-		//printText("BomUtf16Be.txt contains:\n", fileUTF16BE, "\n");
			
 
				-		//ASSERT_MATCH(fileUTF16BE, expected_utf16be);
			
 
				+		String fileUTF16BE = string_load(folderPath + U"BomUtf16Be.txt", true);
			
 
				+		//compareCharacterCodes(fileUTF16BE, expected_utf16be);
			
 
				+		ASSERT_MATCH(fileUTF16BE, expected_utf16be);
			
 
				 	}
			
 
				-	{ // Saving text to files
			
 
				+	{ // Saving and loading text to files using every combination of character and line encoding
			
 
				 		String originalContent = U"Hello my friend\n你好我的朋友";
			
 
				+		String latin1Content = U"Hello my friend\n??????";
			
 
				 		String tempPath = folderPath + U"Temporary.txt";
			
 
				-		
			
 
				-		// Latin-1 should write ? for complex characters
			
 
				-		string_save(tempPath, originalContent, CharacterEncoding::Raw_Latin1, LineEncoding::CrLf);
			
 
				-		ASSERT_MATCH(string_load(tempPath, true), U"Hello my friend\n??????");
			
 
				-		
			
 
				-		// UFT-8 should store the Chinese characters correctly
			
 
				-		string_save(tempPath, originalContent, CharacterEncoding::BOM_UTF8, LineEncoding::CrLf);
			
 
				-		ASSERT_MATCH(string_load(tempPath, true), originalContent);
			
 
				+		for (int i = 0; i < 2; i++) {
			
 
				+			LineEncoding lineEncoding = (i == 0) ? LineEncoding::CrLf : LineEncoding::Lf;
			
 
				+
			
 
				+			// Latin-1 should store up to 8 bits correctly, and write ? for complex characters
			
 
				+			string_save(tempPath, originalContent, CharacterEncoding::Raw_Latin1, lineEncoding);
			
 
				+			ASSERT_MATCH(string_load(tempPath, true), U"Hello my friend\n??????");
			
 
				+
			
 
				+			// UFT-8 should store up to 21 bits correctly
			
 
				+			string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF8, lineEncoding);
			
 
				+			ASSERT_MATCH(string_load(tempPath, true), unicodeContent);
			
 
				+
			
 
				+			// UFT-16 should store up to 20 bits correctly
			
 
				+			string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF16BE, lineEncoding);
			
 
				+			ASSERT_MATCH(string_load(tempPath, true), unicodeContent);
			
 
				+			string_save(tempPath, unicodeContent, CharacterEncoding::BOM_UTF16LE, lineEncoding);
			
 
				+			ASSERT_MATCH(string_load(tempPath, true), unicodeContent);
			
 
				+			string_save(tempPath, U"This file is used when testing text encoding.");
			
 
				+		}
			
 
				 	}
			
 
				 END_TEST
			
--- a/Source/test/tests/resources/BomUtf16Be.txt
+++ b/Source/test/tests/resources/BomUtf16Be.txt
--- a/Source/test/tests/resources/BomUtf16Le.txt
+++ b/Source/test/tests/resources/BomUtf16Le.txt
--- a/Source/test/tests/resources/BomUtf8.txt
+++ b/Source/test/tests/resources/BomUtf8.txt
@@ -1,27 +1,27 @@
 
				-Hello my friend
			
 
				-Hej min vän
			
 
				-Halló, vinur minn
			
 
				-Hei ystäväni
			
 
				-Hola mi amigo
			
 
				-Ciao amico
			
 
				-你好我的朋友
			
 
				-こんにちは、友よ
			
 
				-नमस्ते मेरो साथी
			
 
				-Talofa laʻu uo
			
 
				-Xin chào bạn của tôi
			
 
				-העלא מיין פרייַנד
			
 
				-안녕 내 친구
			
 
				-سلام دوست من
			
 
				-ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ
			
 
				-ওহে, বন্ধু আমার
			
 
				-សួស្តីសម្លាញ់
			
 
				-Γεια σου φίλε μου
			
 
				-Привет, мой друг
			
 
				-здраво пријатељу
			
 
				-Բարեւ իմ ընկեր
			
 
				-ආයුබෝවන් මාගේ යාළුවා
			
 
				-ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ
			
 
				-Silav hevalê min
			
 
				-اهلا صديقي
			
 
				-
			
 
				+Hello my friend

			
 
				+Hej min vän

			
 
				+Halló, vinur minn

			
 
				+Hei ystäväni

			
 
				+Hola mi amigo

			
 
				+Ciao amico

			
 
				+你好我的朋友

			
 
				+こんにちは、友よ

			
 
				+नमस्ते मेरो साथी

			
 
				+Talofa laʻu uo

			
 
				+Xin chào bạn của tôi

			
 
				+העלא מיין פרייַנד

			
 
				+안녕 내 친구

			
 
				+سلام دوست من

			
 
				+ਹੈਲੋ ਮੇਰੇ ਦੋਸਤ

			
 
				+ওহে, বন্ধু আমার

			
 
				+សួស្តីសម្លាញ់

			
 
				+Γεια σου φίλε μου

			
 
				+Привет, мой друг

			
 
				+здраво пријатељу

			
 
				+Բարեւ իմ ընկեր

			
 
				+ආයුබෝවන් මාගේ යාළුවා

			
 
				+ಹಲೋ ನನ್ನ ಸ್ನೇಹಿತನೇ

			
 
				+Silav hevalê min

			
 
				+اهلا صديقي

			
 
				+

			
 
				 This is UTF-8
			
--- a/Source/test/tests/resources/Latin1.txt
+++ b/Source/test/tests/resources/Latin1.txt
@@ -1,6 +1,6 @@
 
				-Hello my friend
			
 
				-Hej min vän
			
 
				-Halló, vinur minn
			
 
				-Hei ystäväni
			
 
				-Hola mi amigo
			
 
				-Ciao amico
			
 
				+Hello my friend

			
 
				+Hej min vän

			
 
				+Halló, vinur minn

			
 
				+Hei ystäväni

			
 
				+Hola mi amigo

			
 
				+Ciao amico

			
--- a/Source/test/tests/resources/Temporary.txt
+++ b/Source/test/tests/resources/Temporary.txt
@@ -0,0 +1 @@
 
				+This file is used when testing text encoding.
		`@@ -0,0 +1 @@`
		`+This file is used when testing text encoding.`