|
@@ -37,11 +37,11 @@ import com.jme3.export.SavableClassUtil;
|
|
import com.jme3.util.BufferUtils;
|
|
import com.jme3.util.BufferUtils;
|
|
import com.jme3.util.IntMap;
|
|
import com.jme3.util.IntMap;
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
-import java.io.UnsupportedEncodingException;
|
|
|
|
import java.nio.ByteBuffer;
|
|
import java.nio.ByteBuffer;
|
|
import java.nio.FloatBuffer;
|
|
import java.nio.FloatBuffer;
|
|
import java.nio.IntBuffer;
|
|
import java.nio.IntBuffer;
|
|
import java.nio.ShortBuffer;
|
|
import java.nio.ShortBuffer;
|
|
|
|
+import java.nio.charset.StandardCharsets;
|
|
import java.util.ArrayList;
|
|
import java.util.ArrayList;
|
|
import java.util.BitSet;
|
|
import java.util.BitSet;
|
|
import java.util.HashMap;
|
|
import java.util.HashMap;
|
|
@@ -1013,129 +1013,17 @@ final class BinaryInputCapsule implements InputCapsule {
|
|
return value;
|
|
return value;
|
|
}
|
|
}
|
|
|
|
|
|
- /*
|
|
|
|
- * UTF-8 crash course:
|
|
|
|
- *
|
|
|
|
- * UTF-8 codepoints map to UTF-16 codepoints and vv, which is what Java uses for its Strings.
|
|
|
|
- * (so a UTF-8 codepoint can contain all possible values for a Java char)
|
|
|
|
- *
|
|
|
|
- * A UTF-8 codepoint can be 1, 2 or 3 bytes long. How long a codepint is can be told by reading the first byte:
|
|
|
|
- * b < 0x80, 1 byte
|
|
|
|
- * (b & 0xC0) == 0xC0, 2 bytes
|
|
|
|
- * (b & 0xE0) == 0xE0, 3 bytes
|
|
|
|
- *
|
|
|
|
- * However there is an additional restriction to UTF-8, to enable you to find the start of a UTF-8 codepoint,
|
|
|
|
- * if you start reading at a random point in a UTF-8 byte stream. That's why UTF-8 requires for the second and third byte of
|
|
|
|
- * a multibyte codepoint:
|
|
|
|
- * (b & 0x80) == 0x80 (in other words, first bit must be 1)
|
|
|
|
- */
|
|
|
|
- private final static int UTF8_START = 0; // next byte should be the start of a new
|
|
|
|
- private final static int UTF8_2BYTE = 2; // next byte should be the second byte of a 2 byte codepoint
|
|
|
|
- private final static int UTF8_3BYTE_1 = 3; // next byte should be the second byte of a 3 byte codepoint
|
|
|
|
- private final static int UTF8_3BYTE_2 = 4; // next byte should be the third byte of a 3 byte codepoint
|
|
|
|
- private final static int UTF8_ILLEGAL = 10; // not an UTF8 string
|
|
|
|
-
|
|
|
|
- // String
|
|
|
|
protected String readString(byte[] content) throws IOException {
|
|
protected String readString(byte[] content) throws IOException {
|
|
int length = readInt(content);
|
|
int length = readInt(content);
|
|
if (length == BinaryOutputCapsule.NULL_OBJECT)
|
|
if (length == BinaryOutputCapsule.NULL_OBJECT)
|
|
return null;
|
|
return null;
|
|
|
|
|
|
- /*
|
|
|
|
- * @see ISSUE 276
|
|
|
|
- *
|
|
|
|
- * We'll transfer the bytes into a separate byte array.
|
|
|
|
- * While we do that we'll take the opportunity to check if the byte data is valid UTF-8.
|
|
|
|
- *
|
|
|
|
- * If it is not UTF-8 it is most likely saved with the BinaryOutputCapsule bug, that saves Strings using their native
|
|
|
|
- * encoding. Unfortunatly there is no way to know what encoding was used, so we'll parse using the most common one in
|
|
|
|
- * that case; latin-1 aka ISO8859_1
|
|
|
|
- *
|
|
|
|
- * Encoding of "low" ASCII codepoint (in plain speak: when no special characters are used) will usually look the same
|
|
|
|
- * for UTF-8 and the other 1 byte codepoint encodings (espc true for numbers and regular letters of the alphabet). So these
|
|
|
|
- * are valid UTF-8 and will give the same result (at most a few charakters will appear different, such as the euro sign).
|
|
|
|
- *
|
|
|
|
- * However, when "high" codepoints are used (any codepoint that over 0x7F, in other words where the first bit is a 1) it's
|
|
|
|
- * a different matter and UTF-8 and the 1 byte encoding greatly will differ, as well as most 1 byte encodings relative to each
|
|
|
|
- * other.
|
|
|
|
- *
|
|
|
|
- * It is impossible to detect which one-byte encoding is used. Since UTF8 and practically all 1-byte encodings share the most
|
|
|
|
- * used characters (the "none-high" ones) parsing them will give the same result. However, not all byte sequences are legal in
|
|
|
|
- * UTF-8 (see explantion above). If not UTF-8 encoded content is detected we therefore fall back on latin1. We also log a warning.
|
|
|
|
- *
|
|
|
|
- * By this method we detect all use of 1 byte encoding if they:
|
|
|
|
- * - use a "high" codepoint after a "low" codepoint or a sequence of codepoints that is valid as UTF-8 bytes, that starts with 1000
|
|
|
|
- * - use a "low" codepoint after a "high" codepoint
|
|
|
|
- * - use a "low" codepoint after "high" codepoint, after a "high" codepoint that starts with 1110
|
|
|
|
- *
|
|
|
|
- * In practise this means that unless 2 or 3 "high" codepoints are used after each other in proper order, we'll detect the string
|
|
|
|
- * was not originally UTF-8 encoded.
|
|
|
|
- *
|
|
|
|
- */
|
|
|
|
byte[] bytes = new byte[length];
|
|
byte[] bytes = new byte[length];
|
|
- int utf8State = UTF8_START;
|
|
|
|
- int b;
|
|
|
|
for (int x = 0; x < length; x++) {
|
|
for (int x = 0; x < length; x++) {
|
|
bytes[x] = content[index++];
|
|
bytes[x] = content[index++];
|
|
- b = (int) bytes[x] & 0xFF; // unsign our byte
|
|
|
|
-
|
|
|
|
- switch (utf8State) {
|
|
|
|
- case UTF8_START:
|
|
|
|
- if (b < 0x80) {
|
|
|
|
- // good
|
|
|
|
- }
|
|
|
|
- else if ((b & 0xC0) == 0xC0) {
|
|
|
|
- utf8State = UTF8_2BYTE;
|
|
|
|
- }
|
|
|
|
- else if ((b & 0xE0) == 0xE0) {
|
|
|
|
- utf8State = UTF8_3BYTE_1;
|
|
|
|
- }
|
|
|
|
- else {
|
|
|
|
- utf8State = UTF8_ILLEGAL;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case UTF8_3BYTE_1:
|
|
|
|
- case UTF8_3BYTE_2:
|
|
|
|
- case UTF8_2BYTE:
|
|
|
|
- if ((b & 0x80) == 0x80)
|
|
|
|
- utf8State = utf8State == UTF8_3BYTE_1 ? UTF8_3BYTE_2 : UTF8_START;
|
|
|
|
- else
|
|
|
|
- utf8State = UTF8_ILLEGAL;
|
|
|
|
- break;
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
|
|
|
|
- try {
|
|
|
|
- // even though so far the parsing might have been a legal UTF-8 sequence, only if a codepoint is fully given is it correct UTF-8
|
|
|
|
- if (utf8State == UTF8_START) {
|
|
|
|
- // Java misspells UTF-8 as UTF8 for official use in java.lang
|
|
|
|
- return new String(bytes, "UTF8");
|
|
|
|
- }
|
|
|
|
- else {
|
|
|
|
- logger.log(
|
|
|
|
- Level.WARNING,
|
|
|
|
- "Your export has been saved with an incorrect encoding for its String fields which means it might not load correctly " +
|
|
|
|
- "due to encoding issues. You should probably re-export your work. See ISSUE 276 in the jME issue tracker."
|
|
|
|
- );
|
|
|
|
- // We use ISO8859_1 to be consistent across platforms. We could default to native encoding, but this would lead to inconsistent
|
|
|
|
- // behaviour across platforms!
|
|
|
|
- // Developers that have previously saved their exports using the old exporter (which uses native encoding), can temporarly
|
|
|
|
- // remove the ""ISO8859_1" parameter, and change the above if statement to "if (false)".
|
|
|
|
- // They should then import and re-export their models using the same environment they were originally created in.
|
|
|
|
- return new String(bytes, "ISO8859_1");
|
|
|
|
- }
|
|
|
|
- } catch (UnsupportedEncodingException uee) {
|
|
|
|
- // as a last resort fall back to platform native.
|
|
|
|
- // JavaDoc is vague about what happens when a decoding a String that contains un undecodable sequence
|
|
|
|
- // it also doesn't specify which encodings have to be supported (though UTF-8 and ISO8859 have been in the SUN JRE since at least 1.1)
|
|
|
|
- logger.log(
|
|
|
|
- Level.SEVERE,
|
|
|
|
- "Your export has been saved with an incorrect encoding or your version of Java is unable to decode the stored string. " +
|
|
|
|
- "While your export may load correctly by falling back, using it on different platforms or java versions might lead to "+
|
|
|
|
- "very strange inconsitenties. You should probably re-export your work. See ISSUE 276 in the jME issue tracker."
|
|
|
|
- );
|
|
|
|
- return new String(bytes);
|
|
|
|
- }
|
|
|
|
|
|
+ return new String(bytes, StandardCharsets.UTF_8);
|
|
}
|
|
}
|
|
|
|
|
|
protected String[] readStringArray(byte[] content) throws IOException {
|
|
protected String[] readStringArray(byte[] content) throws IOException {
|
|
@@ -1418,4 +1306,4 @@ final class BinaryInputCapsule implements InputCapsule {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-}
|
|
|
|
|
|
+}
|