5 years ago · d4a7ad7698
--- a/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryInputCapsule.java
+++ b/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryInputCapsule.java
@@ -37,11 +37,11 @@ import com.jme3.export.SavableClassUtil;
 
															 import com.jme3.util.BufferUtils;
														
 
															 import com.jme3.util.IntMap;
														
 
															 import java.io.IOException;
														
 
															-import java.io.UnsupportedEncodingException;
														
 
															 import java.nio.ByteBuffer;
														
 
															 import java.nio.FloatBuffer;
														
 
															 import java.nio.IntBuffer;
														
 
															 import java.nio.ShortBuffer;
														
 
															+import java.nio.charset.StandardCharsets;
														
 
															 import java.util.ArrayList;
														
 
															 import java.util.BitSet;
														
 
															 import java.util.HashMap;
														
@@ -1013,129 +1013,17 @@ final class BinaryInputCapsule implements InputCapsule {
 
															         return value;
														
 
															     }
														
 
															-    /*
														
 
															-     * UTF-8 crash course:
														
 
															-     *
														
 
															-     * UTF-8 codepoints map to UTF-16 codepoints and vv, which is what Java uses for its Strings.
														
 
															-     * (so a UTF-8 codepoint can contain all possible values for a Java char)
														
 
															-     *
														
 
															-     * A UTF-8 codepoint can be 1, 2 or 3 bytes long. How long a codepint is can be told by reading the first byte:
														
 
															-     * b < 0x80, 1 byte
														
 
															-     * (b & 0xC0) == 0xC0, 2 bytes
														
 
															-     * (b & 0xE0) == 0xE0, 3 bytes
														
 
															-     *
														
 
															-     * However there is an additional restriction to UTF-8, to enable you to find the start of a UTF-8 codepoint,
														
 
															-     * if you start reading at a random point in a UTF-8 byte stream. That's why UTF-8 requires for the second and third byte of
														
 
															-     * a multibyte codepoint:
														
 
															-     * (b & 0x80) == 0x80  (in other words, first bit must be 1)
														
 
															-     */
														
 
															-    private final static int UTF8_START = 0; // next byte should be the start of a new
														
 
															-    private final static int UTF8_2BYTE = 2; // next byte should be the second byte of a 2 byte codepoint
														
 
															-    private final static int UTF8_3BYTE_1 = 3; // next byte should be the second byte of a 3 byte codepoint
														
 
															-    private final static int UTF8_3BYTE_2 = 4; // next byte should be the third byte of a 3 byte codepoint
														
 
															-    private final static int UTF8_ILLEGAL = 10; // not an UTF8 string
														
 
															-
														
 
															-    // String
														
 
															     protected String readString(byte[] content) throws IOException {
														
 
															         int length = readInt(content);
														
 
															         if (length == BinaryOutputCapsule.NULL_OBJECT)
														
 
															             return null;
														
 
															-        /*
														
 
															-         * @see ISSUE 276
														
 
															-         *
														
 
															-         * We'll transfer the bytes into a separate byte array.
														
 
															-         * While we do that we'll take the opportunity to check if the byte data is valid UTF-8.
														
 
															-         *
														
 
															-         * If it is not UTF-8 it is most likely saved with the BinaryOutputCapsule bug, that saves Strings using their native
														
 
															-         * encoding. Unfortunatly there is no way to know what encoding was used, so we'll parse using the most common one in
														
 
															-         * that case; latin-1 aka ISO8859_1
														
 
															-         *
														
 
															-         * Encoding of "low" ASCII codepoint (in plain speak: when no special characters are used) will usually look the same
														
 
															-         * for UTF-8 and the other 1 byte codepoint encodings (espc true for numbers and regular letters of the alphabet). So these
														
 
															-         * are valid UTF-8 and will give the same result (at most a few charakters will appear different, such as the euro sign).
														
 
															-         *
														
 
															-         * However, when "high" codepoints are used (any codepoint that over 0x7F, in other words where the first bit is a 1) it's
														
 
															-         * a different matter and UTF-8 and the 1 byte encoding greatly will differ, as well as most 1 byte encodings relative to each
														
 
															-         * other.
														
 
															-         *
														
 
															-         * It is impossible to detect which one-byte encoding is used. Since UTF8 and practically all 1-byte encodings share the most
														
 
															-         * used characters (the "none-high" ones) parsing them will give the same result. However, not all byte sequences are legal in
														
 
															-         * UTF-8 (see explantion above). If not UTF-8 encoded content is detected we therefore fall back on latin1. We also log a warning.
														
 
															-         *
														
 
															-         * By this method we detect all use of 1 byte encoding if they:
														
 
															-         * - use a "high" codepoint after a "low" codepoint or a sequence of codepoints that is valid as UTF-8 bytes, that starts with 1000
														
 
															-         * - use a "low" codepoint after a "high" codepoint
														
 
															-         * - use a "low" codepoint after "high" codepoint, after a "high" codepoint that starts with 1110
														
 
															-         *
														
 
															-         *  In practise this means that unless 2 or 3 "high" codepoints are used after each other in proper order, we'll detect the string
														
 
															-         *  was not originally UTF-8 encoded.
														
 
															-         *
														
 
															-         */
														
 
															         byte[] bytes = new byte[length];
														
 
															-        int utf8State = UTF8_START;
														
 
															-        int b;
														
 
															         for (int x = 0; x < length; x++) {
														
 
															             bytes[x] =  content[index++];
														
 
															-            b = (int) bytes[x] & 0xFF; // unsign our byte
														
 
															-
														
 
															-            switch (utf8State) {
														
 
															-            case UTF8_START:
														
 
															-                if (b < 0x80) {
														
 
															-                    // good
														
 
															-                }
														
 
															-                else if ((b & 0xC0) == 0xC0) {
														
 
															-                    utf8State = UTF8_2BYTE;
														
 
															-                }
														
 
															-                else if ((b & 0xE0) == 0xE0) {
														
 
															-                    utf8State = UTF8_3BYTE_1;
														
 
															-                }
														
 
															-                else {
														
 
															-                    utf8State = UTF8_ILLEGAL;
														
 
															-                }
														
 
															-                break;
														
 
															-            case UTF8_3BYTE_1:
														
 
															-            case UTF8_3BYTE_2:
														
 
															-            case UTF8_2BYTE:
														
 
															-                 if ((b & 0x80) == 0x80)
														
 
															-                    utf8State = utf8State == UTF8_3BYTE_1 ? UTF8_3BYTE_2 : UTF8_START;
														
 
															-                 else
														
 
															-                    utf8State = UTF8_ILLEGAL;
														
 
															-                break;
														
 
															-            }
														
 
															         }
														
 
															-        try {
														
 
															-            // even though so far the parsing might have been a legal UTF-8 sequence, only if a codepoint is fully given is it correct UTF-8
														
 
															-            if (utf8State == UTF8_START) {
														
 
															-                // Java misspells UTF-8 as UTF8 for official use in java.lang
														
 
															-                return new String(bytes, "UTF8");
														
 
															-            }
														
 
															-            else {
														
 
															-                logger.log(
														
 
															-                        Level.WARNING,
														
 
															-                        "Your export has been saved with an incorrect encoding for its String fields which means it might not load correctly " +
														
 
															-                        "due to encoding issues. You should probably re-export your work. See ISSUE 276 in the jME issue tracker."
														
 
															-                );
														
 
															-                // We use ISO8859_1 to be consistent across platforms. We could default to native encoding, but this would lead to inconsistent
														
 
															-                // behaviour across platforms!
														
 
															-                // Developers that have previously saved their exports using the old exporter (which uses native encoding), can temporarly
														
 
															-                // remove the ""ISO8859_1" parameter, and change the above if statement to "if (false)".
														
 
															-                // They should then import and re-export their models using the same environment they were originally created in.
														
 
															-                return new String(bytes, "ISO8859_1");
														
 
															-            }
														
 
															-        } catch (UnsupportedEncodingException uee) {
														
 
															-            // as a last resort fall back to platform native.
														
 
															-            // JavaDoc is vague about what happens when a decoding a String that contains un undecodable sequence
														
 
															-            // it also doesn't specify which encodings have to be supported (though UTF-8 and ISO8859 have been in the SUN JRE since at least 1.1)
														
 
															-            logger.log(
														
 
															-                    Level.SEVERE,
														
 
															-                    "Your export has been saved with an incorrect encoding or your version of Java is unable to decode the stored string. " +
														
 
															-                    "While your export may load correctly by falling back, using it on different platforms or java versions might lead to "+
														
 
															-                    "very strange inconsitenties. You should probably re-export your work. See ISSUE 276 in the jME issue tracker."
														
 
															-            );
														
 
															-            return new String(bytes);
														
 
															-        }
														
 
															+        return new String(bytes, StandardCharsets.UTF_8);
														
 
															     }
														
 
															     protected String[] readStringArray(byte[] content) throws IOException {
														
@@ -1418,4 +1306,4 @@ final class BinaryInputCapsule implements InputCapsule {
 
															             return null;
														
 
															         }
														
 
															     }
														
 
															-}
														
 
															+}
														
--- a/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryOutputCapsule.java
+++ b/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryOutputCapsule.java
@@ -41,6 +41,7 @@ import java.nio.ByteBuffer;
 
															 import java.nio.FloatBuffer;
														
 
															 import java.nio.IntBuffer;
														
 
															 import java.nio.ShortBuffer;
														
 
															+import java.nio.charset.StandardCharsets;
														
 
															 import java.util.ArrayList;
														
 
															 import java.util.Arrays;
														
 
															 import java.util.BitSet;
														
@@ -684,8 +685,7 @@ final class BinaryOutputCapsule implements OutputCapsule {
 
															             write(NULL_OBJECT);
														
 
															             return;
														
 
															         }
														
 
															-        // write our output as UTF-8. Java misspells UTF-8 as UTF8 for official use in java.lang
														
 
															-        byte[] bytes = value.getBytes("UTF8");
														
 
															+        byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
														
 
															         write(bytes.length);
														
 
															         baos.write(bytes);
														
 
															     }