
Merged in AuahDark/love-android-sdl2/clean-include (pull request #21)

Clean external dependency include paths from LOVE Android.mk
Miku AuahDark 6 years ago
commit 8d71234114
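The substance of the change: each bundled dependency's Android.mk now advertises its public header directory through LOCAL_EXPORT_C_INCLUDES, which ndk-build automatically appends to the include path of any module that names that dependency in LOCAL_STATIC_LIBRARIES. Consumers such as love's own Android.mk can therefore drop their hard-coded ../libfoo/include paths. A minimal sketch of the consuming side (the module and source file names are hypothetical, not part of this commit):

LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
# Hypothetical consumer module, for illustration only.
LOCAL_MODULE := example_consumer
LOCAL_SRC_FILES := example.c  # can #include <ogg/ogg.h> with no extra flags
# No libogg entry in LOCAL_C_INCLUDES is needed: ndk-build pulls in the
# directories exported by libogg's LOCAL_EXPORT_C_INCLUDES via this edge.
LOCAL_STATIC_LIBRARIES := libogg
include $(BUILD_STATIC_LIBRARY)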
65 changed files with 10027 additions and 2082 deletions
  1. +1 -0 love/src/jni/LuaJIT-2.1/Android.mk
  2. +3 -2 love/src/jni/libmodplug-0.8.8.4/Android.mk
  3. +0 -0 love/src/jni/libmodplug-0.8.8.4/config.h (moved from src/config.h)
  4. +4 -6 love/src/jni/libogg-1.3.2/Android.mk
  5. +45 -15 love/src/jni/libtheora-1.2.0alpha1/Android.mk
  6. +3 -3 love/src/jni/libtheora-1.2.0alpha1/config.h
  7. +31 -46 love/src/jni/libtheora-1.2.0alpha1/include/theora/codec.h
  8. +55 -57 love/src/jni/libtheora-1.2.0alpha1/include/theora/theora.h
  9. +2 -5 love/src/jni/libtheora-1.2.0alpha1/include/theora/theoradec.h
  10. +9 -17 love/src/jni/libtheora-1.2.0alpha1/include/theora/theoraenc.h
  11. +98 -161 love/src/jni/libtheora-1.2.0alpha1/lib/analyze.c
  12. +271 -0 love/src/jni/libtheora-1.2.0alpha1/lib/arm/arm2gnu.pl
  13. +227 -0 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armbits.s
  14. +1 -1 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armcpu.c
  15. +645 -0 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armfrag.s
  16. +1876 -0 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armidct.s
  17. +664 -0 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armloop.s
  18. +39 -0 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armopts.s.in
  19. +27 -15 love/src/jni/libtheora-1.2.0alpha1/lib/arm/armstate.c
  20. +0 -1 love/src/jni/libtheora-1.2.0alpha1/lib/bitpack.h
  21. +97 -137 love/src/jni/libtheora-1.2.0alpha1/lib/collect.c
  22. +4 -7 love/src/jni/libtheora-1.2.0alpha1/lib/collect.h
  23. +1 -1 love/src/jni/libtheora-1.2.0alpha1/lib/decint.h
  24. +115 -765 love/src/jni/libtheora-1.2.0alpha1/lib/decode.c
  25. +19 -40 love/src/jni/libtheora-1.2.0alpha1/lib/encfrag.c
  26. +22 -42 love/src/jni/libtheora-1.2.0alpha1/lib/encint.h
  27. +1 -18 love/src/jni/libtheora-1.2.0alpha1/lib/encode.c
  28. +1 -1 love/src/jni/libtheora-1.2.0alpha1/lib/enquant.c
  29. +1 -1 love/src/jni/libtheora-1.2.0alpha1/lib/fdct.c
  30. +21 -11 love/src/jni/libtheora-1.2.0alpha1/lib/huffdec.c
  31. +4 -1 love/src/jni/libtheora-1.2.0alpha1/lib/huffenc.c
  32. +1 -1 love/src/jni/libtheora-1.2.0alpha1/lib/huffman.h
  33. +5 -6 love/src/jni/libtheora-1.2.0alpha1/lib/idct.c
  34. +4 -4 love/src/jni/libtheora-1.2.0alpha1/lib/info.c
  35. +3 -1 love/src/jni/libtheora-1.2.0alpha1/lib/internal.c
  36. +5 -12 love/src/jni/libtheora-1.2.0alpha1/lib/internal.h
  37. +1 -1 love/src/jni/libtheora-1.2.0alpha1/lib/mathops.c
  38. +21 -19 love/src/jni/libtheora-1.2.0alpha1/lib/mathops.h
  39. +53 -68 love/src/jni/libtheora-1.2.0alpha1/lib/mcenc.c
  40. +1 -503 love/src/jni/libtheora-1.2.0alpha1/lib/modedec.h
  41. +2 -11 love/src/jni/libtheora-1.2.0alpha1/lib/rate.c
  42. +26 -41 love/src/jni/libtheora-1.2.0alpha1/lib/state.c
  43. +4 -12 love/src/jni/libtheora-1.2.0alpha1/lib/state.h
  44. +40 -31 love/src/jni/libtheora-1.2.0alpha1/lib/tokenize.c
  45. +904 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxencfrag.c
  46. +665 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxfdct.c
  47. +368 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxfrag.c
  48. +562 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxidct.c
  49. +318 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxloop.h
  50. +228 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxstate.c
  51. +498 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2encfrag.c
  52. +449 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2fdct.c
  53. +460 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2idct.c
  54. +243 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2trans.h
  55. +182 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86cpu.c
  56. +36 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86cpu.h
  57. +61 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86enc.c
  58. +114 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86enc.h
  59. +257 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86enquant.c
  60. +124 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86int.h
  61. +95 -0 love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86state.c
  62. +5 -4 love/src/jni/libvorbis-1.3.5/Android.mk
  63. +2 -11 love/src/jni/love/Android.mk
  64. +2 -3 love/src/jni/mpg123-1.17.0/Android.mk
  65. +1 -1 love/src/jni/openal-soft/Android.mk

+ 1 - 0
love/src/jni/LuaJIT-2.1/Android.mk

@@ -4,5 +4,6 @@ include $(CLEAR_VARS)
 
 LOCAL_MODULE := libluajit
 LOCAL_SRC_FILES := android/$(TARGET_ARCH_ABI)/libluajit.a
+LOCAL_EXPORT_C_INCLUDES := ${LOCAL_PATH}/src
 
 include $(PREBUILT_STATIC_LIBRARY)

+ 3 - 2
love/src/jni/libmodplug-0.8.8.4/Android.mk

@@ -7,10 +7,11 @@ LOCAL_MODULE    := libmodplug
 LOCAL_CFLAGS    := -fexceptions -g -Dlinux -DHAVE_GCC_DESTRUCTOR=1 -DOPT_GENERIC -DREAL_IS_FLOAT
 LOCAL_CPPFLAGS  := ${LOCAL_CFLAGS}
 
-LOCAL_C_INCLUDES  :=  \
+LOCAL_C_INCLUDES := \
+	${LOCAL_PATH} \
 	${LOCAL_PATH}/src \
 	${LOCAL_PATH}/src/libmodplug
-		
+LOCAL_EXPORT_C_INCLUDES := ${LOCAL_PATH}/src
 LOCAL_SRC_FILES := \
 	$(filter-out \
   , $(subst $(LOCAL_PATH)/,,\

+ 0 - 0
love/src/jni/libmodplug-0.8.8.4/src/config.h → love/src/jni/libmodplug-0.8.8.4/config.h


+ 4 - 6
love/src/jni/libogg-1.3.2/Android.mk

@@ -7,13 +7,11 @@ LOCAL_MODULE    := libogg
 LOCAL_CFLAGS    := -fexceptions -g -Dlinux -DHAVE_GCC_DESTRUCTOR=1 -DOPT_GENERIC -DREAL_IS_FLOAT
 LOCAL_CPPFLAGS  := ${LOCAL_CFLAGS}
 
-LOCAL_C_INCLUDES  :=  \
-	${LOCAL_PATH}/include
-		
+LOCAL_C_INCLUDES := ${LOCAL_PATH}/include
+LOCAL_EXPORT_C_INCLUDES := ${LOCAL_C_INCLUDES}
 LOCAL_SRC_FILES := \
-	$(subst $(LOCAL_PATH)/,, \
-	${LOCAL_PATH}/src/framing.c \
-	${LOCAL_PATH}/src/bitwise.c )
+	src/framing.c \
+	src/bitwise.c
 
 # $(info local includes $(LOCAL_C_INCLUDES))
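Two details in this hunk are worth noting. LOCAL_SRC_FILES entries are interpreted relative to LOCAL_PATH, so the old $(subst $(LOCAL_PATH)/,,...) indirection was redundant and could be dropped outright. Also, because ':=' is GNU Make's immediately-expanded assignment, the export line snapshots whatever LOCAL_C_INCLUDES holds at that point, as in this sketch (the += line is hypothetical, not from this commit):

LOCAL_C_INCLUDES := ${LOCAL_PATH}/include
LOCAL_EXPORT_C_INCLUDES := ${LOCAL_C_INCLUDES}
# The export now holds ${LOCAL_PATH}/include; a later hypothetical edit like
#   LOCAL_C_INCLUDES += ${LOCAL_PATH}/src
# would widen the module's private include path but not the exported one.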
 

+ 45 - 15
love/src/jni/libtheora-1.2.0alpha1/Android.mk

@@ -1,26 +1,56 @@
 LOCAL_PATH:= $(call my-dir)
-
-# libtheora
 include $(CLEAR_VARS)
 
-LOCAL_MODULE    := libtheora
-LOCAL_CFLAGS    := -fexceptions -g -Dlinux -DHAVE_GCC_DESTRUCTOR=1 -DOPT_GENERIC -DREAL_IS_FLOAT
-LOCAL_CPPFLAGS  := ${LOCAL_CFLAGS}
+LOCAL_MODULE := libtheora
+LOCAL_CFLAGS := -g -Wno-parentheses
 
-LOCAL_C_INCLUDES  :=  \
+LOCAL_C_INCLUDES := \
 	${LOCAL_PATH}/include \
-	${LOCAL_PATh}/lib/arm \
 	${LOCAL_PATh}/lib \
 	${LOCAL_PATH}/../libogg-1.3.2/include
-
-		
+LOCAL_EXPORT_C_INCLUDES := ${LOCAL_PATH}/include
 LOCAL_SRC_FILES := \
-	$(filter-out \
-	,$(subst $(LOCAL_PATH)/,,\
-	$(wildcard ${LOCAL_PATH}/lib/*.c) \
-  ))
+	lib/apiwrapper.c \
+	lib/bitpack.c \
+	lib/decapiwrapper.c \
+	lib/decinfo.c \
+	lib/decode.c \
+	lib/dequant.c \
+	lib/fragment.c \
+	lib/huffdec.c \
+	lib/idct.c \
+	lib/info.c \
+	lib/internal.c \
+	lib/quant.c \
+	lib/state.c \
+	lib/encoder_disabled.c
 
-# $(info local includes $(LOCAL_C_INCLUDES))
+LOCAL_ARM_NEON := true
 
-include $(BUILD_STATIC_LIBRARY)
+# Target conditionals
+# It seems unfortunate that ARM codepath can't be used in here.
+ifeq ($(TARGET_ARCH_ABI),x86)
+	# Defines for x86. UNTESTED!
+	LOCAL_CFLAGS += -DOC_X86_ASM
+	LOCAL_SRC_FILES += \
+		lib/x86/x86cpu.c \
+		lib/x86/mmxidct.c \
+		lib/x86/mmxfrag.c \
+		lib/x86/mmxstate.c \
+		lib/x86/sse2idct.c \
+		lib/x86/x86state.c
+else ifeq ($(TARGET_ARCH_ABI),x86_64)
+	# Defines for x86-64. UNTESTED!
+	LOCAL_CFLAGS += -DOC_X86_ASM -DOC_X86_64_ASM
+	LOCAL_SRC_FILES += \
+		lib/x86/x86cpu.c \
+		lib/x86/mmxidct.c \
+		lib/x86/mmxfrag.c \
+		lib/x86/mmxstate.c \
+		lib/x86/sse2idct.c \
+		lib/x86/x86state.c
+endif
 
+LOCAL_STATIC_LIBRARIES := libogg
+
+include $(BUILD_STATIC_LIBRARY)
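ndk-build evaluates each Android.mk once per target ABI, with TARGET_ARCH_ABI set for that pass, so the ifeq/else-ifeq chain above adds the MMX/SSE2 sources and OC_X86_ASM defines only on the x86 and x86_64 passes; LOCAL_ARM_NEON is ignored on non-ARM ABIs. A sketch of the Application.mk that would drive those passes (assumed for illustration, not part of this commit):

# jni/Application.mk (assumption, for illustration)
APP_ABI := armeabi-v7a arm64-v8a x86 x86_64  # one Android.mk pass per ABI
APP_OPTIM := release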

+ 3 - 3
love/src/jni/libtheora-1.2.0alpha1/config.h

@@ -48,13 +48,13 @@
 #define LT_OBJDIR ".libs/"
 
 /* make use of arm asm optimization */
-#define OC_ARM_ASM /**/
+/* #undef OC_ARM_ASM */
 
 /* Define if assembler supports EDSP instructions */
-#define OC_ARM_ASM_EDSP 1
+/* #undef OC_ARM_ASM_EDSP 1 */
 
 /* Define if assembler supports ARMv6 media instructions */
-#define OC_ARM_ASM_MEDIA 1
+/* #undef OC_ARM_ASM_MEDIA 1 */
 
 /* Define if compiler supports NEON instructions */
 /* #undef OC_ARM_ASM_NEON */

+ 31 - 46
love/src/jni/libtheora-1.2.0alpha1/include/theora/codec.h

@@ -16,12 +16,11 @@
  ********************************************************************/
 
 /**\mainpage
- *
+ * 
  * \section intro Introduction
  *
- * This is the documentation for the <tt>libtheora</tt> C API.
- *
- * The \c libtheora package is the current reference
+ * This is the documentation for <tt>libtheora</tt> C API.
+ * The current reference
  * implementation for <a href="http://www.theora.org/">Theora</a>, a free,
  * patent-unencumbered video codec.
  * Theora is derived from On2's VP3 codec with additional features and
@@ -31,31 +30,29 @@
  * <a href="http://www.theora.org/doc/Theora.pdf">the Theora
  *  specification</a>.
  *
- * \section Organization
+ * \subsection Organization
  *
- * The functions documented here are divided between two
+ * The functions documented here are actually subdivided into three 
  * separate libraries:
- * - \c libtheoraenc contains the encoder interface,
+ * - <tt>libtheoraenc</tt> contains the encoder interface,
  *   described in \ref encfuncs.
- * - \c libtheoradec contains the decoder interface,
- *   described in \ref decfuncs, \n
- *   and additional \ref basefuncs.
- *
- * New code should link to \c libtheoradec. If using encoder
- * features, it must also link to \c libtheoraenc.
+ * - <tt>libtheoradec</tt> contains the decoder interface and
+ *   routines shared with the encoder.
+ *   You must also link to this if you link to <tt>libtheoraenc</tt>.
+ *   The routines in this library are described in \ref decfuncs and 
+ *   \ref basefuncs.
+ * - <tt>libtheora</tt> contains the \ref oldfuncs.
  *
- * During initial development, prior to the 1.0 release,
- * \c libtheora exported a different \ref oldfuncs which
- * combined both encode and decode functions.
- * In general, legacy API symbols can be indentified
- * by their \c theora_ or \c OC_ namespace prefixes.
- * The current API uses \c th_ or \c TH_ instead.
+ * New code should link to <tt>libtheoradec</tt> and, if using encoder
+ * features, <tt>libtheoraenc</tt>. Together these two export both
+ * the standard and the legacy API, so this is all that is needed by
+ * any code. The older <tt>libtheora</tt> library is provided just for
+ * compatibility with older build configurations.
  *
- * While deprecated, \c libtheoraenc and \c libtheoradec
- * together export the legacy api as well at the one documented above.
- * Likewise, the legacy \c libtheora included with this package
- * exports the new 1.x API. Older code and build scripts can therefore
- * but updated independently to the current scheme.
+ * In general the recommended 1.x API symbols can be distinguished
+ * by their <tt>th_</tt> or <tt>TH_</tt> namespace prefix.
+ * The older, legacy API uses <tt>theora_</tt> or <tt>OC_</tt>
+ * prefixes instead.
  */
 
 /**\file
@@ -171,7 +168,7 @@ typedef struct{
 typedef th_img_plane th_ycbcr_buffer[3];
 
 /**Theora bitstream information.
- * This contains the basic playback parameters for a stream, and corresponds to
+ * This contains the basic playback parameters for a stream, and corresponds to 
  *  the initial 'info' header packet.
  * To initialize an encoder, the application fills in this structure and
  *  passes it to th_encode_alloc().
@@ -320,7 +317,7 @@ typedef struct{
  * In filling in this structure, th_decode_headerin() will null-terminate
  *  the user_comment strings for safety.
  * However, the bitstream format itself treats them as 8-bit clean vectors,
- *  possibly containing null characters, so the length array should be
+ *  possibly containing null characters, and so the length array should be
  *  treated as their authoritative length.
  */
 typedef struct th_comment{
@@ -451,13 +448,7 @@ typedef struct{
 
 /**\defgroup basefuncs Functions Shared by Encode and Decode*/
 /*@{*/
-/**\name Basic shared functions
- * These functions return information about the library itself,
- * or provide high-level information about codec state
- * and packet type.
- *
- * You must link to \c libtheoradec if you use any of the
- * functions in this section.*/
+/**\name Basic shared functions*/
 /*@{*/
 /**Retrieves a human-readable string to identify the library vendor and
  *  version.
@@ -519,12 +510,7 @@ extern int th_packet_iskeyframe(ogg_packet *_op);
 /*@}*/
 
 
-/**\name Functions for manipulating header data
- * These functions manipulate the #th_info and #th_comment structures
- * which describe video parameters and key-value metadata, respectively.
- *
- * You must link to \c libtheoradec if you use any of the
- * functions in this section.*/
+/**\name Functions for manipulating header data*/
 /*@{*/
 /**Initializes a th_info structure.
  * This should be called on a freshly allocated #th_info structure before
@@ -551,7 +537,7 @@ extern void th_comment_init(th_comment *_tc);
  * \param _tc      The #th_comment struct to add the comment to.
  * \param _comment Must be a null-terminated UTF-8 string containing the
  *                  comment in "TAG=the value" form.*/
-extern void th_comment_add(th_comment *_tc,const char *_comment);
+extern void th_comment_add(th_comment *_tc, char *_comment);
 /**Add a comment to an initialized #th_comment structure.
  * \note Neither th_comment_add() nor th_comment_add_tag() support
  *  comments containing null values, although the bitstream format does
@@ -559,11 +545,10 @@ extern void th_comment_add(th_comment *_tc,const char *_comment);
  * To add such comments you will need to manipulate the #th_comment
  *  structure directly.
  * \param _tc  The #th_comment struct to add the comment to.
- * \param _tag A null-terminated string containing the tag associated with
+ * \param _tag A null-terminated string containing the tag  associated with
  *              the comment.
  * \param _val The corresponding value as a null-terminated string.*/
-extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
- const char *_val);
+extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val);
 /**Look up a comment value by its tag.
  * \param _tc    An initialized #th_comment structure.
  * \param _tag   The tag to look up.
@@ -579,15 +564,15 @@ extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
  *         It should not be modified or freed by the application, and
  *          modifications to the structure may invalidate the pointer.
  * \retval NULL If no matching tag is found.*/
-extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count);
+extern char *th_comment_query(th_comment *_tc,char *_tag,int _count);
 /**Look up the number of instances of a tag.
  * Call this first when querying for a specific tag and then iterate over the
  *  number of instances with separate calls to th_comment_query() to
  *  retrieve all the values for that tag in order.
  * \param _tc    An initialized #th_comment structure.
  * \param _tag   The tag to look up.
- * \return The number of instances of this particular tag.*/
-extern int th_comment_query_count(th_comment *_tc,const char *_tag);
+ * \return The number on instances of this particular tag.*/
+extern int th_comment_query_count(th_comment *_tc,char *_tag);
 /**Clears a #th_comment structure.
  * This should be called on a #th_comment structure after it is no longer
  *  needed.

+ 55 - 57
love/src/jni/libtheora-1.2.0alpha1/include/theora/theora.h

@@ -34,41 +34,41 @@ extern "C"
  *
  * \section intro Introduction
  *
- * This is the documentation for the libtheora legacy C API, declared in
+ * This is the documentation for the libtheora legacy C API, declared in 
  * the theora.h header, which describes the old interface used before
  * the 1.0 release. This API was widely deployed for several years and
- * remains supported, but for new code we recommend the cleaner API
+ * remains supported, but for new code we recommend the cleaner API 
  * declared in theoradec.h and theoraenc.h.
  *
  * libtheora is the reference implementation for
  * <a href="http://www.theora.org/">Theora</a>, a free video codec.
  * Theora is derived from On2's VP3 codec with improved integration with
  * Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
- *
+ * 
  * \section overview Overview
  *
- * This library will both decode and encode theora packets to/from raw YUV
+ * This library will both decode and encode theora packets to/from raw YUV 
  * frames.  In either case, the packets will most likely either come from or
- * need to be embedded in an Ogg stream.  Use
- * <a href="http://xiph.org/ogg/">libogg</a> or
+ * need to be embedded in an Ogg stream.  Use 
+ * <a href="http://xiph.org/ogg/">libogg</a> or 
  * <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
  * to extract/package these packets.
  *
  * \section decoding Decoding Process
  *
  * Decoding can be separated into the following steps:
- * -# initialise theora_info and theora_comment structures using
+ * -# initialise theora_info and theora_comment structures using 
  *    theora_info_init() and theora_comment_init():
  \verbatim
  theora_info     info;
  theora_comment  comment;
-
+   
  theora_info_init(&info);
  theora_comment_init(&comment);
  \endverbatim
- * -# retrieve header packets from Ogg stream (there should be 3) and decode
- *    into theora_info and theora_comment structures using
- *    theora_decode_header().  See \ref identification for more information on
+ * -# retrieve header packets from Ogg stream (there should be 3) and decode 
+ *    into theora_info and theora_comment structures using 
+ *    theora_decode_header().  See \ref identification for more information on 
  *    identifying which packets are theora packets.
  \verbatim
  int i;
@@ -79,14 +79,14 @@ extern "C"
  }
  \endverbatim
  * -# initialise the decoder based on the information retrieved into the
- *    theora_info struct by theora_decode_header().  You will need a
+ *    theora_info struct by theora_decode_header().  You will need a 
  *    theora_state struct.
  \verbatim
  theora_state state;
-
+ 
  theora_decode_init(&state, &info);
  \endverbatim
- * -# pass in packets and retrieve decoded frames!  See the yuv_buffer
+ * -# pass in packets and retrieve decoded frames!  See the yuv_buffer 
  *    documentation for information on how to retrieve raw YUV data.
  \verbatim
  yuf_buffer buffer;
@@ -96,20 +96,20 @@ extern "C"
    theora_decode_YUVout(&state, &buffer);
  }
  \endverbatim
- *
+ *  
  *
  * \subsection identification Identifying Theora Packets
  *
- * All streams inside an Ogg file have a unique serial_no attached to the
- * stream.  Typically, you will want to
- *  - retrieve the serial_no for each b_o_s (beginning of stream) page
- *    encountered within the Ogg file;
- *  - test the first (only) packet on that page to determine if it is a theora
+ * All streams inside an Ogg file have a unique serial_no attached to the 
+ * stream.  Typically, you will want to 
+ *  - retrieve the serial_no for each b_o_s (beginning of stream) page 
+ *    encountered within the Ogg file; 
+ *  - test the first (only) packet on that page to determine if it is a theora 
  *    packet;
- *  - once you have found a theora b_o_s page then use the retrieved serial_no
+ *  - once you have found a theora b_o_s page then use the retrieved serial_no 
  *    to identify future packets belonging to the same theora stream.
- *
- * Note that you \e cannot use theora_packet_isheader() to determine if a
+ * 
+ * Note that you \e cannot use theora_packet_isheader() to determine if a 
  * packet is a theora packet or not, as this function does not perform any
  * checking beyond whether a header bit is present.  Instead, use the
  * theora_decode_header() function and check the return value; or examine the
@@ -124,9 +124,9 @@ extern "C"
  * A YUV buffer for passing uncompressed frames to and from the codec.
  * This holds a Y'CbCr frame in planar format. The CbCr planes can be
  * subsampled and have their own separate dimensions and row stride
- * offsets. Note that the strides may be negative in some
+ * offsets. Note that the strides may be negative in some 
  * configurations. For theora the width and height of the largest plane
- * must be a multiple of 16. The actual meaningful picture size and
+ * must be a multiple of 16. The actual meaningful picture size and 
  * offset are stored in the theora_info structure; frames returned by
  * the decoder may need to be cropped for display.
  *
@@ -135,8 +135,8 @@ extern "C"
  * are ordered from left to right.
  *
  * During decode, the yuv_buffer struct is allocated by the user, but all
- * fields (including luma and chroma pointers) are filled by the library.
- * These pointers address library-internal memory and their contents should
+ * fields (including luma and chroma pointers) are filled by the library.  
+ * These pointers address library-internal memory and their contents should 
  * not be modified.
  *
  * Conversely, during encode the user allocates the struct and fills out all
@@ -186,7 +186,7 @@ typedef enum {
  * Theora bitstream info.
  * Contains the basic playback parameters for a stream,
  * corresponding to the initial 'info' header packet.
- *
+ * 
  * Encoded theora frames must be a multiple of 16 in width and height.
  * To handle other frame sizes, a crop rectangle is specified in
  * frame_height and frame_width, offset_x and * offset_y. The offset
@@ -198,10 +198,10 @@ typedef enum {
  * fraction. Aspect ratio is also stored as a rational fraction, and
  * refers to the aspect ratio of the frame pixels, not of the
  * overall frame itself.
- *
+ * 
  * See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
  * examples/encoder_example.c</a> for usage examples of the
- * other parameters and good default settings for the encoder parameters.
+ * other paramters and good default settings for the encoder parameters.
  */
 typedef struct {
   ogg_uint32_t  width;		/**< encoded frame width  */
@@ -253,14 +253,14 @@ typedef struct{
 
 } theora_state;
 
-/**
+/** 
  * Comment header metadata.
  *
  * This structure holds the in-stream metadata corresponding to
  * the 'comment' header packet.
  *
  * Meta data is stored as a series of (tag, value) pairs, in
- * length-encoded string vectors. The first occurence of the
+ * length-encoded string vectors. The first occurence of the 
  * '=' character delimits the tag and value. A particular tag
  * may occur more than once. The character set encoding for
  * the strings is always UTF-8, but the tag names are limited
@@ -285,7 +285,7 @@ typedef struct theora_comment{
 /* \anchor decctlcodes_old
  * These are the available request codes for theora_control()
  * when called with a decoder instance.
- * By convention decoder control codes are odd, to distinguish
+ * By convention decoder control codes are odd, to distinguish 
  * them from \ref encctlcodes_old "encoder control codes" which
  * are even.
  *
@@ -306,7 +306,7 @@ typedef struct theora_comment{
 #define TH_DECCTL_GET_PPLEVEL_MAX (1)
 
 /**Set the post-processing level.
- * Sets the level of post-processing to use when decoding the
+ * Sets the level of post-processing to use when decoding the 
  * compressed stream. This must be a value between zero (off)
  * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
  */
@@ -345,9 +345,9 @@ typedef struct theora_comment{
  * \param[in] buf #th_quant_info
  * \retval OC_FAULT  \a theora_state is <tt>NULL</tt>.
  * \retval OC_EINVAL Encoding has already begun, the quantization parameters
- *                    are not acceptable to this version of the encoder,
- *                    \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
- *                    or \a buf is non-<tt>NULL</tt> and \a buf_sz is
+ *                    are not acceptable to this version of the encoder, 
+ *                    \a buf is <tt>NULL</tt> and \a buf_sz is not zero, 
+ *                    or \a buf is non-<tt>NULL</tt> and \a buf_sz is 
  *                    not <tt>sizeof(#th_quant_info)</tt>.
  * \retval OC_IMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_QUANT_PARAMS (2)
@@ -424,7 +424,7 @@ typedef struct theora_comment{
 #define OC_NEWPACKET   -25      /**< Packet is an (ignorable) unhandled extension */
 #define OC_DUPFRAME    1        /**< Packet is a dropped frame */
 
-/**
+/** 
  * Retrieve a human-readable string to identify the encoder vendor and version.
  * \returns A version string.
  */
@@ -462,7 +462,7 @@ extern int theora_encode_init(theora_state *th, theora_info *ti);
 extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
 
 /**
- * Request the next packet of encoded video.
+ * Request the next packet of encoded video. 
  * The encoded data is placed in a user-provided ogg_packet structure.
  * \param t A theora_state handle previously initialized for encoding.
  * \param last_p whether this is the last packet the encoder should produce.
@@ -496,11 +496,7 @@ extern int theora_encode_header(theora_state *t, ogg_packet *op);
  * \param op An ogg_packet structure to fill. libtheora will set all
  *           elements of this structure, including a pointer to the encoded
  *           comment data. The memory for the comment data is owned by
- *           the application, and must be freed by it using _ogg_free().
- *           On some systems (such as Windows when using dynamic linking), this
- *           may mean the free is executed in a different module from the
- *           malloc, which will crash; there is no way to free this memory on
- *           such systems.
+ *           libtheora.
  * \retval 0 Success
  */
 extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
@@ -585,8 +581,8 @@ extern int theora_decode_packetin(theora_state *th,ogg_packet *op);
  * \param th A theora_state handle previously initialized for decoding.
  * \param yuv A yuv_buffer in which libtheora should place the decoded data.
  *            Note that the buffer struct itself is allocated by the user, but
- *            that the luma and chroma pointers will be filled in by the
- *            library.  Also note that these luma and chroma regions should be
+ *            that the luma and chroma pointers will be filled in by the 
+ *            library.  Also note that these luma and chroma regions should be 
  *            considered read-only by the user.
  * \retval 0 Success
  */
@@ -621,22 +617,22 @@ extern int theora_packet_iskeyframe(ogg_packet *op);
 /**
  * Report the granulepos shift radix
  *
- * When embedded in Ogg, Theora uses a two-part granulepos,
+ * When embedded in Ogg, Theora uses a two-part granulepos, 
  * splitting the 64-bit field into two pieces. The more-significant
  * section represents the frame count at the last keyframe,
  * and the less-significant section represents the count of
  * frames since the last keyframe. In this way the overall
  * field is still non-decreasing with time, but usefully encodes
  * a pointer to the last keyframe, which is necessary for
- * correctly restarting decode after a seek.
+ * correctly restarting decode after a seek. 
  *
  * This function reports the number of bits used to represent
  * the distance to the last keyframe, and thus how the granulepos
  * field must be shifted or masked to obtain the two parts.
- *
+ * 
  * Since libtheora returns compressed data in an ogg_packet
  * structure, this may be generally useful even if the Theora
- * packets are not being used in an Ogg container.
+ * packets are not being used in an Ogg container. 
  *
  * \param ti A previously initialized theora_info struct
  * \returns The bit shift dividing the two granulepos fields
@@ -648,7 +644,7 @@ int theora_granule_shift(theora_info *ti);
 /**
  * Convert a granulepos to an absolute frame index, starting at 0.
  * The granulepos is interpreted in the context of a given theora_state handle.
- *
+ * 
  * Note that while the granulepos encodes the frame count (i.e. starting
  * from 1) this call returns the frame index, starting from zero. Thus
  * One can calculate the presentation time by multiplying the index by
@@ -674,7 +670,9 @@ extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos)
  *          This is the "end time" for the frame, or the latest time it should
  *           be displayed.
  *          It is not the presentation time.
- * \retval -1. The given granulepos is undefined (i.e. negative).
+ * \retval -1. The given granulepos is undefined (i.e. negative), or
+ * \retval -1. The function has been disabled because floating 
+ *              point support is not available.
  */
 extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
 
@@ -701,7 +699,7 @@ extern void theora_clear(theora_state *t);
 
 /**
  * Initialize an allocated theora_comment structure
- * \param tc An allocated theora_comment structure
+ * \param tc An allocated theora_comment structure 
  **/
 extern void theora_comment_init(theora_comment *tc);
 
@@ -722,7 +720,7 @@ extern void theora_comment_add(theora_comment *tc, char *comment);
 /**
  * Add a comment to an initialized theora_comment structure.
  * \param tc A previously initialized theora comment structure
- * \param tag A null-terminated string containing the tag
+ * \param tag A null-terminated string containing the tag 
  *            associated with the comment.
  * \param value The corresponding value as a null-terminated string
  *
@@ -754,9 +752,9 @@ extern char *theora_comment_query(theora_comment *tc, char *tag, int count);
  *  \param tc An initialized theora_comment structure
  *  \param tag The tag to look up
  *  \returns The number on instances of a particular tag.
- *
+ * 
  *  Call this first when querying for a specific tag and then interate
- *  over the number of instances with separate calls to
+ *  over the number of instances with separate calls to 
  *  theora_comment_query() to retrieve all instances in order.
  **/
 extern int   theora_comment_query_count(theora_comment *tc, char *tag);
@@ -771,7 +769,7 @@ extern void  theora_comment_clear(theora_comment *tc);
  * This is used to provide advanced control the encoding process.
  * \param th     A #theora_state handle.
  * \param req    The control code to process.
- *                See \ref encctlcodes_old "the list of available
+ *                See \ref encctlcodes_old "the list of available 
  *			control codes" for details.
  * \param buf    The parameters for this control code.
  * \param buf_sz The size of the parameter buffer.*/

+ 2 - 5
love/src/jni/libtheora-1.2.0alpha1/include/theora/theoradec.h

@@ -171,7 +171,7 @@ typedef struct th_setup_info th_setup_info;
 /**\defgroup decfuncs Functions for Decoding*/
 /*@{*/
 /**\name Functions for decoding
- * You must link to <tt>libtheoradec</tt> if you use any of the
+ * You must link to <tt>libtheoradec</tt> if you use any of the 
  * functions in this section.
  *
  * The functions are listed in the order they are used in a typical decode.
@@ -267,10 +267,7 @@ extern void th_setup_free(th_setup_info *_setup);
  *                See \ref decctlcodes "the list of available control codes"
  *                 for details.
  * \param _buf    The parameters for this control code.
- * \param _buf_sz The size of the parameter buffer.
- * \return Possible return values depend on the control code used.
- *          See \ref decctlcodes "the list of control codes" for
- *          specific values. Generally 0 indicates success.*/
+ * \param _buf_sz The size of the parameter buffer.*/
 extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
  size_t _buf_sz);
 /**Submits a packet containing encoded video data to the decoder.

+ 9 - 17
love/src/jni/libtheora-1.2.0alpha1/include/theora/theoraenc.h

@@ -58,7 +58,7 @@ extern "C" {
  *
  * \param[in] _buf #th_quant_info
  * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
- * \retval TH_EINVAL Encoding has already begun, \a _buf is
+ * \retval TH_EINVAL Encoding has already begun, \a _buf is 
  *                    <tt>NULL</tt> and \a _buf_sz is not zero,
  *                    or \a _buf is non-<tt>NULL</tt> and
  *                    \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
@@ -330,11 +330,7 @@ extern "C" {
  * \retval 0             Success.
  * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
  * \retval TH_EINVAL     The target bitrate was not positive.
- *                       A future version of this library may allow passing 0
- *                        to disabled rate-controlled mode and return to a
- *                        quality-based mode, in which case this function will
- *                        not return an error for that value.
- * \retval TH_EIMPL      Not supported by this implementation.*/
+ * \retval TH_EIMPL       Not supported by this implementation.*/
 #define TH_ENCCTL_SET_BITRATE (30)
 /**Sets the configuration to be compatible with that from the given setup
  *  header.
@@ -386,8 +382,7 @@ extern "C" {
 /*@{*/
 /**Drop frames to keep within bitrate buffer constraints.
  * This can have a severe impact on quality, but is the only way to ensure that
- *  bitrate targets are met at low rates during sudden bursts of activity.
- * It is enabled by default.*/
+ *  bitrate targets are met at low rates during sudden bursts of activity.*/
 #define TH_RATECTL_DROP_FRAMES   (0x1)
 /**Ignore bitrate buffer overflows.
  * If the encoder uses so few bits that the reservoir of available bits
@@ -395,14 +390,14 @@ extern "C" {
  * The encoder will not try to use these extra bits in future frames.
  * At high rates this may cause the result to be undersized, but allows a
  *  client to play the stream using a finite buffer; it should normally be
- *  enabled, which is the default.*/
+ *  enabled.*/
 #define TH_RATECTL_CAP_OVERFLOW  (0x2)
 /**Ignore bitrate buffer underflows.
  * If the encoder uses so many bits that the reservoir of available bits
  *  underflows, ignore the deficit.
  * The encoder will not try to make up these extra bits in future frames.
  * At low rates this may cause the result to be oversized; it should normally
- *  be disabled, which is the default.*/
+ *  be disabled.*/
 #define TH_RATECTL_CAP_UNDERFLOW (0x4)
 /*@}*/
 
@@ -446,8 +441,8 @@ typedef struct th_enc_ctx    th_enc_ctx;
  *    packets.
  * - For each uncompressed frame:
  *   - Submit the uncompressed frame via th_encode_ycbcr_in()
- *   - Repeatedly call th_encode_packetout() to retrieve any video
- *      data packets that are ready.
+ *   - Repeatedly call th_encode_packetout() to retrieve any video data packets
+ *      that are ready.
  * - Call th_encode_free() to release all encoder memory.*/
 /*@{*/
 /**Allocates an encoder instance.
@@ -462,10 +457,7 @@ extern th_enc_ctx *th_encode_alloc(const th_info *_info);
  *                See \ref encctlcodes "the list of available control codes"
  *                 for details.
  * \param _buf    The parameters for this control code.
- * \param _buf_sz The size of the parameter buffer.
- * \return Possible return values depend on the control code used.
- *          See \ref encctlcodes "the list of control codes" for
- *          specific values. Generally 0 indicates success.*/
+ * \param _buf_sz The size of the parameter buffer.*/
 extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
 /**Outputs the next header packet.
  * This should be called repeatedly after encoder initialization until it
@@ -500,7 +492,7 @@ extern int th_encode_flushheader(th_enc_ctx *_enc,
  *                picture offsets may require an unexpected chroma plane size,
  *                and their use is generally discouraged, as they will not be
  *                well-supported by players and other media frameworks.
- *               See Section 4.4 of
+ *               See Section 4.4 of 
  *                <a href="http://www.theora.org/doc/Theora.pdf">the Theora
  *                specification</a> for details if you wish to use them anyway.
  * \retval 0         Success.

+ 98 - 161
love/src/jni/libtheora-1.2.0alpha1/lib/analyze.c

@@ -572,8 +572,6 @@ static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
   flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
   _pipe->loop_filter=flimit!=0;
   if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
-  /*Clear the temporary DCT scratch space.*/
-  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
 }
 
 /*Sets the current MCU stripe to super block row _sby.
@@ -612,13 +610,14 @@ static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
 
 static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
+  int refi;
   /*Copy over all the uncoded fragments from this plane and advance the uncoded
      fragment list.*/
   if(_pipe->nuncoded_fragis[_pli]>0){
     _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
     oc_frag_copy_list(&_enc->state,
-     _enc->state.ref_frame_data[OC_FRAME_SELF],
-     _enc->state.ref_frame_data[OC_FRAME_PREV],
+     _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]],
+     _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]],
      _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
      _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
     _pipe->nuncoded_fragis[_pli]=0;
@@ -637,18 +636,17 @@ static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
   _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
   _pipe->ncoded_fragis[_pli]=0;
   /*Apply the loop filter if necessary.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
   if(_pipe->loop_filter){
-    oc_state_loop_filter_frag_rows(&_enc->state,
-     _pipe->bounding_values,OC_FRAME_SELF,_pli,
-     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+    oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
+     refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
   }
   else _sdelay=_edelay=0;
   /*To fill borders, we have an additional two pixel delay, since a fragment
      in the next row could filter its top edge, using two pixels from a
      fragment in this row.
     But there's no reason to delay a full fragment between the two.*/
-  oc_state_borders_fill_rows(&_enc->state,
-   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
+  oc_state_borders_fill_rows(&_enc->state,refi,_pli,
    (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
    (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
 }
@@ -669,9 +667,8 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
  unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
  oc_fr_state *_fr,oc_token_checkpoint **_stack){
-  ogg_int16_t            *data;
   ogg_int16_t            *dct;
-  ogg_int16_t            *idct;
+  ogg_int16_t            *data;
   oc_qii_state            qs;
   const ogg_uint16_t     *dequant;
   ogg_uint16_t            dequant_dc;
@@ -686,7 +683,6 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
   oc_token_checkpoint    *checkpoint;
   oc_fragment            *frags;
   int                     mb_mode;
-  int                     refi;
   int                     mv_offs[2];
   int                     nmv_offs;
   int                     ac_bits;
@@ -699,29 +695,30 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   ystride=_enc->state.ref_ystride[_pli];
-  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]]
+   +frag_offs;
   borderi=frags[_fragi].borderi;
   qii=frags[_fragi].qii;
   data=_enc->pipe.dct_data;
   dct=data+64;
-  idct=data+128;
   if(qii&~3){
 #if !defined(OC_COLLECT_METRICS)
     if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
       /*Enable early skip detection.*/
       frags[_fragi].coded=0;
-      frags[_fragi].refi=OC_FRAME_NONE;
       oc_fr_skip_block(_fr);
       return 0;
     }
 #endif
     /*Try and code this block anyway.*/
     qii&=3;
+    frags[_fragi].qii=qii;
   }
-  refi=frags[_fragi].refi;
   mb_mode=frags[_fragi].mb_mode;
-  ref=_enc->state.ref_frame_data[refi]+frag_offs;
-  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
+  ref=_enc->state.ref_frame_data[
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
+  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
+   +frag_offs;
   /*Motion compensation:*/
   switch(mb_mode){
     case OC_MODE_INTRA:{
@@ -736,7 +733,7 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
     }break;
     default:{
       const oc_mv *frag_mvs;
-      frag_mvs=_enc->state.frag_mvs;
+      frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
       nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
        _pli,frag_mvs[_fragi]);
       if(nmv_offs>1){
@@ -749,25 +746,19 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
   }
 #if defined(OC_COLLECT_METRICS)
   {
-    unsigned sad;
     unsigned satd;
+    unsigned dc;
     switch(nmv_offs){
-      case 0:{
-        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
-        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
-      }break;
+      case 0:satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);break;
       case 1:{
-        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
         satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
-        satd+=abs(dc);
+        satd+=dc;
       }break;
       default:{
-        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
         satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
-        satd+=abs(dc);
+        satd+=dc;
       }break;
     }
-    _enc->frag_sad[_fragi]=sad;
     _enc->frag_satd[_fragi]=satd;
   }
 #endif
@@ -781,12 +772,12 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
   /*Tokenize.*/
   checkpoint=*_stack;
   if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
-    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
-     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+     _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
   }
   else{
-    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
-     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+     _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
   }
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
@@ -808,11 +799,9 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
     else if(qi01>=0)qii=0;
   }
   else{
-    idct[0]=dc*dequant_dc;
-    /*Note: This clears idct[] back to zero for the next block.*/
-    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
+    data[0]=dc*dequant_dc;
+    oc_idct8x8(&_enc->state,data,data,nonzero+1);
   }
-  frags[_fragi].qii=qii;
   if(nqis>1){
     oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
     ac_bits+=qs.bits-_pipe->qs[_pli].bits;
@@ -862,7 +851,6 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
         oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
         *_stack=checkpoint;
         frags[_fragi].coded=0;
-        frags[_fragi].refi=OC_FRAME_NONE;
         oc_fr_skip_block(_fr);
         return 0;
       }
@@ -899,7 +887,6 @@ static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
   oc_fr_state          fr_checkpoint;
   oc_qii_state         qs_checkpoint;
   int                  mb_mode;
-  int                  refi;
   int                  ncoded;
   ptrdiff_t            fragi;
   int                  bi;
@@ -913,13 +900,11 @@ static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
   uncoded_fragis=_pipe->uncoded_fragis[0];
   nuncoded_fragis=_pipe->nuncoded_fragis[0];
   mb_mode=mb_modes[_mbi];
-  refi=OC_FRAME_FOR_MODE(mb_mode);
   ncoded=0;
   stackptr=stack;
   memset(&mo,0,sizeof(mo));
   for(bi=0;bi<4;bi++){
     fragi=sb_maps[_mbi>>2][_mbi&3][bi];
-    frags[fragi].refi=refi;
     frags[fragi].mb_mode=mb_mode;
     if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
      _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
@@ -945,7 +930,6 @@ static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
         if(frags[fragi].coded){
           *(uncoded_fragis-++nuncoded_fragis)=fragi;
           frags[fragi].coded=0;
-          frags[fragi].refi=OC_FRAME_NONE;
         }
         oc_fr_skip_block(_pipe->fr+0);
       }
@@ -1067,11 +1051,6 @@ static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
  +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
 
 static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
-#if !defined(OC_COLLECT_METRICS)
-  const
-#endif
-  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
-   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
   int qii;
 #if defined(OC_COLLECT_METRICS)
   oc_enc_mode_metrics_load(_enc);
@@ -1098,15 +1077,15 @@ static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
         dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
         dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
         if(dq==0)dq=1;
-        for(bin=0;bin<OC_COMP_BINS;bin++){
+        for(bin=0;bin<OC_SAD_BINS;bin++){
           int y0;
           int z0;
           int dy;
           int dz;
-          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
-          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
-          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
-          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
+          y0=OC_MODE_RD[modeline][pli][qti][bin].rate;
+          z0=OC_MODE_RD[modeline][pli][qti][bin].rmse;
+          dy=OC_MODE_RD[modeline+1][pli][qti][bin].rate-y0;
+          dz=OC_MODE_RD[modeline+1][pli][qti][bin].rmse-z0;
           _enc->mode_rd[qii][pli][qti][bin].rate=
            (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
           _enc->mode_rd[qii][pli][qti][bin].rmse=
@@ -1122,7 +1101,6 @@ static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
 static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
  int _qii,int _pli,int _qti,int _satd){
   unsigned rmse;
-  int      shift;
   int      bin;
   int      dx;
   int      y0;
@@ -1132,16 +1110,15 @@ static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
   /*SATD metrics for chroma planes vary much less than luma, so we scale them
      by 4 to distribute them into the mode decision bins more evenly.*/
   _satd<<=_pli+1&2;
-  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
-  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
-  dx=_satd-(bin<<shift);
+  bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2);
+  dx=_satd-(bin<<OC_SAD_SHIFT);
   y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
   z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
   dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
   dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
-  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
+  rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0);
   *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
-  return OC_MAXI(y0+(dy*dx>>shift),0);
+  return OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0);
 }
 
 /*activity_avg must be positive, or flat regions could get a zero weight, which
@@ -1152,17 +1129,17 @@ static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
 
 static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
  unsigned _activity[4]){
-  const unsigned char *src;
-  const ptrdiff_t     *frag_buf_offs;
-  const ptrdiff_t     *sb_map;
-  unsigned             luma;
-  int                  ystride;
-  ptrdiff_t            frag_offs;
-  ptrdiff_t            fragi;
-  int                  bi;
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  unsigned               luma;
+  int                    ystride;
+  ptrdiff_t              frag_offs;
+  ptrdiff_t              fragi;
+  int                    bi;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
   ystride=_enc->state.ref_ystride[0];
   luma=0;
   for(bi=0;bi<4;bi++){
@@ -1237,8 +1214,8 @@ static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
   return luma;
 }
 
-static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
- unsigned _activity[4],const unsigned _intra_satd[12]){
+static unsigned oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4], unsigned _intra_satd[12]){
   int bi;
   for(bi=0;bi<4;bi++){
     unsigned act;
@@ -1249,6 +1226,9 @@ static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
     }
     _activity[bi]=act;
   }
+  /*TODO: Once frag_intra_satd returns the signed DC value instead
+     of the absolute value, this should pass it through.*/
+  return 1;
 }
 
 /*Compute the masking scales for the blocks in a macro block.
@@ -1358,7 +1338,7 @@ static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
   return activity_sum;
 }
 
-static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
  unsigned _frag_satd[12]){
   const unsigned char   *src;
   const ptrdiff_t       *frag_buf_offs;
@@ -1373,18 +1353,15 @@ static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
   int                    bi;
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
-  unsigned               luma;
-  int                    dc;
+  unsigned               dc;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
   ystride=_enc->state.ref_ystride[0];
-  luma=0;
   for(bi=0;bi<4;bi++){
     fragi=sb_map[bi];
     frag_offs=frag_buf_offs[fragi];
     _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
-    luma+=dc;
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
   map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
@@ -1399,7 +1376,6 @@ static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
     frag_offs=frag_buf_offs[fragi];
     _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
   }
-  return luma;
 }
 
 /*Select luma block-level quantizers for a MB in an INTRA frame.*/
@@ -1417,7 +1393,7 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
   unsigned             rate[4][3];
   int                  prev[3][3];
   unsigned             satd;
-  int                  dc;
+  unsigned             dc;
   unsigned             best_cost;
   unsigned             best_ssd;
   unsigned             best_rate;
@@ -1429,16 +1405,11 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
   int                  bi;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
   ystride=_enc->state.ref_ystride[0];
   fragi=sb_maps[_mbi>>2][_mbi&3][0];
   frag_offs=frag_buf_offs[fragi];
-  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
-  }
-  else{
-    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
-  }
+  satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
   nqis=_enc->state.nqis;
   lambda=_enc->lambda;
   for(qii=0;qii<nqis;qii++){
@@ -1451,12 +1422,7 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
   for(bi=1;bi<4;bi++){
     fragi=sb_maps[_mbi>>2][_mbi&3][bi];
     frag_offs=frag_buf_offs[fragi];
-    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
-    }
-    else{
-      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
-    }
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
     for(qii=0;qii<nqis;qii++){
       oc_qii_state qt[3];
       unsigned     cur_ssd;
@@ -1521,22 +1487,17 @@ static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
   oc_qii_state         qt[3];
   unsigned             cost[3];
   unsigned             satd;
-  int                  dc;
+  unsigned             dc;
   unsigned             best_cost;
   int                  best_qii;
   int                  qii;
   int                  lambda;
   int                  ystride;
   int                  nqis;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
   ystride=_enc->state.ref_ystride[_pli];
   frag_offs=_enc->state.frag_buf_offs[_fragi];
-  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
-  }
-  else{
-    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
-  }
+  satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
   /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
      worth spending the bits to change the AC quantizer.
     TODO: This may be worth revisiting when we separate out DC and AC
@@ -1576,20 +1537,23 @@ static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
   oc_token_checkpoint  stack[64*4];
   oc_token_checkpoint *stackptr;
   const oc_sb_map     *sb_maps;
+  signed char         *mb_modes;
   oc_fragment         *frags;
   ptrdiff_t           *coded_fragis;
   ptrdiff_t            ncoded_fragis;
+  int                  mb_mode;
   ptrdiff_t            fragi;
   int                  bi;
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_modes=_enc->state.mb_modes;
   frags=_enc->state.frags;
   coded_fragis=_pipe->coded_fragis[0];
   ncoded_fragis=_pipe->ncoded_fragis[0];
+  mb_mode=mb_modes[_mbi];
   stackptr=stack;
   for(bi=0;bi<4;bi++){
     fragi=sb_maps[_mbi>>2][_mbi&3][bi];
-    frags[fragi].refi=OC_FRAME_SELF;
-    frags[fragi].mb_mode=OC_MODE_INTRA;
+    frags[fragi].mb_mode=mb_mode;
     oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
      _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
     coded_fragis[ncoded_fragis++]=fragi;
@@ -1711,8 +1675,8 @@ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
         }
         else{
           unsigned intra_satd[12];
-          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
-          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+          oc_mb_intra_satd(_enc,mbi,intra_satd);
+          luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
           for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
         }
         activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
@@ -1737,7 +1701,6 @@ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
           pli=mapi>>2;
           bi=mapi&3;
           fragi=mb_maps[mbi][pli][bi];
-          frags[fragi].refi=OC_FRAME_SELF;
           frags[fragi].mb_mode=OC_MODE_INTRA;
         }
         /*Save masking scale factors for chroma blocks.*/
@@ -1866,7 +1829,7 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
         best_qii=qii;
       }
     }
-    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
+    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE)&&nskipped<3){
       *(ft+1)=*&fr;
       oc_fr_skip_block(ft+1);
       cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
@@ -1947,7 +1910,7 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
           best_qii=qii;
         }
       }
-      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
+      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE)){
         cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
         cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
         if(cur_cost<=best_cost){
@@ -1986,8 +1949,8 @@ static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
   int                    borderi;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
   ystride=_enc->state.ref_ystride[0];
   frags=_enc->state.frags;
   frag_buf_offs=_enc->state.frag_buf_offs;
@@ -2080,9 +2043,10 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   int                    bi;
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
-  int                    dc;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
+  unsigned               dc;
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
@@ -2091,30 +2055,18 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
     for(bi=0;bi<4;bi++){
       fragi=sb_map[bi];
       frag_offs=frag_buf_offs[fragi];
-      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
-        frag_satd[bi]+=abs(dc);
-      }
-      else{
-        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
-      }
+      frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+      frag_satd[bi]+=dc;
     }
   }
   else{
     for(bi=0;bi<4;bi++){
       fragi=sb_map[bi];
       frag_offs=frag_buf_offs[fragi];
-      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ystride);
-        frag_satd[bi]+=abs(dc);
-      }
-      else{
-        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ystride);
-      }
+      frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
+      frag_satd[bi]+=dc;
     }
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
@@ -2129,15 +2081,9 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
       bi=mapi&3;
       fragi=mb_map[pli][bi];
       frag_offs=frag_buf_offs[fragi];
-      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
-        frag_satd[mapii]+=abs(dc);
-      }
-      else{
-        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
-      }
+      frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+      frag_satd[mapii]+=dc;
     }
   }
   else{
@@ -2147,15 +2093,9 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
       bi=mapi&3;
       fragi=mb_map[pli][bi];
       frag_offs=frag_buf_offs[fragi];
-      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ystride);
-        frag_satd[mapii]+=abs(dc);
-      }
-      else{
-        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
-         ref+frag_offs+mv_offs[0],ystride);
-      }
+      frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
+      frag_satd[mapii]+=dc;
     }
   }
   oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
@@ -2215,9 +2155,9 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
   int                    bits0;
   int                    bits1;
   unsigned               satd;
-  int                    dc;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  unsigned               dc;
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   frag_mvs=_enc->state.frag_mvs;
@@ -2237,7 +2177,7 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
       satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ystride);
     }
-    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
+    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+dc;
   }
   oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
    _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
@@ -2275,7 +2215,7 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
       satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ystride);
     }
-    frag_satd[mapii]=satd+abs(dc);
+    frag_satd[mapii]=satd+dc;
   }
   oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
    frag_satd,_skip_ssd,_rd_scale[4],1);
@@ -2380,7 +2320,6 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
         int            mb_gmv_bits_0;
         int            inter_mv_pref;
         int            mb_mode;
-        int            refi;
         int            mv;
         unsigned       mbi;
         int            mapii;
@@ -2388,12 +2327,14 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
         int            bi;
         ptrdiff_t      fragi;
         mbi=sbi<<2|quadi;
-        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+        oc_mb_intra_satd(_enc,mbi,intra_satd);
         /*Activity masking.*/
         if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
-          oc_mb_activity(_enc,mbi,activity);
+          luma=oc_mb_activity(_enc,mbi,activity);
+        }
+        else{
+          luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
         }
-        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
         luma_sum+=luma;
         activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
          chroma_rd_scale,activity,activity_avg,luma,luma_avg);
@@ -2564,7 +2505,6 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
           int orig_mb_mode;
           orig_mb_mode=mb_mode;
           mb_mode=mb_modes[mbi];
-          refi=OC_FRAME_FOR_MODE(mb_mode);
           switch(mb_mode){
             case OC_MODE_INTER_MV:{
               prior_mv=last_mv;
@@ -2617,9 +2557,8 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
                 pli=mapi>>2;
                 bi=mapi&3;
                 fragi=mb_maps[mbi][pli][bi];
-                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
-                frags[fragi].refi=refi;
                 frags[fragi].mb_mode=mb_mode;
+                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
                 frag_mvs[fragi]=cbmvs[bi];
               }
             }break;
@@ -2631,7 +2570,6 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
         else{
           *(uncoded_mbis-++nuncoded_mbis)=mbi;
           mb_mode=OC_MODE_INTER_NOMV;
-          refi=OC_FRAME_PREV;
           mv=0;
         }
         /*Propagate final MB mode and MVs to the chroma blocks.
@@ -2643,12 +2581,11 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
             pli=mapi>>2;
             bi=mapi&3;
             fragi=mb_maps[mbi][pli][bi];
+            frags[fragi].mb_mode=mb_mode;
             /*If we switched from 4MV mode to INTER_MV mode, then the qii
                values won't have been chosen with the right MV, but it's
               probably not worth re-estimating them.*/
             frags[fragi].qii=modes[mb_mode].qii[mapii];
-            frags[fragi].refi=refi;
-            frags[fragi].mb_mode=mb_mode;
             frag_mvs[fragi]=mv;
           }
         }

+ 271 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/arm/arm2gnu.pl

@@ -0,0 +1,271 @@
+#!/usr/bin/perl
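+#
+# Convert ARM-syntax ("armasm") assembly, read from stdin or the named files,
+#  into GNU as syntax on stdout; -n suppresses the automatic printing of each
+#  converted line (as in sed -n).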
+
+my $bigend;  # little/big endian
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+    if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+    $_ = shift;
+    last if /^--/;
+    if (/^-n/) {
+        $nflag++;
+        next;
+    }
+    die "I don't recognize this switch: $_\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n";      # automatically add newline on print
+$n=0;
+
+$thumb = 0;     # ARM mode by default, not Thumb.
+
+LINE:
+while (<>) {
+
+    # For ADRLs we need to add a new line after the substituted one.
+    $addPadding = 0;
+
+    # First, we do not dare to touch *anything* inside double quotes, do we?
+    # Second, if you want a dollar character in the string,
+    # insert two of them -- that's how ARM C and assembler treat strings.
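+    # For instance, 'msg DCB "50$$"' becomes 'msg:   .ascii "50$"'.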
+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };
+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };
+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };
+    # If there's nothing on a line but a comment, don't try to apply any further
+    #  substitutions (this is a cheap hack to avoid mucking up the license header)
+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };
+    # If substituted -- leave immediately !
+
+    s/@/,:/;
+    s/;/@/;
+    while ( /@.*'/ ) {
+      s/(@.*)'/$1/g;
+    }
+    s/\{FALSE\}/0/g;
+    s/\{TRUE\}/1/g;
+    s/\{(\w\w\w\w+)\}/$1/g;
+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+    s/\bIMPORT\b/.extern/;
+    s/\bEXPORT\b/.global/;
+    s/^(\s+)\[/$1IF/;
+    s/^(\s+)\|/$1ELSE/;
+    s/^(\s+)\]/$1ENDIF/;
+    s/IF *:DEF:/ .ifdef/;
+    s/IF *:LNOT: *:DEF:/ .ifndef/;
+    s/ELSE/ .else/;
+    s/ENDIF/ .endif/;
+
+    if( /\bIF\b/ ) {
+      s/\bIF\b/ .if/;
+      s/=/==/;
+    }
+    if ( $n == 2) {
+        s/\$/\\/g;
+    }
+    if ($n == 1) {
+        s/\$//g;
+        s/label//g;
+        $n = 2;
+    }
+    if ( /MACRO/ ) {
+      s/MACRO *\n/.macro/;
+      $n=1;
+    }
+    if ( /\bMEND\b/ ) {
+      s/\bMEND\b/.endm/;
+      $n=0;
+    }
+
+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+    #
+    if ( /\bAREA\b/ ) {
+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;
+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata\n    .align 2/;
+        s/^(.+)\|\|\.data\|\|(.+)/    .data\n    .align 2/;
+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;
+    }
+
+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+    s/^(\s+)\%(\s)/    .space $1/;
+
+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+    if (/\bPROC\b/)
+    {
+        print "    .thumb_func" if ($thumb);
+        s/\bPROC\b/@ $&/;
+    }
+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+    s/\bENDP\b/@ $&/;
+    s/\bSUBT\b/@ $&/;
+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
+    s/\bKEEP\b/@ $&/;
+    s/\bEXPORTAS\b/@ $&/;
+    s/\|\|(.)+\bEQU\b/@ $&/;
+    s/\|\|([\w\$]+)\|\|/$1/;
+    s/\bENTRY\b/@ $&/;
+    s/\bASSERT\b/@ $&/;
+    s/\bGBLL\b/@ $&/;
+    s/\bGBLA\b/@ $&/;
+    s/^\W+OPT\b/@ $&/;
+    s/:OR:/|/g;
+    s/:SHL:/<</g;
+    s/:SHR:/>>/g;
+    s/:AND:/&/g;
+    s/:LAND:/&&/g;
+    s/CPSR/cpsr/;
+    s/SPSR/spsr/;
+    s/ALIGN$/.balign 4/;
+    s/ALIGN\s+([0-9x]+)$/.balign $1/;
+    s/psr_cxsf/psr_all/;
+    s/LTORG/.ltorg/;
+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed
+    s/\{PC\} \+/ \. +/;
+
+    # Single hex constant on the line !
+    #
+    # >>> NOTE <<<
+    #   Double-precision floats in gcc are always mixed-endian, which means
+    #   bytes in two words are little-endian, but words are big-endian.
+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+    #   and 0xfeed0000 at high address.
+    #
+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+    # Single hex constant on the line !
+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+    s/\bDCFS[ \t]+0x/.word 0x/;
+    s/\bDCFS\b/.float/;
+
+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+    s/\bDCD\b/.word/;
+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+    s/\bDCW\b/.short/;
+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+    s/\bDCB\b/.byte/;
+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+    s/^[A-Za-z_\.]\w+/$&:/;
+    s/^(\d+)/$1:/;
+    s/\%(\d+)/$1b_or_f/;
+    s/\%[Bb](\d+)/$1b/;
+    s/\%[Ff](\d+)/$1f/;
+    s/\%[Ff][Tt](\d+)/$1f/;
+    s/&([\dA-Fa-f]+)/0x$1/;
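+    # Convert armasm binary literals to hex: the &&&& marker sweeps the digits
+    #  right-to-left, one nibble (then any leftover bits) at a time, e.g.
+    #  2_10110001 -> conv10110001&&&& -> conv&&&&B1 -> 0xB1.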
+    if ( /\b2_[01]+\b/ ) {
+      s/\b2_([01]+)\b/conv$1&&&&/g;
+      while ( /[01][01][01][01]&&&&/ ) {
+        s/0000&&&&/&&&&0/g;
+        s/0001&&&&/&&&&1/g;
+        s/0010&&&&/&&&&2/g;
+        s/0011&&&&/&&&&3/g;
+        s/0100&&&&/&&&&4/g;
+        s/0101&&&&/&&&&5/g;
+        s/0110&&&&/&&&&6/g;
+        s/0111&&&&/&&&&7/g;
+        s/1000&&&&/&&&&8/g;
+        s/1001&&&&/&&&&9/g;
+        s/1010&&&&/&&&&A/g;
+        s/1011&&&&/&&&&B/g;
+        s/1100&&&&/&&&&C/g;
+        s/1101&&&&/&&&&D/g;
+        s/1110&&&&/&&&&E/g;
+        s/1111&&&&/&&&&F/g;
+      }
+      s/000&&&&/&&&&0/g;
+      s/001&&&&/&&&&1/g;
+      s/010&&&&/&&&&2/g;
+      s/011&&&&/&&&&3/g;
+      s/100&&&&/&&&&4/g;
+      s/101&&&&/&&&&5/g;
+      s/110&&&&/&&&&6/g;
+      s/111&&&&/&&&&7/g;
+      s/00&&&&/&&&&0/g;
+      s/01&&&&/&&&&1/g;
+      s/10&&&&/&&&&2/g;
+      s/11&&&&/&&&&3/g;
+      s/0&&&&/&&&&0/g;
+      s/1&&&&/&&&&1/g;
+      s/conv&&&&/0x/g;
+    }
+
+    if ( /commandline/)
+    {
+        if( /-bigend/)
+        {
+            $bigend=1;
+        }
+    }
+
+    if ( /\bDCDU\b/ )
+    {
+        my $cmd=$_;
+        my $value;
+        my $w1;
+        my $w2;
+        my $w3;
+        my $w4;
+
+        s/\s+DCDU\b/@ $&/;
+
+        $cmd =~ /\bDCDU\b\s+0x([0-9A-Fa-f]+)/;
+        $value = $1;
+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+        $w1 = $1;
+        $w2 = $2;
+        $w3 = $3;
+        $w4 = $4;
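+
+        # $w1..$w4 now hold the four bytes of the constant, most significant
+        #  first; e.g. DCDU 0x12345678 emits .byte 0x78,0x56,0x34,0x12 when
+        #  assembling little-endian.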
+
+        if( $bigend ne "")
+        {
+            # big endian
+
+            print "        .byte      0x".$w1;
+            print "        .byte      0x".$w2;
+            print "        .byte      0x".$w3;
+            print "        .byte      0x".$w4;
+        }
+        else
+        {
+            # little endian
+
+            print "        .byte      0x".$w4;
+            print "        .byte      0x".$w3;
+            print "        .byte      0x".$w2;
+            print "        .byte      0x".$w1;
+        }
+
+    }
+
+
+    if ( /\badrl\b/i )
+    {
+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+        $addPadding = 1;
+    }
+    s/\bEND\b/@ END/;
+} continue {
+    printf ("%s", $_) if $printit;
+    if ($addPadding != 0)
+    {
+        printf ("   mov r0,r0\n");
+        $addPadding = 0;
+    }
+}
+

+ 227 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armbits.s

@@ -0,0 +1,227 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id$
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	EXPORT oc_pack_read_arm
+	EXPORT oc_pack_read1_arm
+	EXPORT oc_huff_token_decode_arm
+
+oc_pack_read_arm
+	; r0 = oc_pack_buf *_b
+	; r1 = int          _bits
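+	; The bit reader keeps the next 32 bits of the stream MSB-first in
+	;  'window'; 'available' counts how many of them are valid.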
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+
+oc_pack_read1_arm
+	; r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      ; r0 = window>>31
+	MOV r2,r2,LSL #1       ; r2 = window<<=1
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+
+; We need to refill window.
+oc_pack_read1_refill
+	MOV r1,#1
+oc_pack_read_refill
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     ; r10 = stop
+	                       ; r11 = ptr
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	RSB r3,r3,r0           ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
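+;  E.g. with available=16, r3=32-available=16>7, so the CMPHI chain holds and
+;   another byte is loaded; once available>24, r3<=7 and the chain falls
+;   through.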
+	CMP r10,r11            ; ptr<stop => HI
+	CMPHI r3,#7            ;   available<=24 => HI
+	LDRHIB r14,[r11],#1    ;     r14 = *ptr++
+	SUBHI r3,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;     r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;     ptr<stop => HI
+	CMPHI r3,#7            ;       available<=24 => HI
+	LDRHIB r14,[r11],#1    ;         r14 = *ptr++
+	SUBHI r3,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;         r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;         ptr<stop => HI
+	CMPHI r3,#7            ;           available<=24 => HI
+	LDRHIB r14,[r11],#1    ;             r14 = *ptr++
+	SUBHI r3,#8            ;             available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;             r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;             ptr<stop => HI
+	CMPHI r3,#7            ;               available<=24 => HI
+	LDRHIB r14,[r11],#1    ;                 r14 = *ptr++
+	SUBHI r3,#8            ;                 available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          ; r3 = available-=_bits, available<_bits => LT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+;  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+	CMP r11,r10            ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte
+;  to fill up the window.
+	LDRLOB r14,[r11]       ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           ; (HS) r14 = 1
+	ADDLO r10,r3,r1        ; (LO) r10 = available
+	STRHS r14,[r12,#8]     ; (HS) eof = 1
+	ANDLO r10,r10,#7       ; (LO) r10 = available&7
+	MOVHS r3,#1<<30        ; (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   ; (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+
+
+oc_huff_token_decode_arm
+	; r0 = oc_pack_buf       *_b
+	; r1 = const ogg_int16_t *_tree
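+	; The tree is flattened: each node is a bit count n followed by 2^n
+	;  entries; positive entries are offsets of child nodes, while leaves are
+	;  stored non-positive, their negation packing (bits consumed<<8|token).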
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         ; r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       ; r2 = stop
+	; Stall...             ; r3 = ptr
+	; Stall...             ; r4 = window
+	                       ; r5 = available
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  ; r14 = _tree+bits
+	LDRSH r12,[r14,#2]     ; r12 = node=_tree[1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+	ADD r12,r1,r12,LSL #1  ; r12 = _tree+node
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r5,r10          ; r5 = available-=n
+	LDRSH r10,[r12],#2     ; r10 = n=_tree[node]
+	; Stall...             ; r12 = _tree+node+1
+	; Stall...
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0
+	ADD r12,r1,#2          ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              ; ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;   r14 = *ptr++
+	RSBHI r5,r5,#24        ; (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        ; (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    ;   r4 = window|=r14<<32-available
+	CMPHI r2,r3            ;   ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;     r14 = *ptr++
+	SUBHI r5,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;     r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMPHI r2,r3            ;     ptr<stop => HI
+	CMPHI r5,#7            ;       available<=24 => HI
+	LDRHIB r14,[r3],#1     ;         r14 = *ptr++
+	SUBHI r5,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;         r4 = window|=r14<<32-available
+	CMP r2,r3              ; ptr<stop => HI
+	MOVLS r5,#-1<<30       ; (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            ; (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     ; (HI)   r14 = *ptr++
+	SUBHI r5,#8            ; (HI)   available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ; (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          ; r5 = available
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+	END

+ 1 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armcpu.c

@@ -20,7 +20,7 @@
 #include "armcpu.h"
 
 #if !defined(OC_ARM_ASM)|| \
- !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_MEDIA)&& \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \
  !defined(OC_ARM_ASM_NEON)
 ogg_uint32_t oc_cpu_flags_get(void){
   return 0;

+ 645 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armfrag.s

@@ -0,0 +1,645 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id$
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+; Vanilla ARM v4 versions
+	EXPORT	oc_frag_copy_list_arm
+	EXPORT	oc_frag_recon_intra_arm
+	EXPORT	oc_frag_recon_inter_arm
+	EXPORT	oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
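+	; Copy each listed 8x8 fragment (two words per row, eight rows) from
+	;  _src_frame to _dst_frame.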
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		; r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp
+	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end
+	LDMFD	r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
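+; Each reconstructed sample is clamped to [0,255]: ADDS sets LT for negative
+;  results, CMPGT sets LT for values over 255, and EORLT with the value
+;  ASR #32 leaves 0x00 or 0xFF in the low byte accordingly.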
+ofrintra_lp_arm
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+
+oc_frag_recon_inter_arm
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src
+	; r2 =       int            ystride
+	; r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+
+oc_frag_recon_inter2_arm
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src1
+	; r2 = const unsigned char *src2
+	; r3 =       int            ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+
+ [ OC_ARM_ASM_EDSP
+	EXPORT	oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		; r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	LDRD	r6, [r4, r5]!		; r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	; Stall
+	STRD	r6, [r5, r0]!		; r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	; Stall
+	LDRD	r6, [r4, r2]!	; On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	; loads causes a stall, but that's no worse
+	LDRD	r10,[r4, r2]!	; than us only doing 2, and having to do
+				; another pair of LDRD/STRD later on.
+	; Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end
+	LDMFD	r13!,{r4-r11,PC}
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_frag_recon_intra_v6
+	EXPORT	oc_frag_recon_inter_v6
+	EXPORT	oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	LDR	r6, =0x00800080
+ofrintra_v6_lp
+	LDRD	r2, [r12],#8	; r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	; r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		; r2 = __11__00
+	USAT16	r3, #8, r3		; r3 = __33__22
+	USAT16	r4, #8, r4		; r4 = __55__44
+	USAT16	r5, #8, r5		; r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	; r2 = __111100
+	ORR	r3, r3, r3, LSR #8	; r3 = __333322
+	ORR	r4, r4, r4, LSR #8	; r4 = __555544
+	ORR	r5, r5, r5, LSR #8	; r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+
+oc_frag_recon_inter_v6
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp
+	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+ ]
+	PKHBT	r12,r6, r7, LSL #16	; r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r6,r4			; r6 = __22__00
+	UXTB16	r4,r4, ROR #8		; r4 = __33__11
+	QADD16	r12,r12,r6		; r12= xx22xx00
+	QADD16	r4, r7, r4		; r4 = xx33xx11
+	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		; r4 = __33__11
+	USAT16	r12,#8,r12		; r12= __22__00
+	ORR	r4, r12,r4, LSL #8	; r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	; r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
+	UXTB16	r6,r5			; r6 = __66__44
+	UXTB16	r5,r5, ROR #8		; r5 = __77__55
+	QADD16	r12,r12,r6		; r12= xx66xx44
+	QADD16	r5, r7, r5		; r5 = xx77xx55
+	USAT16	r12,#8, r12		; r12= __66__44
+	USAT16	r5, #8, r5		; r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	; r5 = 77665544
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+
+oc_frag_recon_inter2_v6
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp
+	LDRD	r6, [r12,#8]	; r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
+	UHADD8	r4, r4, r5	; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			; r5 = __66__44
+	UXTB16	r4, r4, ROR #8		; r4 = __77__55
+	QADD16	r8, r8, r5		; r8 = xx66xx44
+	QADD16	r9, r9, r4		; r9 = xx77xx55
+	LDRD	r6,[r12],#16	; r6 = 11110000 r7 = 33332222
+	USAT16	r8, #8, r8		; r8 = __66__44
+	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		; r9 = __77__55
+	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
+	UHADD8	r4, r4, r5	; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r5, r4			; r5 = __22__00
+	UXTB16	r4, r4, ROR #8		; r4 = __33__11
+	QADD16	r8, r8, r5		; r8 = xx22xx00
+	QADD16	r7, r7, r4		; r7 = xx33xx11
+	USAT16	r8, #8, r8		; r8 = __22__00
+	USAT16	r7, #8, r7		; r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_frag_copy_list_neon
+	EXPORT	oc_frag_recon_intra_neon
+	EXPORT	oc_frag_recon_inter_neon
+	EXPORT	oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	; Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	; Stall (on XScale)
+	MOV	r7, r6			; Guarantee PLD points somewhere valid.
+ofcl_neon_lp
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4@64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4@64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4@64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4@64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4@64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5@64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5@64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5@64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5@64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end
+	LDMFD	r13!,{r4-r7,PC}
+
+oc_frag_recon_intra_neon
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	MOV	r3, #128
+	VDUP.S16	Q0, r3
+	VLDMIA	r2,  {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	; D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	; D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	; D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0@64], r1
+	VQMOVUN.S16	D19,Q11	; D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0@64], r1
+	VQMOVUN.S16	D20,Q12	; D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0@64], r1
+	VQMOVUN.S16	D21,Q13	; D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0@64], r1
+	VQMOVUN.S16	D22,Q14	; D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0@64], r1
+	VQMOVUN.S16	D23,Q15	; D23= !!@@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0@64], r1
+	VST1.64	{D22},[r0@64], r1
+	VST1.64	{D23},[r0@64], r1
+	MOV	PC,R14
+
+oc_frag_recon_inter_neon
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	; etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r2
+	VST1.64	{D22},[r0@64], r2
+	VST1.64	{D23},[r0@64], r2
+	MOV	PC,R14
+
+oc_frag_recon_inter2_neon
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		; etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r3
+	VST1.64	{D22},[r0@64], r3
+	VST1.64	{D23},[r0@64], r3
+	MOV	PC,R14
+ ]
+
+	END

+ 1876 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armidct.s

@@ -0,0 +1,1876 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id$
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_idct8x8_1_arm
+	EXPORT	oc_idct8x8_arm
+
+oc_idct8x8_1_arm
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
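+	; Fill all 64 coefficients with _dc: duplicate it into both halfwords of
+	;  one word, then store four words per STMIA, eight times.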
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+
+oc_idct8x8_arm
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
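+	; Dispatch on _last_zzi, which bounds how far the nonzero coefficients
+	;  extend in zig-zag order: sparser blocks take cheaper transforms.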
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #8*16
+	CMP	r0, r2
+	MOV	r1, r13		; And read from temp storage.
+	BEQ	oc_idct8x8_slow_arm_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+oc_idct8x8_slow_arm_cols
+; Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_idct8x8_10_arm
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #4*16
+	CMP	r0, r2
+	MOV	r1, r13		; Read from temp storage.
+	BEQ	oc_idct8x8_10_arm_cols
+	MOV	r4, #0
+	STR	r4, [r0]
+	STR	r4, [r0,#4]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#20]
+	STR	r4, [r0,#32]
+	STR	r4, [r0,#48]
+	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_10_arm_cols
+; Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_idct8x8_6_arm
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16
+	CMP	r0, r2
+	MOV	r1, r13		; Read from temp storage.
+	BEQ	oc_idct8x8_6_arm_cols
+	MOV	r4, #0
+	STR	r4, [r0]
+	STR	r4, [r0,#4]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
+	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_6_arm_cols
+; Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+
+oc_idct8x8_3_arm
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16
+	CMP	r0, r2
+	MOV	r1, r13		; Read from temp storage.
+	MOVNE	r4, #0
+	STRNE	r4, [r0]
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r2		; Write to the final destination
+; Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+
+idct1core_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
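+	; r12= 0xB505 = OC_C4S4 (~32768*sqrt(2)), built from two 8-bit immediates.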
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+
+idct2core_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]
+	ADD	r11,r11,r9		; r11= t[0]+t[7]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		; y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+
+idct2core_down_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		; r11= t[0]+t[7]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		; y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+
+idct3core_arm
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2] = t[0]-t[2]
+					; r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7]
+	ADD	r5, r10,r5		; r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		; r12= t[2]+t2[5]
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	; r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	; r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	; r12= t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	; r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+
+idct3core_down_arm
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2]+8 = t[0]-t[2]+8
+					; r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		; r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		; r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	; r3 = t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	; r10= t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	; r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	; r9 = t2[3] - t[4]  + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+
+idct4core_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		; r11= t[0]+t2[7]
+	ADD	r6, r4, r6		; r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		; r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		; r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	; r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	; r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	; r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	; r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+
+idct4core_down_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		; r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		; r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		; r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		; r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	; r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	; r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	; r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	; r7 = t[3]-t2[4]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+
+idct8core_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
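+	; (In C this is just the cast (ogg_int16_t)(x[0]+x[4]); the LSL/ASR
+	;  #16 pairs below emulate that cast on the full-width sums.)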
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; Stage 2
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3]
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2]
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t2[6] + t2[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t2[6] - t2[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7]
+	ADD	r6, r6, r10		; r6 = t[1] + t[6]
+	ADD	r3, r3, r14		; r3 = t[2] + t[5]
+	ADD	r4, r4, r9		; r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+
+idct8core_down_arm
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+	; Stage 2
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	ADD	r2, r2, #8<<16		; r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		; r6 = t[1]+8<<16
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3] + 8
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2] + 8
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t2[6] + t2[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t2[6] - t2[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		; r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		; r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		; r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4] + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_idct8x8_1_v6
+	EXPORT	oc_idct8x8_v6
+
+oc_idct8x8_1_v6
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	MOV	PC, r14
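+; For reference, the whole function above is equivalent to this hedged C
+;  sketch (name taken from the asm label, signature assumed):
+;   void oc_idct8x8_1(ogg_int16_t *_y,ogg_uint16_t _dc){
+;     int i;
+;     for(i=0;i<64;i++)_y[i]=(ogg_int16_t)_dc;
+;   }
+;  Each STRD writes the replicated DC value four coefficients at a time.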
+
+oc_idct8x8_v6
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	;CMP	r2, #6
+	;BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
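+; The dispatch above mirrors the C idct as a hedged sketch (the
+;  6-coefficient special case is disabled; see the [ 0 block below):
+;   if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
+;   else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+;   else oc_idct8x8_slow(_y,_x);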
+oc_idct8x8_slow_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #8*16
+	CMP	r0, r2
+	MOV	r1, r13		; And read from temp storage.
+	BEQ	oc_idct8x8_slow_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+	STRD	r4, [r2], #8
+oc_idct8x8_slow_v6_cols
+; Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_idct8x8_10_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	; Align the stack.
+	ADD	r0, r0, r2	; Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #4*16
+	CMP	r0, r2
+	AND	r1, r13,#4	; Align the stack.
+	BEQ	oc_idct8x8_10_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r2]
+	STRD	r4, [r2,#16]
+	STR	r4, [r2,#32]
+	STR	r4, [r2,#48]
+oc_idct8x8_10_v6_cols
+; Column transforms
+	ADD	r1, r1, r13	; And read from temp storage.
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_idct8x8_3_v6
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2_1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16
+	CMP	r0, r8
+	MOV	r1, r13		; Read from temp storage.
+	MOVNE	r4, #0
+	STRNE	r4, [r0]
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r8		; Write to the final destination.
+; Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+
+idct2_1core_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		; r6 = x[1,0]
+	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+; Stage 2:
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]>
+; Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
+	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+; Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+ ]
+
+	ALIGN 8
+OC_C7S1
+	DCD	12785 ; 31F1
+OC_C1S7
+	DCD	64277 ; FB15
+OC_C6S2
+	DCD	25080 ; 61F8
+OC_C2S6
+	DCD	60547 ; EC83
+OC_C5S3
+	DCD	36410 ; 8E3A
+OC_C3S5
+	DCD	54491 ; D4DB
+OC_C4S4
+	DCD	46341 ; B505
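+; These are the iDCT rotation constants in Q16 fixed point:
+;  OC_CiSj = round(65536*cos(i*pi/16)) = round(65536*sin(j*pi/16)), j=8-i,
+;  e.g. OC_C4S4 = round(65536*cos(pi/4)) = 46341.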
+
+ [ OC_ARM_ASM_MEDIA
+idct2_2core_down_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7, #8			; r7 = 8
+	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		; r2 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough
+;  to pay for the increased branch mis-prediction cost of getting here, but in
+;  practice removing it does not seem to slow anything down, and it is less
+;  code this way.
+ [ 0
+oc_idct8x8_6_v6
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	; Align the stack.
+	ADD	r0, r0, r13	; Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16
+	CMP	r0, r8
+	AND	r1, r13,#4	; Align the stack.
+	BEQ	oc_idct8x8_6_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r0]
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
+	MOV	r0, r8		; Write to the final destination.
+oc_idct8x8_6_v6_cols
+; Column transforms
+	ADD	r1, r1, r13	; And read from temp storage.
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+
+idct1core_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	; Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,r14
+
+idct3_2core_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
+	; Stall
+	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_3core_stage3_v6
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_3_v6
+	DCD	12785 ; 31F1
+OC_C1S7_3_v6
+	DCD	64277 ; FB15
+OC_C6S2_3_v6
+	DCD	25080 ; 61F8
+OC_C2S6_3_v6
+	DCD	60547 ; EC83
+
+idct3_3core_down_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_4core_down_stage3_v6
+ ]
+
+idct4_3core_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
+	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]
+	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]
+	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]
+	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]
+	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_4_v6
+	DCD	12785 ; 31F1
+OC_C1S7_4_v6
+	DCD	64277 ; FB15
+OC_C6S2_4_v6
+	DCD	25080 ; 61F8
+OC_C2S6_4_v6
+	DCD	60547 ; EC83
+OC_C5S3_4_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_4_v6
+	DCD	54491 ; D4DB
+
+idct4_4core_down_v6
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16	; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+
+idct8_8core_v6
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,6]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,6]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_8_v6
+	DCD	12785 ; 31F1
+OC_C1S7_8_v6
+	DCD	64277 ; FB15
+OC_C6S2_8_v6
+	DCD	25080 ; 61F8
+OC_C2S6_8_v6
+	DCD	60547 ; EC83
+OC_C5S3_8_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_8_v6
+	DCD	54491 ; D4DB
+
+idct8_8core_down_v6
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,6]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,6]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_1_neon
+	EXPORT	oc_idct8x8_neon
+
+	ALIGN 16
+OC_IDCT_CONSTS_NEON
+	DCW	    8
+	DCW	64277 ; FB15 (C1S7)
+	DCW	60547 ; EC83 (C2S6)
+	DCW	54491 ; D4DB (C3S5)
+	DCW	46341 ; B505 (C4S4)
+	DCW	36410 ; 8E3A (C5S3)
+	DCW	25080 ; 61F8 (C6S2)
+	DCW	12785 ; 31F1 (C7S1)
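+; Note that VMULL.S16 treats the constants >= 0x8000 (C1S7, C2S6, C3S5,
+;  C4S4, C5S3) as negative, so a product with them comes out as
+;  OC_CkSj*x-(x<<16); the code below compensates by adding x back after
+;  the narrowing shift:
+;    ((OC_C1S7-65536)*x>>16)+x == OC_C1S7*x>>16
+;  exactly, since (x<<16)>>16 == x.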
+
+oc_idct8x8_1_neon
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]
+	MOV	PC, r14
+
+oc_idct8x8_neon
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	; Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2@128]
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3@128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	; 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	; Column transforms
+	BL	oc_idct8x8_stage123_neon
+	CMP	r0,r1
+	; We have to put the return address back in the LR, or the branch
+	;  predictor will not recognize the function return and mis-predict the
+	;  entire call stack.
+	MOV	r14, r12
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	BEQ		oc_idct8x8_slow_neon_noclear
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+
+oc_idct8x8_slow_neon_noclear
+	VPOP		{D8-D15}
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+
+oc_idct8x8_stage123_neon
+; Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
+; Stage 3
+	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
+	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
+	MOV	PC, r14
+
+oc_idct8x8_10_neon
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3@128]
+	MOV	r2, r1
+	; Row transforms (input is pre-transposed)
+; Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2@128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2@64], r12
+	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2@64]
+	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
+	VADD.S16	D4, D4, D18	; D4 = t[7]
+	VADD.S16	D6, D6, D19	; D6 = t[6]
+	VADD.S16	D7, D7, D19	; D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	; D30= t[0]
+					; D31= t[3]
+; Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
+					; D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
+					; D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
+					;       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
+					;       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
+	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
+	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
+	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
+; Stage 4
+	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
+					; D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]''
+					; D21= y[4]=t[3]'-t[4]'
+	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
+					; D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'+t[6]''
+					; D19= y[3]=t[3]'+t[4]'
+	; 8x4 transpose
+	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
+					; Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
+					; Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
+					; Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
+					; Q10= c7c6c5c4 c3c2c1c0
+	; Column transforms
+; Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	;  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	;  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	;  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	;  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
+					;           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
+					;           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
+	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
+	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
+	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
+	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
+; Stage 4
+	CMP	r0, r1
+	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
+	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
+	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
+	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
+	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
+	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
+	BEQ	oc_idct8x8_10_neon_noclear
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1@64]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+
+oc_idct8x8_10_neon_noclear
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+ ]
+
+	END

+ 664 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armloop.s

@@ -0,0 +1,664 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id$
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG	*	1
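+; (A hedged sketch of the assumed C layout: the fragment flags are
+;  bitfields along the lines of
+;    struct{unsigned coded:1;unsigned invalid:1;/*...*/};
+;  with 'coded' packed into bit 0 of the word loaded below.)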
+
+	; Vanilla ARM v4 version
+loop_filter_h_arm
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
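+; Reference C for one edge, as a hedged sketch (_bv is biased to the
+;  centre of a table mapping the raw filter value to its bounded version,
+;  and OC_CLAMP255 is the clamping helper from the C sources):
+;   for(i=0;i<8;i++){
+;     f=_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4>>3;
+;     f=_bv[f];
+;     _pix[1]=OC_CLAMP255(_pix[1]+f);
+;     _pix[2]=OC_CLAMP255(_pix[2]-f);
+;     _pix+=_ystride;
+;   }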
+
+loop_filter_v_arm
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+
+oc_loop_filter_frag_rows_arm
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_loop_filter_init_v6
+	EXPORT	oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
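+	; Storing ll=255-2*L lets the filters below compute
+	;  255-max(2*L-x,0) as UQADD8(x,ll), using only unsigned saturation.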
+	MOV	PC,r14
+
+; We could use the same strategy as the v filter below, but that would require
+;  40 instructions to load the data and transpose it into columns and another
+;  32 to write out the results at the end, plus the 52 instructions to do the
+;  filtering itself.
+; This is slightly fewer instructions, and less code, even assuming we could
+;  have shared the 52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+;  proposed for FFmpeg, but not by much:
+;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+;  of four.
+loop_filter_h_v6
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+
+loop_filter_h_core_v6
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r1-r2)+3>
+	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	;  unsigned saturated adds, which means the GE flags are still all
+	;  set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	;  the original sign of R_i.
+	SSUB8	r7, r10,r6
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
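+	; In scalar terms each byte lane of r7 now holds the lflim() magnitude
+	;  min(abs(R_i),max(2*L-abs(R_i),0)): R_i passes through unchanged for
+	;  abs(R_i)<=L and rolls off linearly to zero by abs(R_i)>=2*L.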
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+;  filter value, f:
+;   u = ~UHADD8(p1,~p2);
+;   v = UHADD8(~p1,p2);
+;   m = v-u;
+;   a = m^UHADD8(m^p0,m^~p3);
+;   f = UHADD8(UHADD8(a,u),v);
+;  where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+;  as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+;  but requires more code, because it does all eight columns at once, instead
+;  of four at a time.
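+; For reference, the value being produced, as plain C (a sketch; pixels are
+;  taken down the column being filtered):
+;   R = _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4>>3;  /* already in [-127,128] */
+;   f = 127+R;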
+loop_filter_v_v6
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	;  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_loop_filter_frag_rows_v6
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_loop_filter_init_neon
+	EXPORT	oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  ; r1 = 2*L
+	VDUP.S16	Q15, r1		; Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0@128]
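+	; Unlike the v6 code, which works with 255-2*L biased bytes, the NEON
+	;  filters keep 2*L in sixteen-bit lanes and clamp the widened filter
+	;  values directly with VQSUB.U16/VMIN.U16.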
+	MOV	PC,r14
+
+loop_filter_h_neon
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of two more of the slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		; D0 = ____________1100     2,1
+						; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		; D4 = ____________5544     2,1
+						; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		; D0 = ________99881100     3,1
+						; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		; D4 = ________DDCC5544     3,1
+						; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		; D0 = ____GGHH99881100     3,1
+						; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		; D4 = ____KKLLDDCC5544     3,1
+						; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		; D0 = PPOOGGHH99881100     3,1
+						; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		; D4 = TTSSKKLLDDCC5544     3,1
+						; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
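+	; E.g., with L=2 (2L=4): f=7 gives MIN(7,MAX(4-7,0))=0, while f=-3
+	;  gives -MIN(3,MAX(4-3,0))=-1, matching lflim()'s linear roll-off.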
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   ; Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a+(1+!b) = a+1+!b
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1, as Q9 is not needed until the second pipeline stage.
+	;  I think.)
+	VADDW.U8	Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		; D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+
+loop_filter_v_neon
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12@64], r1		; D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12@64], r1		; D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12@64], r1		; D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12@64]			; D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a+(1+!b) = a+1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1, as Q9 is not needed until the second pipeline stage.
+	;  I think.)
+	VADDW.U8	Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12@64], r1
+	VST1.64	{D4}, [r12@64], r1
+	MOV	PC,r14
+
+oc_loop_filter_frag_rows_neon
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2@128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+ ]
+
+	END

+ 39 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armopts.s.in

@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id$
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	@HAVE_ARM_ASM_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	@HAVE_ARM_ASM_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	@HAVE_ARM_ASM_NEON@
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
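+
+; Each of these symbols gates a section of the sources with the assembler's
+;  conditional brackets, e.g.
+;   [ OC_ARM_ASM_NEON
+;       ...NEON implementations...
+;   ]
+;  so that disabled variants are assembled out entirely; the @..@ values are
+;  filled in by the build system when armopts.s is generated from this file.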
+
+	END

+ 27 - 15
love/src/jni/libtheora-1.2.0alpha1/lib/arm/armstate.c

@@ -100,7 +100,7 @@ void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            refi;
+  int            mb_mode;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -117,14 +117,18 @@ void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  refi=_state->frags[_fragi].refi;
+  mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
-  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA){
+    oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  }
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
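+    /*OC_FRAME_FOR_MODE() selects the logical reference frame (golden or
+       previous) implied by the coding mode; ref_frame_idx[] then maps that
+       logical frame to the physical reference buffer currently holding it.*/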
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
@@ -140,7 +144,7 @@ void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            refi;
+  int            mb_mode;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -157,14 +161,18 @@ void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  refi=_state->frags[_fragi].refi;
+  mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
-  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA){
+    oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  }
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
@@ -180,7 +188,7 @@ void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            refi;
+  int            mb_mode;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -197,14 +205,18 @@ void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  refi=_state->frags[_fragi].refi;
+  mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
-  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA){
+    oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  }
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,

+ 0 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/bitpack.h

@@ -18,7 +18,6 @@
 # define _bitpack_H (1)
 # include <stddef.h>
 # include <limits.h>
-# include "internal.h"
 
 
 

+ 97 - 137
love/src/jni/libtheora-1.2.0alpha1/lib/collect.c

@@ -5,7 +5,7 @@
  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011                *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  *                                                                  *
  ********************************************************************
@@ -23,10 +23,8 @@
 #if defined(OC_COLLECT_METRICS)
 
 int              OC_HAS_MODE_METRICS;
-double           OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
-double           OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
-oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
-oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+double           OC_MODE_RD_WEIGHT[OC_LOGQ_BINS][3][2][OC_SAD_BINS];
+oc_mode_metrics  OC_MODE_METRICS[OC_LOGQ_BINS-1][3][2][OC_SAD_BINS];
 const char      *OC_MODE_METRICS_FILENAME="modedec.stats";
 
 void oc_mode_metrics_add(oc_mode_metrics *_metrics,
@@ -413,9 +411,7 @@ double oc_mode_metrics_solve(double *_r,double *_d,
 
 /*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately
    useful for mode decision.*/
-void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
- int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
- int _shift,double (*_weight)[3][2][OC_COMP_BINS]){
+void oc_mode_metrics_update(int _niters_min,int _reweight){
   int niters;
   int prevdr;
   int prevdd;
@@ -428,7 +424,7 @@ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
   dd=dr=INT_MAX;
   niters=0;
   /*The encoder interpolates rate and RMSE terms bilinearly from an
-     OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table.
+     OC_LOGQ_BINS by OC_SAD_BINS grid of sample points in OC_MODE_RD.
     To find the sample values at the grid points that minimize the total
      squared prediction error actually requires solving a relatively sparse
      linear system with a number of variables equal to the number of grid
@@ -443,7 +439,7 @@ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
     for(pli=0;pli<3;pli++){
       for(qti=0;qti<2;qti++){
         for(qi=0;qi<OC_LOGQ_BINS;qi++){
-          for(si=0;si<OC_COMP_BINS;si++){
+          for(si=0;si<OC_SAD_BINS;si++){
             oc_mode_metrics m[4];
             int             s0[4];
             int             s1[4];
@@ -467,58 +463,58 @@ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
             if(qi>0&&si>0){
               q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
               q1[n]=OC_MODE_LOGQ[qi][pli][qti];
-              s0[n]=si-1<<_shift;
-              s1[n]=si<<_shift;
-              ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
-              da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
-              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
-              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
-              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
-              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
-              *(m+n++)=*(_metrics[qi-1][pli][qti]+si-1);
+              s0[n]=si-1<<OC_SAD_SHIFT;
+              s1[n]=si<<OC_SAD_SHIFT;
+              ra[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(OC_MODE_METRICS[qi-1][pli][qti]+si-1);
             }
             if(qi>0){
-              ds=si+1<OC_COMP_BINS?1:-1;
+              ds=si+1<OC_SAD_BINS?1:-1;
               q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
               q1[n]=OC_MODE_LOGQ[qi][pli][qti];
-              s0[n]=si+ds<<_shift;
-              s1[n]=si<<_shift;
-              ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              s0[n]=si+ds<<OC_SAD_SHIFT;
+              s1[n]=si<<OC_SAD_SHIFT;
+              ra[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
               da[n]=
-               ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
-              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
-              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
-              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
-              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
-              *(m+n++)=*(_metrics[qi-1][pli][qti]+si);
+               ldexp(OC_MODE_RD[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(OC_MODE_RD[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(OC_MODE_METRICS[qi-1][pli][qti]+si);
             }
             if(qi+1<OC_LOGQ_BINS&&si>0){
               q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
               q1[n]=OC_MODE_LOGQ[qi][pli][qti];
-              s0[n]=si-1<<_shift;
-              s1[n]=si<<_shift;
-              ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
-              da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
-              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
-              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
-              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
-              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
-              *(m+n++)=*(_metrics[qi][pli][qti]+si-1);
+              s0[n]=si-1<<OC_SAD_SHIFT;
+              s1[n]=si<<OC_SAD_SHIFT;
+              ra[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(OC_MODE_METRICS[qi][pli][qti]+si-1);
             }
             if(qi+1<OC_LOGQ_BINS){
-              ds=si+1<OC_COMP_BINS?1:-1;
+              ds=si+1<OC_SAD_BINS?1:-1;
               q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
               q1[n]=OC_MODE_LOGQ[qi][pli][qti];
-              s0[n]=si+ds<<_shift;
-              s1[n]=si<<_shift;
-              ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              s0[n]=si+ds<<OC_SAD_SHIFT;
+              s1[n]=si<<OC_SAD_SHIFT;
+              ra[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
               da[n]=
-               ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
-              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
-              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
-              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
-              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
-              *(m+n++)=*(_metrics[qi][pli][qti]+si);
+               ldexp(OC_MODE_RD[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(OC_MODE_RD[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(OC_MODE_RD[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(OC_MODE_METRICS[qi][pli][qti]+si);
             }
             /*On the first pass, initialize with a simple weighted average of
                the neighboring bins.*/
@@ -532,19 +528,19 @@ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
               }
               r=w>1E-3?r/w:0;
               d=w>1E-3?d/w:0;
-              _weight[qi][pli][qti][si]=w;
+              OC_MODE_RD_WEIGHT[qi][pli][qti][si]=w;
             }
             else{
               /*Update the grid point and save the weight for later.*/
-              _weight[qi][pli][qti][si]=
+              OC_MODE_RD_WEIGHT[qi][pli][qti][si]=
                oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n);
             }
             rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767);
             rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767);
-            dr+=abs(rate-_table[qi][pli][qti][si].rate);
-            dd+=abs(rmse-_table[qi][pli][qti][si].rmse);
-            _table[qi][pli][qti][si].rate=(ogg_int16_t)rate;
-            _table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
+            dr+=abs(rate-OC_MODE_RD[qi][pli][qti][si].rate);
+            dd+=abs(rmse-OC_MODE_RD[qi][pli][qti][si].rmse);
+            OC_MODE_RD[qi][pli][qti][si].rate=(ogg_int16_t)rate;
+            OC_MODE_RD[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
           }
         }
       }
@@ -560,17 +556,17 @@ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
        samples in each bin to overcome the constant OC_ZWEIGHT factor.
       This encourages sampling under-populated bins and prevents a single large
        sample early on from discouraging coding in that bin ever again.*/
-    for(pli=0;pli<3;pli++){
+    for(pli=0;pli<3;pli++){ 
       for(qti=0;qti<2;qti++){
         for(qi=0;qi<OC_LOGQ_BINS;qi++){
-          for(si=0;si<OC_COMP_BINS;si++){
+          for(si=0;si<OC_SAD_BINS;si++){
             double wt;
-            wt=_weight[qi][pli][qti][si];
+            wt=OC_MODE_RD_WEIGHT[qi][pli][qti][si];
             wt/=OC_ZWEIGHT+wt;
-            _table[qi][pli][qti][si].rate=(ogg_int16_t)
-             (_table[qi][pli][qti][si].rate*wt+0.5);
-            _table[qi][pli][qti][si].rmse=(ogg_int16_t)
-             (_table[qi][pli][qti][si].rmse*wt+0.5);
+            OC_MODE_RD[qi][pli][qti][si].rate=(ogg_int16_t)
+             (OC_MODE_RD[qi][pli][qti][si].rate*wt+0.5);
+            OC_MODE_RD[qi][pli][qti][si].rmse=(ogg_int16_t)
+             (OC_MODE_RD[qi][pli][qti][si].rmse*wt+0.5);
           }
         }
       }
@@ -578,31 +574,48 @@ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
   }
 }
 
-/*Dump the in memory mode metrics to a file.
-  Note this data format isn't portable between different platforms.*/
 void oc_mode_metrics_dump(void){
   FILE *fmetrics;
   fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb");
   if(fmetrics!=NULL){
+    (void)fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
     (void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
-    (void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
-    (void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
     fclose(fmetrics);
   }
 }
 
-void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
-#if !defined(OC_COLLECT_METRICS)
- const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
-#else
- oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
-#endif
+void oc_mode_metrics_print(FILE *_fout){
   int qii;
   fprintf(_fout,
+   "/*File generated by libtheora with OC_COLLECT_METRICS"
+   " defined at compile time.*/\n"
+   "#if !defined(_modedec_H)\n"
+   "# define _modedec_H (1)\n"
+   "# include \"encint.h\"\n"
+   "\n"
+   "\n"
+   "\n"
+   "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
+   "   (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
+   "  The actual statistics used by the encoder will be interpolated from\n"
+   "   that table based on log_plq for the actual quantization matrix used.*/\n"
    "# if !defined(OC_COLLECT_METRICS)\n"
    "static const\n"
    "# endif\n"
-   "oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name);
+   "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){
+    fprintf(_fout,"  { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
+     OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
+     OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
+     qii+1<OC_LOGQ_BINS?",":"");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n"
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "oc_mode_rd OC_MODE_RD[OC_LOGQ_BINS][3][2][OC_SAD_BINS]={\n");
   for(qii=0;qii<OC_LOGQ_BINS;qii++){
     int pli;
     fprintf(_fout,"  {\n");
@@ -619,12 +632,12 @@ void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
          pl_names[pli],qi,qti_names[qti]);
         fprintf(_fout,"      {\n");
         fprintf(_fout,"        ");
-        for(bin=0;bin<OC_COMP_BINS;bin++){
+        for(bin=0;bin<OC_SAD_BINS;bin++){
           if(bin&&!(bin&0x3))fprintf(_fout,"\n        ");
           fprintf(_fout,"{%5i,%5i}",
-           _mode_rd_table[qii][pli][qti][bin].rate,
-           _mode_rd_table[qii][pli][qti][bin].rmse);
-          if(bin+1<OC_COMP_BINS)fprintf(_fout,",");
+           OC_MODE_RD[qii][pli][qti][bin].rate,
+           OC_MODE_RD[qii][pli][qti][bin].rmse);
+          if(bin+1<OC_SAD_BINS)fprintf(_fout,",");
         }
         fprintf(_fout,"\n      }");
         if(qti<1)fprintf(_fout,",");
@@ -640,40 +653,7 @@ void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
   }
   fprintf(_fout,
    "};\n"
-   "\n");
-}
-
-void oc_mode_metrics_print(FILE *_fout){
-  int qii;
-  fprintf(_fout,
-   "/*File generated by libtheora with OC_COLLECT_METRICS"
-   " defined at compile time.*/\n"
-   "#if !defined(_modedec_H)\n"
-   "# define _modedec_H (1)\n"
-   "# include \"encint.h\"\n"
-   "\n"
    "\n"
-   "\n"
-   "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
-   "   (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
-   "  The actual statistics used by the encoder will be interpolated from\n"
-   "   that table based on log_plq for the actual quantization matrix used.*/\n"
-   "# if !defined(OC_COLLECT_METRICS)\n"
-   "static const\n"
-   "# endif\n"
-   "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
-  for(qii=0;qii<OC_LOGQ_BINS;qii++){
-    fprintf(_fout,"  { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
-     OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
-     OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
-     qii+1<OC_LOGQ_BINS?",":"");
-  }
-  fprintf(_fout,
-   "};\n"
-   "\n");
-  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD);
-  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD);
-  fprintf(_fout,
    "#endif\n");
 }
 
@@ -684,15 +664,11 @@ void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){
   /*Load any existing mode metrics if we haven't already.*/
   if(!OC_HAS_MODE_METRICS){
     FILE *fmetrics;
-    memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
-    memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
+    memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
     fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb");
     if(fmetrics!=NULL){
-      /*Read in the binary structures as written my oc_mode_metrics_dump().
-        Note this format isn't portable between different platforms.*/
+      (void)fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
       (void)fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
-      (void)fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
-      (void)fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
       fclose(fmetrics);
     }
     else{
@@ -707,10 +683,7 @@ void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){
         }
       }
     }
-    oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1,
-     OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
-    oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1,
-     OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+    oc_mode_metrics_update(100,1);
     OC_HAS_MODE_METRICS=1;
   }
 }
@@ -837,7 +810,6 @@ void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
     64,64,64,64,64,64,64,64
   };
   const oc_fragment *frags;
-  const unsigned    *frag_sad;
   const unsigned    *frag_satd;
   const unsigned    *frag_ssd;
   const ptrdiff_t   *coded_fragis;
@@ -868,7 +840,6 @@ void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
   }
   qti=_enc->state.frame_type;
   frags=_enc->state.frags;
-  frag_sad=_enc->frag_sad;
   frag_satd=_enc->frag_satd;
   frag_ssd=_enc->frag_ssd;
   coded_fragis=_enc->state.coded_fragis;
@@ -905,9 +876,7 @@ void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
       int       huffi;
       int       skip;
       int       mb_mode;
-      unsigned  sad;
       unsigned  satd;
-      double    sqrt_ssd;
       int       bin;
       int       qtj;
       fragi=coded_fragis[fragii];
@@ -945,29 +914,20 @@ void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
       mb_mode=frags[fragi].mb_mode;
       qii=frags[fragi].qii;
       qi=_enc->state.qis[qii];
-      sad=frag_sad[fragi]<<(pli+1&2);
       satd=frag_satd[fragi]<<(pli+1&2);
-      sqrt_ssd=sqrt(frag_ssd[fragi]);
+      bin=OC_MINI(satd>>OC_SAD_SHIFT,OC_SAD_BINS-1);
       qtj=mb_mode!=OC_MODE_INTRA;
       /*Accumulate statistics.
         The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by
          OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor
          yet still use old data.*/
-      bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1);
-      oc_mode_metrics_add(
-       OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin,
-       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
-      bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1);
       oc_mode_metrics_add(
-       OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin,
-       fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+       OC_MODE_METRICS[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt(frag_ssd[fragi]));
     }
   }
-  /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/
-  oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1,
-   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
-  oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1,
-   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+  /*Update global SATD/logq/rate/RMSE estimation matrix.*/
+  oc_mode_metrics_update(4,1);
 }
 # endif
 

+ 4 - 7
love/src/jni/libtheora-1.2.0alpha1/lib/collect.h

@@ -79,12 +79,10 @@ struct oc_mode_metrics{
    out the contributions from AC and DC into separate tables.*/
 
 extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2];
-extern oc_mode_rd  OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
-extern oc_mode_rd  OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+extern oc_mode_rd  OC_MODE_RD[OC_LOGQ_BINS][3][2][OC_SAD_BINS];
 
 extern int              OC_HAS_MODE_METRICS;
-extern oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
-extern oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern oc_mode_metrics  OC_MODE_METRICS[OC_LOGQ_BINS-1][3][2][OC_SAD_BINS];
 extern const char      *OC_MODE_METRICS_FILENAME;
 
 void oc_mode_metrics_dump();
@@ -99,9 +97,8 @@ double oc_mode_metrics_solve(double *_r,double *_d,
  const int *_q0,const int *_q1,
  const double *_ra,const double *_rb,const double *_rc,
  const double *_da,const double *_db,const double *_dc,int _n);
-void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
- int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
- int shift,double (*_weight)[3][2][OC_COMP_BINS]);
+void oc_mode_metrics_update(int _niters_min,int _reweight);
+
 void oc_enc_mode_metrics_load(oc_enc_ctx *_enc);
 void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
 

+ 1 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/decint.h

@@ -105,7 +105,7 @@ struct oc_dec_pipeline_state{
   const ogg_uint16_t *dequant[3][3][2];
   int                 fragy0[3];
   int                 fragy_end[3];
-  int                 pred_last[3][4];
+  int                 pred_last[3][3];
   int                 mcu_nvfrags;
   int                 loop_filter;
   int                 pp_level;

File diff suppressed because it is too large
+ 115 - 765
love/src/jni/libtheora-1.2.0alpha1/lib/decode.c


+ 19 - 40
love/src/jni/libtheora-1.2.0alpha1/lib/encfrag.c

@@ -86,27 +86,6 @@ unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
   return sad;
 }
 
-unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){
-  const unsigned char *src = _src;
-  unsigned dc;
-  unsigned sad;
-  int      i;
-  dc=0;
-  for(i=8;i-->0;){
-    int j;
-    for(j=0;j<8;j++)dc+=src[j];
-    src+=_ystride;
-  }
-  dc=dc+32>>6;
-  sad=0;
-  for(i=8;i-->0;){
-    int j;
-    for(j=0;j<8;j++)sad+=abs(_src[j]-dc);
-    _src+=_ystride;
-  }
-  return sad;
-}
-
 static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   int i;
@@ -262,19 +241,19 @@ static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
   }
 }
 
-unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
-  unsigned sad;
-  int      dc;
-  int      t0;
-  int      t1;
-  int      t2;
-  int      t3;
-  int      t4;
-  int      t5;
-  int      t6;
-  int      t7;
-  int      r;
-  int      i;
+unsigned oc_hadamard_sad(unsigned *_dc,const ogg_int16_t _buf[64]){
+  unsigned    sad;
+  unsigned    dc;
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         r;
+  int         i;
   sad=dc=0;
   for(i=0;i<8;i++){
     /*Hadamard stage 1:*/
@@ -300,7 +279,7 @@ unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
     t5+=t7;
     t7=r-t7;
     /*Hadamard stage 3:*/
-    r=abs(t0+t1)&-(i>0);
+    r=abs(t0+t1);
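+    /*The i==0 row is no longer masked out of this sum; the DC magnitude it
+       contributes is subtracted back out in the return statement below.*/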
     r+=abs(t0-t1);
     r+=abs(t2+t3);
     r+=abs(t2-t3);
@@ -310,26 +289,26 @@ unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
     r+=abs(t6-t7);
     sad+=r;
   }
-  dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
+  dc=abs(_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7]);
   *_dc=dc;
-  return sad;
+  return sad-dc;
 }
 
-unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_c(unsigned *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   ogg_int16_t buf[64];
   oc_diff_hadamard(buf,_src,_ref,_ystride);
   return oc_hadamard_sad(_dc,buf);
 }
 
-unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_c(unsigned *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   ogg_int16_t buf[64];
   oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
   return oc_hadamard_sad(_dc,buf);
 }
 
-unsigned oc_enc_frag_intra_satd_c(int *_dc,
+unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,
  const unsigned char *_src,int _ystride){
   ogg_int16_t buf[64];
   oc_intra_hadamard(buf,_src,_ystride);

+ 22 - 42
love/src/jni/libtheora-1.2.0alpha1/lib/encint.h

@@ -51,9 +51,6 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 #   include "x86/x86enc.h"
 #  endif
 # endif
-# if defined(OC_ARM_ASM)
-#  include "arm/armenc.h"
-# endif
 
 # if !defined(oc_enc_accel_init)
 #  define oc_enc_accel_init oc_enc_accel_init_c
@@ -79,10 +76,6 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 #   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
   ((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh))
 #  endif
-#  if !defined(oc_enc_frag_intra_sad)
-#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
-  ((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride))
-#  endif
 #  if !defined(oc_enc_frag_satd)
 #   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
   ((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride))
@@ -152,10 +145,6 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 #   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
   oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh)
 #  endif
-#  if !defined(oc_enc_frag_intra_sad)
-#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
-  oc_enc_frag_intra_sad_c(_src,_ystride)
-#  endif
 #  if !defined(oc_enc_frag_satd)
 #   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
   oc_enc_frag_satd_c(_dc,_src,_ref,_ystride)
@@ -220,12 +209,10 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 #define OC_SP_LEVEL_EARLY_SKIP    (1)
 /*Use analysis shortcuts, single quantizer, and faster tokenization.*/
 #define OC_SP_LEVEL_FAST_ANALYSIS (2)
-/*Use SAD instead of SATD*/
-#define OC_SP_LEVEL_NOSATD        (3)
 /*Disable motion compensation.*/
-#define OC_SP_LEVEL_NOMC          (4)
+#define OC_SP_LEVEL_NOMC          (3)
 /*Maximum valid speed level.*/
-#define OC_SP_LEVEL_MAX           (4)
+#define OC_SP_LEVEL_MAX           (3)
 
 
 /*The number of extra bits of precision at which to store rate metrics.*/
@@ -235,12 +222,12 @@ typedef struct oc_token_checkpoint    oc_token_checkpoint;
 # define OC_RMSE_SCALE (5)
 /*The number of quantizer bins to partition statistics into.*/
 # define OC_LOGQ_BINS  (8)
-/*The number of SAD/SATD bins to partition statistics into.*/
-# define OC_COMP_BINS   (24)
-/*The number of bits of precision to drop from SAD and SATD scores
-   to assign them to a bin.*/
-# define OC_SAD_SHIFT  (6)
-# define OC_SATD_SHIFT (9)
+/*The number of SATD bins to partition statistics into.*/
+# define OC_SAD_BINS   (24)
+/*The number of bits of precision to drop from SATD scores to assign them to
+   a bin.*/
+# define OC_SAD_SHIFT  (9)
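+/*E.g., a fragment SATD of 5000 lands in bin
+   OC_MINI(5000>>OC_SAD_SHIFT,OC_SAD_BINS-1)=OC_MINI(9,23)=9; see the binning
+   in collect.c.*/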
+
 
 /*Masking is applied by scaling the D used in R-D optimization (via rd_scale)
    or the lambda parameter (via rd_iscale).
@@ -302,12 +289,12 @@ struct oc_enc_opt_vtable{
   unsigned (*frag_sad2_thresh)(const unsigned char *_src,
    const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
    unsigned _thresh);
-  unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride);
-  unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
+  unsigned (*frag_satd)(unsigned *_dc,const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
-  unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
+  unsigned (*frag_satd2)(unsigned *_dc,const unsigned char *_src,
    const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-  unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
+  unsigned (*frag_intra_satd)(unsigned *_dc,const unsigned char *_src,
+   int _ystride);
   unsigned (*frag_ssd)(const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
   unsigned (*frag_border_ssd)(const unsigned char *_src,
@@ -457,7 +444,7 @@ struct oc_enc_pipeline_state{
     This is kept off the stack because a) gcc can't align things on the stack
      reliably on ARM, and b) it avoids (unintentional) data hazards between
      ARM and NEON code.*/
-  OC_ALIGN16(ogg_int16_t dct_data[64*3]);
+  OC_ALIGN16(ogg_int16_t dct_data[128]);
   OC_ALIGN16(signed char bounding_values[256]);
   oc_fr_state         fr[3];
   oc_qii_state        qs[3];
@@ -524,8 +511,6 @@ struct oc_frame_metrics{
   unsigned      dup_count:31;
   /*The frame type from pass 1.*/
   unsigned      frame_type:1;
-  /*The frame activity average from pass 1.*/
-  unsigned      activity_avg;
 };
 
 
@@ -691,10 +676,8 @@ struct th_enc_ctx{
   /*The offset of the first DCT token for each coefficient for each plane.*/
   unsigned char            dct_token_offs[3][64];
   /*The last DC coefficient for each plane and reference frame.*/
-  int                      dc_pred_last[3][4];
+  int                      dc_pred_last[3][3];
 #if defined(OC_COLLECT_METRICS)
-  /*Fragment SAD statistics for MB mode estimation metrics.*/
-  unsigned                *frag_sad;
   /*Fragment SATD statistics for MB mode estimation metrics.*/
   unsigned                *frag_satd;
   /*Fragment SSD statistics for MB mode estimation metrics.*/
@@ -721,7 +704,7 @@ struct th_enc_ctx{
   /*Storage for the quantization tables.*/
   unsigned char           *enquant_table_data;
   /*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
-    This is used to parameterize the rate control decisions.
+    This is used to paramterize the rate control decisions.
     They are kept in the log domain to simplify later processing.
     These are DCT domain quantizers, and so are scaled by an additional factor
      of 4 from the pixel domain.*/
@@ -738,7 +721,7 @@ struct th_enc_ctx{
   ogg_uint16_t             chroma_rd_scale[2][64][2];
   /*The interpolated mode decision R-D lookup tables for the current
      quantizers, color plane, and quantization type.*/
-  oc_mode_rd               mode_rd[3][3][2][OC_COMP_BINS];
+  oc_mode_rd               mode_rd[3][3][2][OC_SAD_BINS];
   /*The buffer state used to drive rate control.*/
   oc_rc_state              rc;
 # if defined(OC_ENC_USE_VTABLE)
@@ -782,12 +765,10 @@ struct oc_token_checkpoint{
 
 void oc_enc_tokenize_start(oc_enc_ctx *_enc);
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
- const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
 int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
- const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
  const oc_token_checkpoint *_stack,int _n);
@@ -822,13 +803,12 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
 unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride);
-unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_c(unsigned *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_c(unsigned *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_intra_satd_c(int *_dc,
- const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,const unsigned char *_src,
+ int _ystride);
 unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,

+ 1 - 18
love/src/jni/libtheora-1.2.0alpha1/lib/encode.c

@@ -942,7 +942,6 @@ void oc_enc_accel_init_c(oc_enc_ctx *_enc){
   _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
   _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
   _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
-  _enc->opt_vtable.frag_intra_sad=oc_enc_frag_intra_sad_c;
   _enc->opt_vtable.frag_satd=oc_enc_frag_satd_c;
   _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c;
   _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
@@ -1181,7 +1180,6 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
      _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
   }
 #if defined(OC_COLLECT_METRICS)
-  _enc->frag_sad=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_sad));
   _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
   _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
 #endif
@@ -1214,7 +1212,7 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
    ||_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL
    ||_enc->extra_bits[2]==NULL
 #if defined(OC_COLLECT_METRICS)
-   ||_enc->frag_sad==NULL||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
+   ||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
 #endif
    ||oc_enc_set_quant_params(_enc,NULL)<0){
     oc_enc_clear(_enc);
@@ -1239,7 +1237,6 @@ static void oc_enc_clear(oc_enc_ctx *_enc){
   oc_mode_metrics_dump();
   _ogg_free(_enc->frag_ssd);
   _ogg_free(_enc->frag_satd);
-  _ogg_free(_enc->frag_sad);
 #endif
   for(pli=3;pli-->0;){
     oc_free_2d(_enc->extra_bits[pli]);
@@ -1258,8 +1255,6 @@ static void oc_enc_drop_frame(th_enc_ctx *_enc){
   /*Use the previous frame's reconstruction.*/
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=
    _enc->state.ref_frame_idx[OC_FRAME_PREV];
-  _enc->state.ref_frame_data[OC_FRAME_SELF]=
-   _enc->state.ref_frame_data[OC_FRAME_PREV];
   /*Flag motion vector analysis about the frame drop.*/
   _enc->prevframe_dropped=1;
   /*Zero the packet.*/
@@ -1695,37 +1690,27 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
   if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
     _enc->state.ref_frame_idx[OC_FRAME_PREV]=
      _enc->state.ref_frame_idx[OC_FRAME_SELF];
-    _enc->state.ref_frame_data[OC_FRAME_PREV]=
-     _enc->state.ref_frame_data[OC_FRAME_SELF];
     if(_enc->state.frame_type==OC_INTRA_FRAME){
       /*The new frame becomes both the previous and gold reference frames.*/
       _enc->state.keyframe_num=_enc->state.curframe_num;
       _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
        _enc->state.ref_frame_idx[OC_FRAME_SELF];
-      _enc->state.ref_frame_data[OC_FRAME_GOLD]=
-       _enc->state.ref_frame_data[OC_FRAME_SELF];
     }
   }
   if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){
     _enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]=
      _enc->state.ref_frame_idx[OC_FRAME_IO];
-    _enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]=
-     _enc->state.ref_frame_data[OC_FRAME_IO];
     if(_enc->state.frame_type==OC_INTRA_FRAME){
       /*The new input frame becomes both the previous and gold
          original-reference frames.*/
       _enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]=
        _enc->state.ref_frame_idx[OC_FRAME_IO];
-      _enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]=
-       _enc->state.ref_frame_data[OC_FRAME_IO];
     }
   }
   /*Select a free buffer to use for the incoming frame*/
   for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]||
    refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++);
   _enc->state.ref_frame_idx[OC_FRAME_IO]=refi;
-  _enc->state.ref_frame_data[OC_FRAME_IO]=
-   _enc->state.ref_frame_bufs[refi][0].data;
   /*Step 3: Copy the input to our internal buffer.
     This lets us add padding, so we don't have to worry about dereferencing
      possibly invalid addresses, and allows us to use the same strides and
@@ -1744,8 +1729,6 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
   for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
    refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
-  _enc->state.ref_frame_data[OC_FRAME_SELF]=
-   _enc->state.ref_frame_bufs[refi][0].data;
   _enc->state.curframe_num+=_enc->prev_dup_count+1;
   /*Step 4: Compress the frame.*/
   /*Start with a keyframe, and don't allow the generation of invalid files that

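With the duplicated ref_frame_data assignments gone, this snapshot tracks references purely through ref_frame_idx and resolves data pointers per buffer index at the point of use (see the mcenc.c and state.c hunks below). The free-buffer selection is the probe loop shown above; as a standalone sketch of the idiom (function name invented), assuming at least three buffers so the scan always terminates:

/*Sketch of the free-slot probe above: scan upward from 0, skipping the at
   most two indices still claimed by the gold and previous references, so
   the loop always stops at 0, 1, or 2 given three or more buffers.*/
static int pick_free_slot(int _gold_idx,int _prev_idx){
  int refi;
  for(refi=0;refi==_gold_idx||refi==_prev_idx;refi++);
  return refi;
}
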
+ 1 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/enquant.c

@@ -229,7 +229,7 @@ int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
   enquant=(const oc_iquant *)_enquant;
   nonzero=0;
   for(zzi=0;zzi<64;zzi++){
-    val=_dct[zzi];
+    val=_dct[OC_FZIG_ZAG[zzi]];
     d=_dequant[zzi];
     val=val<<1;
     if(abs(val)>=d){

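Paired with the fdct.c change below, DCT coefficients now stay in natural order until quantization, which applies OC_FZIG_ZAG as it reads. A sketch of the dead-zone test above, with two caveats: the plain division and half-away-from-zero rounding are one plausible reading, while the real code divides via its precomputed oc_iquant reciprocal tables, and the function name is invented:

#include <stdlib.h>

/*Illustrative only: quantize natural-order DCT coefficients into zig-zag
   order with a half-step dead zone, matching the doubled-value test above.*/
static int quantize_zigzag(short _qdct[64],const short _dct[64],
 const unsigned short _dequant[64],const unsigned char _zig_zag[64]){
  int zzi;
  int nonzero;
  nonzero=0;
  for(zzi=0;zzi<64;zzi++){
    int val;
    int d;
    int qval;
    val=_dct[_zig_zag[zzi]];
    d=_dequant[zzi];
    qval=0;
    /*Values under half a quantizer step collapse to zero.*/
    if(abs(2*val)>=d){
      /*Round half away from zero: equivalent to round(val/d).*/
      qval=val<0?(2*val-d)/(2*d):(2*val+d)/(2*d);
      nonzero=zzi;
    }
    _qdct[zzi]=(short)qval;
  }
  /*Index of the last nonzero coefficient, per the bookkeeping above.*/
  return nonzero;
}
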
+ 1 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/fdct.c

@@ -147,7 +147,7 @@ void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   /*Round the result back to the external working precision (which is still
      scaled by four relative to the orthogonal result).
     TODO: We should just update the external working precision.*/
-  for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2;
+  for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
 }
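
The rewritten store drops the zig-zag reordering (moved into enquant.c above) but keeps the bias-and-shift rounding. Note that w[i]+2>>2 parses as (w[i]+2)>>2 -- additive operators bind tighter than shifts in C -- which rounds the divide-by-4 to nearest instead of flooring, assuming an arithmetic right shift for negative values, as libtheora does throughout. In isolation:

/*(w+2)>>2 rounds w/4 to nearest, e.g. 5 -> 1, 6 -> 2, 7 -> 2.*/
static short round_shift2(int _w){
  return (short)(_w+2>>2);
}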
 
 

+ 21 - 11
love/src/jni/libtheora-1.2.0alpha1/lib/huffdec.c

@@ -377,12 +377,12 @@ static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
    representation.
   _opb:   The buffer to unpack the trees from.
   _nodes: The table to fill with the Huffman trees.
-  Return: 0 on success, or a negative value on error.
-          The caller is responsible for cleaning up any partially initialized
-           _nodes on failure.*/
+  Return: 0 on success, or a negative value on error.*/
 int oc_huff_trees_unpack(oc_pack_buf *_opb,
  ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
+  int ret;
   int i;
+  ret=0;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
     unsigned char  tokens[256][2];
     int            ntokens;
@@ -390,19 +390,29 @@ int oc_huff_trees_unpack(oc_pack_buf *_opb,
     size_t         size;
     /*Unpack the full tree into a temporary buffer.*/
     ntokens=oc_huff_tree_unpack(_opb,tokens);
-    if(ntokens<0)return ntokens;
+    if(ntokens<0){
+      ret=ntokens;
+      break;
+    }
     /*Figure out how big the collapsed tree will be and allocate space for it.*/
     size=oc_huff_tree_collapse(NULL,tokens,ntokens);
-    /*This should never happen; if it does it means you set OC_HUFF_SLUSH or
-       OC_ROOT_HUFF_SLUSH too large.*/
-    if(size>32767)return TH_EIMPL;
+    if(size>32767){
+      /*This should never happen; if it does it means you set OC_HUFF_SLUSH or
+         OC_ROOT_HUFF_SLUSH too large.*/
+      ret=TH_EIMPL;
+      break;
+    }
     tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
-    if(tree==NULL)return TH_EFAULT;
+    if(tree==NULL){
+      ret=TH_EFAULT;
+      break;
+    }
    /*Construct the collapsed tree.*/
     oc_huff_tree_collapse(tree,tokens,ntokens);
     _nodes[i]=tree;
   }
-  return 0;
+  if(ret<0)while(i-->0)_ogg_free(_nodes[i]);
+  return ret;
 }
 
 /*Determines the size in words of a Huffman subtree.
@@ -465,7 +475,7 @@ void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
   _opb:  The buffer to unpack the token from.
   _node: The tree to unpack the token with.
   Return: The token value.*/
-int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+int oc_huff_token_decode(oc_pack_buf *_opb,const ogg_int16_t *_tree){
   const unsigned char *ptr;
   const unsigned char *stop;
   oc_pb_window         window;
@@ -487,7 +497,7 @@ int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
         /*We don't bother setting eof because we won't check for it after we've
            started decoding DCT tokens.*/
         if(ptr>=stop){
-          shift=(unsigned)-OC_LOTS_OF_BITS;
+          shift=-OC_LOTS_OF_BITS;
           break;
         }
         shift-=8;

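The unpacker now owns its failure cleanup: where the old code returned early and (per the deleted doc-comment lines) left the caller to free a partially filled _nodes table, every error path now breaks out and the final while(i-->0) sweep frees exactly the trees already built. The same idiom in self-contained form, with invented names:

#include <stdlib.h>

/*Sketch of the cleanup idiom adopted above: construct an array of
   allocations and, on any failure, release only the entries that were
   completed before the loop stopped.*/
static int build_all(void *_slots[],int _n,size_t _sz){
  int ret;
  int i;
  ret=0;
  for(i=0;i<_n;i++){
    _slots[i]=malloc(_sz);
    if(_slots[i]==NULL){
      ret=-1;
      break;
    }
  }
  /*On failure i indexes the slot that failed; all earlier ones succeeded.*/
  if(ret<0)while(i-->0)free(_slots[i]);
  return ret;
}
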
+ 4 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/huffenc.c

@@ -918,13 +918,16 @@ int oc_huff_codes_pack(oggpack_buffer *_opb,
    codebooks.*/
 int oc_huff_codes_unpack(oc_pack_buf *_opb,
  th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  int ret;
   int i;
+  ret=0;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
     ogg_uint32_t code;
     int          len;
+    int          ntokens;
     int          nleaves;
     code=0;
-    len=nleaves=0;
+    len=ntokens=nleaves=0;
     memset(_codes[i],0,TH_NDCT_TOKENS*sizeof(*_codes[i]));
     for(;;){
       long bits;

+ 1 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/huffman.h

@@ -16,7 +16,7 @@
  ********************************************************************/
 
 #if !defined(_huffman_H)
-# define _huffman_H (1)
+# define _hufffman_H (1)
 # include "theora/codec.h"
 # include "ocintrin.h"
 

+ 5 - 6
love/src/jni/libtheora-1.2.0alpha1/lib/idct.c

@@ -241,8 +241,8 @@ static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
-  /*Clear input data for next block.*/
-  _x[0]=_x[1]=_x[8]=0;
+  /*Clear input data for next block (decoder only).*/
+  if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
@@ -272,8 +272,8 @@ static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
-  /*Clear input data for next block.*/
-  _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
+  /*Clear input data for next block (decoder only).*/
+  if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
@@ -291,8 +291,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   for(i=0;i<8;i++)idct8(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
-  /*Clear input data for next block.*/
-  for(i=0;i<64;i++)_x[i]=0;
+  if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.

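All three pruned IDCT variants gain the same guard: consumed input coefficients are zeroed for the next block only when _x and _y are distinct. The decoder passes a shared coefficient buffer that must come back cleared, while the encoder reconstructs in place (_x==_y) and would otherwise wipe the result it just produced. Schematically, with the transform itself elided:

/*Sketch of the aliasing guard above: re-zero a caller-owned coefficient
   buffer after consuming it, unless the transform ran in place.*/
static void idct_sketch(short _y[64],short _x[64]){
  int i;
  for(i=0;i<64;i++)_y[i]=_x[i];/*...the real inverse transform goes here...*/
  /*Decoder: _x is the shared coefficient buffer; hand it back zeroed.
    Encoder: _x==_y, so clearing would destroy the output.*/
  if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
}
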
+ 4 - 4
love/src/jni/libtheora-1.2.0alpha1/lib/info.c

@@ -54,7 +54,7 @@ void th_comment_init(th_comment *_tc){
   memset(_tc,0,sizeof(*_tc));
 }
 
-void th_comment_add(th_comment *_tc,const char *_comment){
+void th_comment_add(th_comment *_tc,char *_comment){
   char **user_comments;
   int   *comment_lengths;
   int    comment_len;
@@ -75,7 +75,7 @@ void th_comment_add(th_comment *_tc,const char *_comment){
   _tc->user_comments[_tc->comments]=NULL;
 }
 
-void th_comment_add_tag(th_comment *_tc,const char *_tag,const char *_val){
+void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){
   char *comment;
   int   tag_len;
   int   val_len;
@@ -91,7 +91,7 @@ void th_comment_add_tag(th_comment *_tc,const char *_tag,const char *_val){
   _ogg_free(comment);
 }
 
-char *th_comment_query(th_comment *_tc,const char *_tag,int _count){
+char *th_comment_query(th_comment *_tc,char *_tag,int _count){
   long i;
   int  found;
   int  tag_len;
@@ -107,7 +107,7 @@ char *th_comment_query(th_comment *_tc,const char *_tag,int _count){
   return NULL;
 }
 
-int th_comment_query_count(th_comment *_tc,const char *_tag){
+int th_comment_query_count(th_comment *_tc,char *_tag){
   long i;
   int  tag_len;
   int  count;

+ 3 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/internal.c

@@ -99,7 +99,7 @@ int oc_ilog(unsigned _v){
 
 void *oc_aligned_malloc(size_t _sz,size_t _align){
   unsigned char *p;
-  if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
+  if(_align>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
   p=(unsigned char *)_ogg_malloc(_sz+_align);
   if(p!=NULL){
     int offs;
@@ -131,6 +131,7 @@ void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
   datsz=rowsz*_height;
   /*Alloc array and row pointers.*/
   ret=(char *)_ogg_malloc(datsz+colsz);
+  if(ret==NULL)return NULL;
   /*Initialize the array.*/
   if(ret!=NULL){
     size_t   i;
@@ -153,6 +154,7 @@ void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){
   datsz=rowsz*_height;
   /*Alloc array and row pointers.*/
   ret=(char *)_ogg_calloc(datsz+colsz,1);
+  if(ret==NULL)return NULL;
   /*Initialize the array.*/
   if(ret!=NULL){
     size_t   i;

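oc_aligned_malloc, whose argument check is the one-line change above, follows the classic over-allocate-and-stash scheme: allocate _sz+_align bytes, advance to the next aligned address, and store the adjustment in the byte just before the returned pointer so the matching free can recover the base allocation. A sketch under the same constraints (power-of-two alignment no larger than UCHAR_MAX); the exact offset arithmetic differs from the library's:

#include <stdint.h>
#include <stdlib.h>

static void *aligned_malloc(size_t _sz,size_t _align){
  unsigned char *p;
  size_t         offs;
  p=(unsigned char *)malloc(_sz+_align);
  if(p==NULL)return NULL;
  /*1<=offs<=_align, so the skipped gap always has room for one byte.*/
  offs=_align-((uintptr_t)p&_align-1);
  p+=offs;
  p[-1]=(unsigned char)offs;
  return p;
}

static void aligned_free(void *_ptr){
  unsigned char *p;
  p=(unsigned char *)_ptr;
  /*Recover the original malloc pointer from the stashed adjustment.*/
  if(p!=NULL)free(p-p[-1]);
}
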
+ 5 - 12
love/src/jni/libtheora-1.2.0alpha1/lib/internal.h

@@ -25,15 +25,6 @@
 # include "theora/theora.h"
 # include "ocintrin.h"
 
-# if !defined(__GNUC_PREREQ)
-#  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
-#   define __GNUC_PREREQ(_maj,_min) \
- ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
-#  else
-#   define __GNUC_PREREQ(_maj,_min) 0
-#  endif
-# endif
-
 # if defined(_MSC_VER)
 /*Disable missing EMMS warnings.*/
 #  pragma warning(disable:4799)
@@ -41,8 +32,10 @@
 #  pragma warning(disable:4554)
 # endif
 /*You, too, gcc.*/
-# if __GNUC_PREREQ(4,2)
-#  pragma GCC diagnostic ignored "-Wparentheses"
+# if defined(__GNUC_PREREQ)
+#  if __GNUC_PREREQ(4,2)
+#   pragma GCC diagnostic ignored "-Wparentheses"
+#  endif
 # endif
 
 /*Some assembly constructs require aligned operands.
@@ -72,7 +65,7 @@
 
 
 /*This library's version.*/
-# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)"
+# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100923 (Ptalarbvorm)"
 
 /*Theora bitstream version.*/
 # define TH_VERSION_MAJOR (3)

+ 1 - 1
love/src/jni/libtheora-1.2.0alpha1/lib/mathops.c

@@ -1,5 +1,5 @@
-#include "internal.h"
 #include "mathops.h"
+#include <limits.h>
 
 /*The fastest fallback strategy for platforms with fast multiplication appears
    to be based on de Bruijn sequences~\cite{LP98}.

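For readers following the comment above: the de Bruijn fallback smears the highest set bit into all lower positions, multiplies by a de Bruijn constant so the top five bits enumerate each bit position exactly once, and finishes with a 32-entry table lookup. A sketch using the widely published constant and table, which are not necessarily the exact values this file uses:

#include <stdint.h>

static const unsigned char ILOG_TABLE[32]={
   0, 9, 1,10,13,21, 2,29,11,14,16,18,22,25, 3,30,
   8,12,20,28,15,17,24, 7,19,27,23, 6,26, 5, 4,31
};

/*ilog32(v): 0 for v==0, floor(log2(v))+1 otherwise, via the de Bruijn
   multiply-and-lookup the comment above refers to.*/
static int ilog32(uint32_t v){
  /*Smear the highest set bit into every lower position.*/
  v|=v>>1;v|=v>>2;v|=v>>4;v|=v>>8;v|=v>>16;
  return v?ILOG_TABLE[(uint32_t)(v*0x07C4ACDDU)>>27]+1:0;
}
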
+ 21 - 19
love/src/jni/libtheora-1.2.0alpha1/lib/mathops.h

@@ -2,27 +2,29 @@
 # define _mathops_H (1)
 # include <ogg/ogg.h>
 
-# if __GNUC_PREREQ(3,4)
-#  include <limits.h>
+# ifdef __GNUC_PREREQ
+#  if __GNUC_PREREQ(3,4)
+#   include <limits.h>
 /*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
    "upgrading" the type of an entire expression to an (unsigned) size_t.*/
-#  if INT_MAX>=2147483647
-#   define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
-#   define OC_CLZ32(_x) (__builtin_clz(_x))
-#  elif LONG_MAX>=2147483647L
-#   define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
-#   define OC_CLZ32(_x) (__builtin_clzl(_x))
-#  endif
-#  if INT_MAX>=9223372036854775807LL
-#   define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
-#   define OC_CLZ64(_x) (__builtin_clz(_x))
-#  elif LONG_MAX>=9223372036854775807LL
-#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
-#   define OC_CLZ64(_x) (__builtin_clzl(_x))
-#  elif LLONG_MAX>=9223372036854775807LL|| \
-    __LONG_LONG_MAX__>=9223372036854775807LL
-#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
-#   define OC_CLZ64(_x) (__builtin_clzll(_x))
+#   if INT_MAX>=2147483647
+#    define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#    define OC_CLZ32(_x) (__builtin_clz(_x))
+#   elif LONG_MAX>=2147483647L
+#    define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#    define OC_CLZ32(_x) (__builtin_clzl(_x))
+#   endif
+#   if INT_MAX>=9223372036854775807LL
+#    define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#    define OC_CLZ64(_x) (__builtin_clz(_x))
+#   elif LONG_MAX>=9223372036854775807LL
+#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#    define OC_CLZ64(_x) (__builtin_clzl(_x))
+#   elif LLONG_MAX>=9223372036854775807LL|| \
+     __LONG_LONG_MAX__>=9223372036854775807LL
+#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
+#    define OC_CLZ64(_x) (__builtin_clzll(_x))
+#   endif
 #  endif
 # endif
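
The extra #ifdef layer exists because this snapshot leaves __GNUC_PREREQ to the C library, and toolchains that lack it (Android's Bionic presumably being the case that matters here) would otherwise trip over the bare #if; now the CLZ block simply drops out and the fallback path takes over. Once OC_CLZ32 is available, an integer log is just the offset minus the zero count, with v==0 special-cased because __builtin_clz(0) is undefined. A minimal GCC-flavored sketch (name invented):

#include <limits.h>

static int ilog32_clz(unsigned v){
  return v?(int)sizeof(unsigned)*CHAR_BIT-__builtin_clz(v):0;
}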
 

+ 53 - 68
love/src/jni/libtheora-1.2.0alpha1/lib/mcenc.c

@@ -88,7 +88,7 @@ static const int OC_SQUARE_SITES[11][8]={
 };
 
 
-static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
+static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
  oc_mv _accum,int _mbi,int _frame){
   oc_mb_enc_info *embs;
   int             accum_x;
@@ -115,8 +115,8 @@ static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
   accum_y=OC_MV_Y(_accum);
   /*Add a few additional vectors to set A: the vectors used in the previous
      frames and the (0,0) vector.*/
-  _mcenc->candidates[ncandidates][0]=accum_x;
-  _mcenc->candidates[ncandidates][1]=accum_y;
+  _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,accum_x,31);
+  _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,accum_y,31);
   ncandidates++;
   _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
    OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31);
@@ -137,33 +137,30 @@ static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
   OC_SORT2I(a[0][1],a[1][1]);
   _mcenc->candidates[0][0]=a[1][0];
   _mcenc->candidates[0][1]=a[1][1];
-  _mcenc->setb0=ncandidates;
-}
-
-static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
- oc_mv _accum,int _mbi,int _frame){
-  oc_mb_enc_info *embs;
-  int             accum_x;
-  int             accum_y;
-  int             ncandidates;
-  embs=_enc->mb_info;
-  accum_x=OC_MV_X(_accum);
-  accum_y=OC_MV_Y(_accum);
   /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/
-  ncandidates=_mcenc->setb0;
-  /*Use only the current block. Using more did not appear to be helpful
-    with the current selection logic due to escaping the local search too
-    quickly.*/
-  _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
-   2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame])
-   -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31);
-  _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
-   2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])
-   -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31);
-  ncandidates++;
+  _mcenc->setb0=ncandidates;
+  /*The first time through the loop use the current macro block.*/
+  nmbi=_mbi;
+  for(i=0;;i++){
+    _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
+     2*OC_MV_X(embs[nmbi].analysis_mv[1][_frame])
+     -OC_MV_X(embs[nmbi].analysis_mv[2][_frame])+accum_x,31);
+    _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
+     2*OC_MV_Y(embs[nmbi].analysis_mv[1][_frame])
+     -OC_MV_Y(embs[nmbi].analysis_mv[2][_frame])+accum_y,31);
+    ncandidates++;
+    if(i>=embs[_mbi].npneighbors)break;
+    nmbi=embs[_mbi].pneighbors[i];
+  }
+  /*Truncate to full-pel positions.*/
+  for(i=0;i<ncandidates;i++){
+    _mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]);
+    _mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]);
+  }
   _mcenc->ncandidates=ncandidates;
 }
 
+#if 0
 static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
  const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
  int _mvoffset0,int _mvoffset1,const unsigned char *_src,
@@ -179,13 +176,14 @@ static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
   }
   return err;
 }
+#endif
 
 static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
  const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
  int _mvoffset0,int _mvoffset1,const unsigned char *_src,
  const unsigned char *_ref,int _ystride,unsigned _best_err){
   unsigned err;
-  int      dc;
+  unsigned dc;
   int      bi;
   err=0;
   for(bi=0;bi<4;bi++){
@@ -193,7 +191,7 @@ static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
     frag_offs=_frag_buf_offs[_fragis[bi]];
     err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs,
      _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride);
-    err+=abs(dc);
+    err+=dc;
   }
   return err;
 }
@@ -229,17 +227,11 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
   err=0;
   for(bi=0;bi<4;bi++){
     ptrdiff_t frag_offs;
-    int       dc;
+    unsigned  dc;
     frag_offs=_frag_buf_offs[_fragis[bi]];
-    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-      err+=oc_enc_frag_satd(_enc,&dc,
-       _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
-      err+=abs(dc);
-    }
-    else{
-      err+=oc_enc_frag_sad(_enc,
-       _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
-    }
+    err+=oc_enc_frag_satd(_enc,&dc,
+     _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
+    err+=dc;
   }
   return err;
 }
@@ -248,10 +240,10 @@ static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
  ptrdiff_t _frag_offs,int _dx,int _dy,
  const unsigned char *_src,const unsigned char *_ref,int _ystride){
   unsigned err;
-  int      dc;
+  unsigned dc;
   err=oc_enc_frag_satd(_enc,&dc,
    _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride);
-  return err+abs(dc);
+  return err+dc;
 }
 
 /*Perform a motion vector search for this macro block against a single
@@ -303,18 +295,18 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
   int                  bi;
   embs=_enc->mb_info;
   /*Find some candidate motion vectors.*/
-  oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame);
+  oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame);
   /*Clear the cache of locations we've examined.*/
   memset(hit_cache,0,sizeof(hit_cache));
   /*Start with the median predictor.*/
-  candx=OC_DIV2(mcenc.candidates[0][0]);
-  candy=OC_DIV2(mcenc.candidates[0][1]);
+  candx=mcenc.candidates[0][0];
+  candy=mcenc.candidates[0][1];
   hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_frame_full];
-  satd_ref=_enc->state.ref_frame_data[_frame];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame_full]];
+  satd_ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
   ystride=_enc->state.ref_ystride[0];
   /*TODO: customize error function for speed/(quality+size) tradeoff.*/
   best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
@@ -343,8 +335,8 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
     t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
     /*Examine the candidates in set A.*/
     for(ci=1;ci<mcenc.setb0;ci++){
-      candx=OC_DIV2(mcenc.candidates[ci][0]);
-      candy=OC_DIV2(mcenc.candidates[ci][1]);
+      candx=mcenc.candidates[ci][0];
+      candy=mcenc.candidates[ci][1];
       /*If we've already examined this vector, then we would be using it if it
          was better than what we are using.*/
       hitbit=(ogg_int32_t)1<<candx+15;
@@ -366,11 +358,10 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
       }
     }
     if(best_err>t2){
-      oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame);
       /*Examine the candidates in set B.*/
       for(;ci<mcenc.ncandidates;ci++){
-        candx=OC_DIV2(mcenc.candidates[ci][0]);
-        candy=OC_DIV2(mcenc.candidates[ci][1]);
+        candx=mcenc.candidates[ci][0];
+        candy=mcenc.candidates[ci][1];
         hitbit=(ogg_int32_t)1<<candx+15;
         if(hit_cache[candy+15]&hitbit)continue;
         hit_cache[candy+15]|=hitbit;
@@ -504,7 +495,7 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
   embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
    frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride);
   embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1);
-  if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+  if(_frame==OC_FRAME_PREV){
     for(bi=0;bi<4;bi++){
       candx=best_block_vec[bi][0];
       candy=best_block_vec[bi][1];
@@ -561,8 +552,8 @@ static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
   int                  best_site;
   int                  sitei;
   int                  err;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_framei];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_framei]];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   ystride=_enc->state.ref_ystride[0];
@@ -616,8 +607,8 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
   int                  best_site;
   int                  sitei;
   int                  err;
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[_frame];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   ystride=_enc->state.ref_ystride[0];
@@ -646,14 +637,8 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
     ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
     mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
     mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
-      err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
-       mvoffset0,mvoffset1,src,ref,ystride,_best_err);
-    }
-    else{
-      err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
-           mvoffset0,mvoffset1,src,ref,ystride,_best_err);
-    }
+    err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
+     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
     if(err<_best_err){
       _best_err=err;
       best_site=site;
@@ -728,7 +713,7 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
   best_site=4;
   for(sitei=0;sitei<8;sitei++){
     unsigned err;
-    int      dc;
+    unsigned dc;
     int      site;
     int      xmask;
     int      ymask;
@@ -750,7 +735,7 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
     mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
     err=oc_enc_frag_satd2(_enc,&dc,_src,
      _ref+mvoffset0,_ref+mvoffset1,_ystride);
-    err+=abs(dc);
+    err+=dc;
     if(err<_best_err){
       _best_err=err;
       best_site=site;
@@ -773,8 +758,8 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
-  src=_enc->state.ref_frame_data[OC_FRAME_IO];
-  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
   offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
   offset_y[3]=offset_y[5]=0;
   offset_y[6]=offset_y[7]=offset_y[8]=ystride;

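Since the merged candidate finder truncates every candidate to full-pel positions up front, the search loops shed their OC_DIV2 calls, and candx/candy stay inside the 31x31 window implied by the +15 offsets above. That is what makes the hit-cache trick sound: one 32-bit mask per row of the window. The dedup step in isolation, with an invented name and int32_t standing in for ogg_int32_t:

#include <stdint.h>

/*Sketch of the hit-cache dedup above: a 31x31 visited set for full-pel
   vectors in [-15,15], one row mask per y component.*/
static int check_and_mark(int32_t _hit_cache[31],int _candx,int _candy){
  int32_t hitbit;
  hitbit=(int32_t)1<<_candx+15;/*Binds as 1<<(_candx+15).*/
  if(_hit_cache[_candy+15]&hitbit)return 1;/*Already examined.*/
  _hit_cache[_candy+15]|=hitbit;
  return 0;
}
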
+ 1 - 503
love/src/jni/libtheora-1.2.0alpha1/lib/modedec.h

@@ -26,7 +26,7 @@ ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={
 # if !defined(OC_COLLECT_METRICS)
 static const
 # endif
-oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={
+oc_mode_rd OC_MODE_RD[OC_LOGQ_BINS][3][2][OC_SAD_BINS]={
   {
     {
       /*Y'  qi=0  INTRA*/
@@ -525,506 +525,4 @@ oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={
   }
 };
 
-# if !defined(OC_COLLECT_METRICS)
-static const
-# endif
-oc_mode_rd OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={
-  {
-    {
-      /*Y'  qi=0  INTRA*/
-      {
-        {   33,  122},{   57, 1297},{   13, 2226},{  157, 3890},
-        {  227, 3682},{  169, 3084},{  197, 2700},{  227, 3238},
-        {  290, 4294},{  354, 5230},{  406, 5615},{  417, 5322},
-        {  452, 5462},{  455, 5683},{  493, 5938},{  553, 6374},
-        {  558, 6464},{  606, 6493},{  616, 6417},{  643, 6557},
-        {  641, 6664},{  716, 7285},{  748, 7518},{  747, 7502}
-      },
-      /*Y'  qi=0  INTER*/
-      {
-        {   16,  205},{    5, 1338},{   16, 2554},{    6, 3809},
-        {    9, 5188},{   58, 6446},{   76, 7561},{   95, 8648},
-        {  124, 9713},{  158,10787},{  193,11887},{  233,12991},
-        {  270,14116},{  307,15236},{  341,16346},{  372,17426},
-        {  398,18499},{  422,19594},{  448,20669},{  479,21732},
-        {  526,22720},{  583,23572},{  655,24516},{  758,24647}
-      }
-    },
-    {
-      /*Cb  qi=0  INTRA*/
-      {
-        {   26,   40},{   23,  589},{   27,  784},{   27, 1079},
-        {   24, 1186},{   25, 1641},{   25, 1915},{   29, 2207},
-        {   39, 2361},{   39, 2746},{   32, 3020},{   16, 3387},
-        {   31, 3604},{   36, 4076},{   69, 4426},{  102, 4724},
-        {  139, 4923},{  196, 5061},{  211, 5103},{  214, 5063},
-        {  161, 4466},{  208, 4793},{  218, 4537},{  219, 4539}
-      },
-      /*Cb  qi=0  INTER*/
-      {
-        {    3,  164},{    1,  535},{    1,  779},{    2, 1048},
-        {    3, 1267},{    1, 1625},{    2, 1921},{    5, 2224},
-        {    8, 2481},{    8, 2813},{    4, 3089},{   -2, 3386},
-        {   -9, 3642},{  -14, 3993},{  -11, 4300},{   -6, 4628},
-        {    4, 4929},{   25, 5299},{   44, 5623},{   83, 5915},
-        {   93, 6186},{   91, 6483},{   90, 6775},{   95, 6952}
-      }
-    },
-    {
-      /*Cr  qi=0  INTRA*/
-      {
-        {   22,   49},{   26,  579},{   23,  762},{   15, 1050},
-        {   20, 1191},{   24, 1608},{   26, 1875},{   35, 2173},
-        {   39, 2359},{   30, 2736},{   16, 2987},{    0, 3334},
-        {   14, 3625},{   11, 4095},{   57, 4512},{   95, 4793},
-        {  141, 4949},{  206, 5242},{  230, 5191},{  242, 5177},
-        {  178, 4775},{  237, 5010},{  223, 4656},{  224, 4657}
-      },
-      /*Cr  qi=0  INTER*/
-      {
-        {    3,  163},{    1,  536},{    1,  773},{    3, 1023},
-        {    2, 1225},{    1, 1607},{    1, 1900},{    5, 2204},
-        {    9, 2453},{    8, 2781},{    3, 3049},{   -5, 3338},
-        {  -13, 3570},{  -17, 3950},{  -13, 4255},{   -6, 4596},
-        {    7, 4893},{   33, 5300},{   53, 5632},{   97, 5942},
-        {  103, 6216},{   96, 6522},{   91, 6849},{   98, 6995}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=9  INTRA*/
-      {
-        {   47,  152},{   50, 1213},{  144, 2543},{  242, 2332},
-        {  210, 1894},{  250, 2386},{  328, 3094},{  407, 3419},
-        {  464, 3507},{  522, 3770},{  613, 4194},{  657, 4618},
-        {  753, 5137},{  796, 5248},{  842, 5110},{  927, 5330},
-        {  994, 5487},{ 1008, 5463},{ 1101, 5794},{ 1169, 5966},
-        { 1208, 6121},{ 1331, 6447},{ 1445, 6618},{ 1449, 6616}
-      },
-      /*Y'  qi=9  INTER*/
-      {
-        {    4,  218},{   16, 1314},{    4, 2563},{   37, 3882},
-        {   83, 5058},{  109, 6184},{  161, 7292},{  224, 8389},
-        {  287, 9485},{  349,10565},{  411,11608},{  464,12648},
-        {  518,13664},{  575,14650},{  649,15585},{  742,16451},
-        {  862,17214},{ 1003,17860},{ 1179,18325},{ 1372,18648},
-        { 1576,18878},{ 1795,18903},{ 2040,18880},{ 2116,18759}
-      }
-    },
-    {
-      /*Cb  qi=9  INTRA*/
-      {
-        {   27,   42},{   23,  587},{   34,  782},{   37, 1079},
-        {   34, 1204},{   42, 1630},{   37, 1887},{   25, 2210},
-        {   40, 2455},{   71, 2880},{  112, 3193},{  156, 3427},
-        {  168, 3403},{  217, 3488},{  203, 3335},{  224, 3200},
-        {  191, 2742},{  195, 2810},{  207, 2665},{  201, 2661},
-        {  169, 2078},{  211, 2720},{  226, 2813},{  228, 2824}
-      },
-      /*Cb  qi=9  INTER*/
-      {
-        {    4,  158},{    2,  537},{    3,  779},{    2, 1045},
-        {    3, 1284},{    7, 1629},{    7, 1917},{    1, 2218},
-        {   -4, 2497},{   -3, 2845},{    6, 3162},{   23, 3482},
-        {   42, 3788},{   62, 4116},{   76, 4416},{   84, 4700},
-        {   91, 4975},{   95, 5259},{   97, 5518},{   94, 5790},
-        {   99, 6052},{  111, 6311},{  126, 6601},{  136, 6719}
-      }
-    },
-    {
-      /*Cr  qi=9  INTRA*/
-      {
-        {   25,   50},{   32,  576},{   32,  762},{   21, 1049},
-        {   28, 1207},{   41, 1603},{   36, 1839},{   26, 2170},
-        {   34, 2462},{   59, 2872},{  109, 3176},{  157, 3364},
-        {  188, 3397},{  231, 3418},{  250, 3341},{  261, 3228},
-        {  222, 2814},{  258, 3091},{  234, 2915},{  228, 3042},
-        {  210, 2610},{  273, 3210},{  274, 3231},{  276, 3239}
-      },
-      /*Cr  qi=9  INTER*/
-      {
-        {    4,  156},{    2,  538},{    3,  772},{    2, 1028},
-        {    3, 1254},{    7, 1613},{    7, 1893},{    0, 2191},
-        {   -8, 2454},{   -4, 2811},{    7, 3121},{   27, 3442},
-        {   48, 3749},{   72, 4101},{   88, 4410},{   91, 4698},
-        {   99, 4988},{   99, 5279},{  101, 5542},{   95, 5813},
-        {   99, 6088},{  114, 6367},{  125, 6683},{  137, 6761}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=18  INTRA*/
-      {
-        {   51,   88},{   88, 1344},{  258, 1643},{  228, 1325},
-        {  372, 2208},{  443, 2371},{  520, 2382},{  584, 2477},
-        {  739, 2906},{  859, 3348},{ 1008, 3697},{ 1131, 3884},
-        { 1278, 4110},{ 1349, 4229},{ 1431, 4329},{ 1544, 4395},
-        { 1602, 4439},{ 1669, 4535},{ 1814, 4656},{ 1883, 4716},
-        { 1957, 4940},{ 2101, 5019},{ 2259, 5249},{ 2265, 5246}
-      },
-      /*Y'  qi=18  INTER*/
-      {
-        {   26,  195},{    1, 1317},{   45, 2595},{  103, 3750},
-        {  168, 4903},{  281, 6007},{  397, 7062},{  513, 8064},
-        {  630, 9010},{  758, 9902},{  906,10732},{ 1095,11463},
-        { 1338,12060},{ 1629,12490},{ 1969,12724},{ 2313,12842},
-        { 2666,12828},{ 2993,12747},{ 3294,12670},{ 3558,12553},
-        { 3813,12440},{ 3990,12379},{ 4177,12291},{ 4226,12265}
-      }
-    },
-    {
-      /*Cb  qi=18  INTRA*/
-      {
-        {   31,   43},{   33,  585},{   40,  781},{   58, 1077},
-        {   45, 1189},{   58, 1655},{   66, 1983},{  123, 2221},
-        {  168, 2193},{  227, 2321},{  241, 2246},{  250, 2208},
-        {  221, 1786},{  250, 2087},{  247, 2036},{  250, 2164},
-        {  241, 2054},{  287, 2453},{  302, 2551},{  335, 2758},
-        {  279, 2511},{  379, 2973},{  404, 3028},{  406, 3029}
-      },
-      /*Cb  qi=18  INTER*/
-      {
-        {    7,  153},{    4,  537},{    3,  777},{    9, 1034},
-        {    6, 1282},{    0, 1630},{    0, 1943},{   21, 2252},
-        {   48, 2567},{   67, 2881},{   83, 3178},{   89, 3463},
-        {   92, 3738},{   99, 4024},{  114, 4289},{  131, 4552},
-        {  153, 4814},{  179, 5081},{  207, 5333},{  241, 5581},
-        {  273, 5822},{  303, 6068},{  335, 6368},{  353, 6432}
-      }
-    },
-    {
-      /*Cr  qi=18  INTRA*/
-      {
-        {   31,   49},{   42,  575},{   42,  763},{   38, 1045},
-        {   41, 1184},{   56, 1631},{   87, 1968},{  163, 2177},
-        {  191, 2188},{  236, 2264},{  240, 2101},{  234, 2047},
-        {  206, 1651},{  222, 1966},{  238, 2013},{  240, 2176},
-        {  229, 2098},{  321, 2592},{  341, 2748},{  378, 3025},
-        {  367, 2849},{  442, 3283},{  453, 3315},{  455, 3313}
-      },
-      /*Cr  qi=18  INTER*/
-      {
-        {    6,  151},{    3,  539},{    3,  775},{    8, 1027},
-        {    6, 1260},{   -3, 1619},{    0, 1927},{   24, 2238},
-        {   58, 2558},{   76, 2871},{   92, 3173},{   96, 3461},
-        {   98, 3742},{  104, 4032},{  116, 4306},{  136, 4578},
-        {  158, 4839},{  185, 5123},{  217, 5383},{  250, 5642},
-        {  279, 5910},{  306, 6169},{  333, 6502},{  350, 6522}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=27  INTRA*/
-      {
-        {   10,   85},{  280, 1349},{  278,  815},{  497, 1699},
-        {  600, 1569},{  744, 1944},{  894, 2114},{ 1040, 2292},
-        { 1216, 2484},{ 1485, 2816},{ 1778, 3065},{ 1990, 3243},
-        { 2199, 3381},{ 2326, 3515},{ 2370, 3422},{ 2512, 3581},
-        { 2548, 3526},{ 2656, 3615},{ 2803, 3679},{ 2946, 3766},
-        { 3023, 3824},{ 3179, 3908},{ 3374, 4035},{ 3377, 4030}
-      },
-      /*Y'  qi=27  INTER*/
-      {
-        {   -2,  172},{   31, 1347},{  117, 2488},{  245, 3651},
-        {  448, 4719},{  668, 5679},{  918, 6524},{ 1204, 7255},
-        { 1557, 7848},{ 1998, 8281},{ 2511, 8531},{ 3055, 8642},
-        { 3582, 8648},{ 4062, 8611},{ 4482, 8582},{ 4845, 8560},
-        { 5140, 8560},{ 5423, 8581},{ 5645, 8596},{ 5855, 8586},
-        { 6061, 8608},{ 6211, 8558},{ 6402, 8583},{ 6472, 8575}
-      }
-    },
-    {
-      /*Cb  qi=27  INTRA*/
-      {
-        {   47,   49},{   35,  580},{   64,  778},{   69, 1071},
-        {   98, 1289},{  186, 1556},{  177, 1654},{  197, 1736},
-        {  211, 1373},{  284, 1742},{  321, 1840},{  344, 2024},
-        {  321, 1969},{  386, 2254},{  397, 2281},{  425, 2320},
-        {  396, 2088},{  448, 2284},{  462, 2213},{  482, 2274},
-        {  410, 1894},{  513, 2310},{  546, 2332},{  549, 2334}
-      },
-      /*Cb  qi=27  INTER*/
-      {
-        {   11,  145},{    5,  539},{   11,  771},{    0, 1033},
-        {    9, 1334},{   44, 1644},{   70, 1934},{   87, 2227},
-        {   96, 2508},{  113, 2812},{  139, 3085},{  174, 3352},
-        {  216, 3614},{  261, 3873},{  305, 4123},{  349, 4372},
-        {  396, 4611},{  442, 4853},{  493, 5088},{  543, 5313},
-        {  600, 5537},{  662, 5752},{  737, 6018},{  775, 6037}
-      }
-    },
-    {
-      /*Cr  qi=27  INTRA*/
-      {
-        {   49,   52},{   57,  570},{   61,  762},{   44, 1048},
-        {   80, 1291},{  196, 1513},{  224, 1522},{  242, 1532},
-        {  213, 1293},{  260, 1639},{  253, 1691},{  291, 1915},
-        {  294, 1897},{  367, 2178},{  395, 2258},{  432, 2310},
-        {  407, 2105},{  503, 2369},{  492, 2293},{  552, 2421},
-        {  496, 2099},{  598, 2549},{  624, 2531},{  627, 2532}
-      },
-      /*Cr  qi=27  INTER*/
-      {
-        {   10,  147},{    4,  538},{   11,  769},{    0, 1022},
-        {    9, 1318},{   51, 1635},{   80, 1925},{   97, 2214},
-        {  101, 2493},{  115, 2805},{  143, 3083},{  182, 3361},
-        {  226, 3625},{  270, 3898},{  319, 4157},{  366, 4405},
-        {  418, 4649},{  467, 4904},{  509, 5157},{  548, 5412},
-        {  589, 5659},{  636, 5909},{  683, 6208},{  710, 6190}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=36  INTRA*/
-      {
-        {   86,  252},{  345,  662},{  476, 1143},{  698, 1169},
-        {  894, 1457},{ 1218, 1728},{ 1465, 1849},{ 1731, 2019},
-        { 2183, 2298},{ 2666, 2511},{ 3116, 2731},{ 3371, 2813},
-        { 3621, 2923},{ 3675, 2949},{ 3710, 2921},{ 3740, 2896},
-        { 3746, 2895},{ 3886, 2978},{ 4069, 2991},{ 4229, 3016},
-        { 4338, 3102},{ 4530, 3124},{ 4751, 3248},{ 4753, 3244}
-      },
-      /*Y'  qi=36  INTER*/
-      {
-        {    0,  208},{   73, 1293},{  248, 2449},{  616, 3461},
-        { 1061, 4329},{ 1601, 4986},{ 2189, 5447},{ 2875, 5723},
-        { 3620, 5844},{ 4328, 5879},{ 4954, 5880},{ 5490, 5890},
-        { 5934, 5901},{ 6353, 5926},{ 6706, 5924},{ 7036, 5930},
-        { 7338, 5938},{ 7600, 5930},{ 7870, 5939},{ 8065, 5921},
-        { 8318, 5914},{ 8451, 5912},{ 8648, 5923},{ 8734, 5926}
-      }
-    },
-    {
-      /*Cb  qi=36  INTRA*/
-      {
-        {   52,   54},{   52,  575},{  103,  776},{  185, 1072},
-        {  172, 1069},{  211, 1302},{  217, 1413},{  285, 1586},
-        {  330, 1463},{  453, 1694},{  500, 1741},{  545, 1852},
-        {  501, 1650},{  584, 1874},{  587, 1856},{  638, 1919},
-        {  581, 1742},{  670, 1953},{  688, 1934},{  731, 2030},
-        {  637, 1794},{  806, 2123},{  840, 2091},{  843, 2091}
-      },
-      /*Cb  qi=36  INTER*/
-      {
-        {   19,  142},{   17,  534},{    6,  772},{   44, 1023},
-        {   82, 1296},{   94, 1614},{  117, 1903},{  158, 2187},
-        {  218, 2450},{  285, 2703},{  352, 2943},{  421, 3181},
-        {  489, 3415},{  564, 3644},{  647, 3861},{  748, 4060},
-        {  861, 4246},{  993, 4419},{ 1132, 4576},{ 1282, 4744},
-        { 1445, 4894},{ 1600, 5034},{ 1782, 5211},{ 1837, 5200}
-      }
-    },
-    {
-      /*Cr  qi=36  INTRA*/
-      {
-        {   62,   55},{   90,  561},{   56,  767},{  148, 1014},
-        {  207,  981},{  258, 1216},{  273, 1253},{  326, 1392},
-        {  338, 1383},{  417, 1613},{  443, 1629},{  497, 1734},
-        {  466, 1525},{  561, 1778},{  577, 1787},{  631, 1892},
-        {  591, 1706},{  715, 1980},{  730, 1958},{  822, 2113},
-        {  755, 1935},{  928, 2228},{  935, 2205},{  938, 2205}
-      },
-      /*Cr  qi=36  INTER*/
-      {
-        {   14,  145},{   16,  535},{    5,  772},{   44, 1017},
-        {   91, 1296},{  100, 1605},{  122, 1891},{  163, 2174},
-        {  225, 2443},{  294, 2707},{  362, 2962},{  436, 3210},
-        {  518, 3437},{  607, 3664},{  702, 3876},{  795, 4094},
-        {  886, 4310},{  980, 4538},{ 1089, 4749},{ 1216, 4927},
-        { 1357, 5116},{ 1506, 5247},{ 1758, 5338},{ 1787, 5306}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=45  INTRA*/
-      {
-        {  185,  246},{  513,  647},{  883,  891},{ 1313, 1142},
-        { 1760, 1351},{ 2368, 1595},{ 2828, 1718},{ 3097, 1780},
-        { 3762, 1951},{ 4454, 2121},{ 4986, 2227},{ 5281, 2281},
-        { 5477, 2299},{ 5431, 2288},{ 5425, 2283},{ 5439, 2290},
-        { 5324, 2249},{ 5509, 2279},{ 5703, 2321},{ 5896, 2348},
-        { 6049, 2370},{ 6253, 2425},{ 6415, 2432},{ 6419, 2430}
-      },
-      /*Y'  qi=45  INTER*/
-      {
-        {    6,  215},{  152, 1261},{  691, 2314},{ 1538, 3095},
-        { 2505, 3632},{ 3475, 3935},{ 4355, 4084},{ 5209, 4139},
-        { 5985, 4162},{ 6644, 4185},{ 7235, 4190},{ 7768, 4196},
-        { 8266, 4200},{ 8736, 4210},{ 9143, 4207},{ 9511, 4215},
-        { 9828, 4209},{10112, 4224},{10374, 4226},{10642, 4232},
-        {10842, 4219},{10971, 4208},{11200, 4211},{11299, 4216}
-      }
-    },
-    {
-      /*Cb  qi=45  INTRA*/
-      {
-        {   58,   71},{   66,  548},{  155,  762},{  213,  944},
-        {  192,  731},{  324, 1147},{  401, 1366},{  481, 1480},
-        {  508, 1238},{  657, 1522},{  727, 1563},{  794, 1611},
-        {  761, 1470},{  885, 1710},{  893, 1700},{  958, 1760},
-        {  893, 1543},{  985, 1719},{ 1014, 1732},{ 1082, 1784},
-        {  963, 1519},{ 1152, 1800},{ 1221, 1830},{ 1226, 1830}
-      },
-      /*Cb  qi=45  INTER*/
-      {
-        {   35,  135},{   12,  532},{   54,  769},{  106, 1007},
-        {  127, 1258},{  198, 1565},{  289, 1832},{  398, 2082},
-        {  520, 2302},{  653, 2511},{  800, 2705},{  956, 2897},
-        { 1143, 3064},{ 1358, 3220},{ 1623, 3335},{ 1913, 3444},
-        { 2198, 3534},{ 2502, 3626},{ 2787, 3711},{ 3114, 3783},
-        { 3454, 3831},{ 3711, 3871},{ 4163, 3901},{ 4221, 3890}
-      }
-    },
-    {
-      /*Cr  qi=45  INTRA*/
-      {
-        {   93,   68},{   72,  541},{  154,  769},{  239,  848},
-        {  214,  623},{  377, 1060},{  437, 1200},{  514, 1280},
-        {  512, 1160},{  625, 1453},{  657, 1470},{  718, 1516},
-        {  692, 1331},{  831, 1617},{  875, 1609},{  944, 1678},
-        {  886, 1469},{ 1061, 1699},{ 1082, 1714},{ 1226, 1823},
-        { 1113, 1581},{ 1324, 1872},{ 1370, 1925},{ 1374, 1924}
-      },
-      /*Cr  qi=45  INTER*/
-      {
-        {   31,  140},{   13,  533},{   52,  770},{  109, 1000},
-        {  134, 1253},{  201, 1555},{  298, 1821},{  411, 2076},
-        {  525, 2314},{  659, 2545},{  828, 2747},{ 1019, 2918},
-        { 1205, 3082},{ 1405, 3266},{ 1609, 3443},{ 1847, 3606},
-        { 2085, 3730},{ 2404, 3835},{ 2709, 3876},{ 3049, 3886},
-        { 3381, 3821},{ 3708, 3780},{ 4026, 3663},{ 4043, 3646}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=54  INTRA*/
-      {
-        {  316,  203},{  720,  585},{ 1596, 1077},{ 2316, 1289},
-        { 2687, 1439},{ 3133, 1593},{ 3495, 1706},{ 3836, 1775},
-        { 4249, 1892},{ 4804, 2031},{ 5320, 2139},{ 5617, 2203},
-        { 5726, 2199},{ 5726, 2176},{ 5682, 2146},{ 5677, 2127},
-        { 5717, 2124},{ 5707, 2129},{ 5853, 2148},{ 6110, 2180},
-        { 6454, 2247},{ 6714, 2287},{ 6845, 2304},{ 6854, 2303}
-      },
-      /*Y'  qi=54  INTER*/
-      {
-        {  -48,  217},{  314, 1261},{ 1450, 2126},{ 2761, 2728},
-        { 4275, 3012},{ 5408, 3167},{ 6305, 3245},{ 7165, 3290},
-        { 7966, 3325},{ 8698, 3359},{ 9352, 3377},{ 9907, 3391},
-        {10389, 3390},{10856, 3395},{11170, 3385},{11530, 3385},
-        {11780, 3362},{12018, 3362},{12266, 3361},{12443, 3339},
-        {12683, 3342},{12713, 3317},{12967, 3325},{13082, 3332}
-      }
-    },
-    {
-      /*Cb  qi=54  INTRA*/
-      {
-        {   94,   73},{   83,  557},{  152,  818},{  304,  919},
-        {  341,  819},{  506, 1128},{  593, 1281},{  700, 1389},
-        {  714, 1225},{  907, 1502},{  981, 1549},{ 1062, 1641},
-        { 1032, 1523},{ 1170, 1710},{ 1217, 1727},{ 1258, 1714},
-        { 1216, 1575},{ 1309, 1682},{ 1331, 1656},{ 1393, 1712},
-        { 1247, 1456},{ 1469, 1728},{ 1530, 1711},{ 1532, 1711}
-      },
-      /*Cb  qi=54  INTER*/
-      {
-        {   33,  133},{   12,  532},{   70,  770},{  171,  996},
-        {  279, 1233},{  427, 1503},{  600, 1736},{  824, 1939},
-        { 1101, 2097},{ 1411, 2237},{ 1735, 2374},{ 2097, 2493},
-        { 2486, 2606},{ 2916, 2691},{ 3297, 2771},{ 3715, 2826},
-        { 4088, 2855},{ 4460, 2886},{ 4849, 2911},{ 5198, 2932},
-        { 5489, 2940},{ 5875, 2981},{ 6208, 3017},{ 6270, 3012}
-      }
-    },
-    {
-      /*Cr  qi=54  INTRA*/
-      {
-        {  103,   63},{   83,  580},{  258,  796},{  301,  802},
-        {  361,  675},{  538, 1001},{  625, 1097},{  713, 1171},
-        {  699, 1103},{  868, 1380},{  915, 1400},{  970, 1491},
-        {  923, 1365},{ 1070, 1603},{ 1154, 1655},{ 1206, 1677},
-        { 1157, 1541},{ 1366, 1736},{ 1391, 1723},{ 1506, 1797},
-        { 1388, 1556},{ 1616, 1828},{ 1655, 1797},{ 1658, 1796}
-      },
-      /*Cr  qi=54  INTER*/
-      {
-        {   30,  138},{   14,  532},{   63,  771},{  176,  990},
-        {  299, 1226},{  438, 1496},{  606, 1735},{  814, 1950},
-        { 1089, 2127},{ 1417, 2281},{ 1761, 2421},{ 2104, 2571},
-        { 2467, 2701},{ 2881, 2827},{ 3303, 2900},{ 3735, 2917},
-        { 4183, 2913},{ 4529, 2882},{ 4915, 2844},{ 5168, 2796},
-        { 5410, 2763},{ 5562, 2753},{ 5815, 2764},{ 5832, 2755}
-      }
-    }
-  },
-  {
-    {
-      /*Y'  qi=63  INTRA*/
-      {
-        {  421,  194},{ 1272,  564},{ 3016,  943},{ 3831, 1079},
-        { 4282, 1174},{ 4799, 1290},{ 5166, 1348},{ 5259, 1350},
-        { 5720, 1426},{ 6501, 1539},{ 7048, 1606},{ 7328, 1642},
-        { 7374, 1622},{ 7349, 1612},{ 7192, 1578},{ 7207, 1571},
-        { 7161, 1555},{ 7259, 1573},{ 7432, 1592},{ 7710, 1613},
-        { 8167, 1672},{ 8425, 1697},{ 8597, 1710},{ 8602, 1710}
-      },
-      /*Y'  qi=63  INTER*/
-      {
-        { -584,  286},{ 1231, 1186},{ 3939, 1663},{ 6096, 1865},
-        { 7849, 1929},{ 8934, 1995},{ 9962, 2039},{11038, 2078},
-        {12016, 2092},{12889, 2100},{13617, 2096},{14221, 2089},
-        {14743, 2083},{15240, 2081},{15619, 2074},{15992, 2065},
-        {16314, 2065},{16529, 2059},{16822, 2056},{17041, 2049},
-        {17321, 2052},{17408, 2043},{17670, 2051},{17801, 2053}
-      }
-    },
-    {
-      /*Cb  qi=63  INTRA*/
-      {
-        {  154,   55},{  280,  582},{  507,  731},{  788,  853},
-        {  763,  738},{ 1141, 1008},{ 1323, 1090},{ 1540, 1220},
-        { 1487, 1089},{ 1861, 1322},{ 1983, 1347},{ 2145, 1425},
-        { 2047, 1317},{ 2334, 1475},{ 2352, 1413},{ 2458, 1467},
-        { 2243, 1270},{ 2464, 1413},{ 2423, 1335},{ 2506, 1385},
-        { 2182, 1180},{ 2565, 1376},{ 2555, 1321},{ 2557, 1321}
-      },
-      /*Cb  qi=63  INTER*/
-      {
-        {   34,  133},{    6,  531},{  139,  767},{  344,  975},
-        {  608, 1180},{ 1048, 1367},{ 1651, 1495},{ 2376, 1572},
-        { 3103, 1609},{ 3752, 1646},{ 4373, 1680},{ 4980, 1718},
-        { 5540, 1744},{ 6023, 1764},{ 6431, 1766},{ 6800, 1769},
-        { 7149, 1775},{ 7529, 1777},{ 7920, 1817},{ 8198, 1808},
-        { 8691, 1848},{ 8965, 1845},{ 9372, 1865},{ 9459, 1863}
-      }
-    },
-    {
-      /*Cr  qi=63  INTRA*/
-      {
-        {  121,   59},{  392,  570},{  609,  654},{  800,  760},
-        {  720,  598},{ 1192,  892},{ 1298,  897},{ 1470, 1027},
-        { 1411,  962},{ 1761, 1184},{ 1826, 1197},{ 1981, 1308},
-        { 1854, 1198},{ 2229, 1427},{ 2269, 1365},{ 2428, 1453},
-        { 2217, 1265},{ 2558, 1435},{ 2541, 1356},{ 2660, 1417},
-        { 2337, 1199},{ 2688, 1382},{ 2603, 1301},{ 2605, 1300}
-      },
-      /*Cr  qi=63  INTER*/
-      {
-        {   31,  137},{   10,  531},{  136,  768},{  360,  971},
-        {  638, 1166},{ 1029, 1373},{ 1604, 1519},{ 2351, 1595},
-        { 3129, 1640},{ 3861, 1691},{ 4491, 1751},{ 5101, 1783},
-        { 5635, 1784},{ 6136, 1779},{ 6550, 1763},{ 6905, 1746},
-        { 7172, 1726},{ 7495, 1732},{ 7738, 1735},{ 7949, 1735},
-        { 8211, 1744},{ 8424, 1740},{ 8779, 1764},{ 8812, 1760}
-      }
-    }
-  }
-};
-
 #endif
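
The table collapses back to one set of statistics per (log-quantizer bin, plane, INTRA/INTER, SATD bin), matching the mode_rd field in encint.h above; the deleted OC_MODE_RD_SAD twin served the SAD-only speed levels this snapshot lacks. Purely as a hypothetical sketch of how such binned rate/distortion models are typically consumed -- the real lookup code is outside this diff, and the struct, names, and bin width below are invented:

typedef struct{
  short rate;
  short rmse;
}mode_rd_entry;

/*Linearly interpolate a binned rate/distortion model at a measured SATD
   value; assumes _nbins>=2 and _bin_width>0.*/
static void mode_rd_lookup(const mode_rd_entry *_bins,int _nbins,
 int _bin_width,unsigned _satd,int *_rate,int *_rmse){
  int b;
  int frac;
  b=(int)(_satd/_bin_width);
  frac=(int)(_satd%_bin_width);
  /*Clamp to the last bin pair and extrapolate no further.*/
  if(b>=_nbins-1){
    b=_nbins-2;
    frac=_bin_width;
  }
  *_rate=_bins[b].rate+(_bins[b+1].rate-_bins[b].rate)*frac/_bin_width;
  *_rmse=_bins[b].rmse+(_bins[b+1].rmse-_bins[b].rmse)*frac/_bin_width;
}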

+ 2 - 11
love/src/jni/libtheora-1.2.0alpha1/lib/rate.c

@@ -762,7 +762,6 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc,
       _enc->rc.cur_metrics.log_scale=oc_q57_to_q24(log_scale);
       _enc->rc.cur_metrics.dup_count=_enc->dup_count;
       _enc->rc.cur_metrics.frame_type=_enc->state.frame_type;
-      _enc->rc.cur_metrics.activity_avg=_enc->activity_avg;
       _enc->rc.twopass_buffer_bytes=0;
     }break;
     case 2:{
@@ -865,9 +864,9 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc,
   return dropped;
 }
 
-#define OC_RC_2PASS_VERSION   (2)
+#define OC_RC_2PASS_VERSION   (1)
 #define OC_RC_2PASS_HDR_SZ    (38)
-#define OC_RC_2PASS_PACKET_SZ (12)
+#define OC_RC_2PASS_PACKET_SZ (8)
 
 static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){
   while(_bytes-->0){
@@ -902,7 +901,6 @@ int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf){
       oc_rc_buffer_val(&_enc->rc,
        _enc->rc.cur_metrics.dup_count|_enc->rc.cur_metrics.frame_type<<31,4);
       oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.log_scale,4);
-      oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.activity_avg,4);
     }
   }
   else if(_enc->packet_state==OC_PACKET_DONE&&
@@ -1053,19 +1051,16 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
         if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
           ogg_uint32_t dup_count;
           ogg_int32_t  log_scale;
-          unsigned     activity;
           int          qti;
           int          arg;
           /*Read the metrics for the next frame.*/
           dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
           log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
-          activity=oc_rc_unbuffer_val(&_enc->rc,4);
           _enc->rc.cur_metrics.log_scale=log_scale;
           qti=(dup_count&0x80000000)>>31;
           _enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF;
           _enc->rc.cur_metrics.frame_type=qti;
           _enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME;
-          _enc->activity_avg=_enc->rc.cur_metrics.activity_avg=activity;
           /*"Helpfully" set the dup count back to what it was in pass 1.*/
           arg=_enc->rc.cur_metrics.dup_count;
           th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
@@ -1093,11 +1088,9 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
             ogg_uint32_t      dup_count;
             ogg_int32_t       log_scale;
             int               qti;
-            unsigned          activity;
             /*Read the metrics for the next frame.*/
             dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
             log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
-            activity=oc_rc_unbuffer_val(&_enc->rc,4);
            /*Add the metrics to the circular buffer.*/
             fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++;
             if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics;
@@ -1106,7 +1099,6 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
             qti=(dup_count&0x80000000)>>31;
             m->dup_count=dup_count&0x7FFFFFFF;
             m->frame_type=qti;
-            m->activity_avg=activity;
             /*And accumulate the statistics over the window.*/
             _enc->rc.nframes[qti]++;
             _enc->rc.nframes[2]+=m->dup_count;
@@ -1133,7 +1125,6 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
            *(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head);
           _enc->rc.twopass_force_kf=
            _enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME;
-          _enc->activity_avg=_enc->rc.cur_metrics.activity_avg;
           /*"Helpfully" set the dup count back to what it was in pass 1.*/
           arg=_enc->rc.cur_metrics.dup_count;
           th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));

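Dropping activity_avg shrinks each pass-1 frame record from 12 to 8 bytes, hence OC_RC_2PASS_PACKET_SZ returning to 8 and the format version to 1. What remains per frame is log_scale plus one word packing the frame type into bit 31 of the dup count, as the dup_count|frame_type<<31 write and (dup_count&0x80000000)>>31 read above show. In isolation (names invented):

#include <stdint.h>

/*Sketch of the bit-31 packing in the 8-byte two-pass records above.*/
static uint32_t pack_frame_word(uint32_t _dup_count,int _frame_type){
  return _dup_count&0x7FFFFFFFU|(uint32_t)_frame_type<<31;
}

static void unpack_frame_word(uint32_t _w,uint32_t *_dup_count,
 int *_frame_type){
  *_frame_type=(int)(_w>>31);
  *_dup_count=_w&0x7FFFFFFF;
}
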
+ 26 - 41
love/src/jni/libtheora-1.2.0alpha1/lib/state.c

@@ -21,7 +21,6 @@
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
-# include "zlib.h"
 #endif
 
 /*The function used to fill in the chroma plane motion vectors for a macro
@@ -160,7 +159,7 @@ static void oc_sb_create_plane_mapping(oc_sb_map _sb_maps[],
       if(jmax>4)jmax=4;
       else if(jmax<=0)break;
       /*By default, set all fragment indices to -1.*/
-      memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi]));
+      memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
       /*Fill in the fragment map for this super block.*/
       xfrag=yfrag+x;
       for(i=0;i<imax;i++){
@@ -254,14 +253,10 @@ static void oc_mb_fill_cmapping10(oc_mb_map_plane _mb_map[3],
   This version is for use with no chroma decimation (4:4:4).
   This uses the already filled-in luma plane values.
   _mb_map:  The macro block map to fill.
-  _fplanes: The descriptions of the fragment planes.
-  _xfrag0:  The X location of the upper-left hand fragment in the luma plane.
-  _yfrag0:  The Y location of the upper-left hand fragment in the luma plane.*/
+  _fplanes: The descriptions of the fragment planes.*/
 static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3],
- const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
+ const oc_fragment_plane _fplanes[3]){
   int k;
-  (void)_xfrag0;
-  (void)_yfrag0;
   for(k=0;k<4;k++){
     _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
     _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
@@ -283,7 +278,7 @@ static const oc_mb_fill_cmapping_func OC_MB_FILL_CMAPPING_TABLE[4]={
   oc_mb_fill_cmapping00,
   oc_mb_fill_cmapping01,
   oc_mb_fill_cmapping10,
-  oc_mb_fill_cmapping11
+  (oc_mb_fill_cmapping_func)oc_mb_fill_cmapping11
 };
 
 /*Fills in the mapping from macro blocks to their corresponding fragment
@@ -553,7 +548,6 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   int            yheight;
   int            chstride;
   int            cheight;
-  ptrdiff_t      align;
   ptrdiff_t      yoffset;
   ptrdiff_t      coffset;
   ptrdiff_t     *frag_buf_offs;
@@ -569,31 +563,26 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   vdec=!(info->pixel_fmt&2);
   yhstride=info->frame_width+2*OC_UMV_PADDING;
   yheight=info->frame_height+2*OC_UMV_PADDING;
-  /*Require 16-byte aligned rows in the chroma planes.*/
-  chstride=(yhstride>>hdec)+15&~15;
+  chstride=yhstride>>hdec;
   cheight=yheight>>vdec;
   yplane_sz=yhstride*(size_t)yheight;
   cplane_sz=chstride*(size_t)cheight;
   yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
   coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
-  /*Although we guarantee the rows of the chroma planes are a multiple of 16
-     bytes, the initial padding on the first row may only be 8 bytes.
-    Compute the offset needed to the actual image data to a multiple of 16.*/
-  align=-coffset&15;
-  ref_frame_sz=yplane_sz+2*cplane_sz+16;
+  ref_frame_sz=yplane_sz+2*cplane_sz;
   ref_frame_data_sz=_nrefs*ref_frame_sz;
   /*Check for overflow.
     The same caveats apply as for oc_state_frarray_init().*/
-  if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz||
+  if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
    ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
     return TH_EIMPL;
   }
-  ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16);
+  ref_frame_data=_ogg_malloc(ref_frame_data_sz);
   frag_buf_offs=_state->frag_buf_offs=
    _ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
   if(ref_frame_data==NULL||frag_buf_offs==NULL){
     _ogg_free(frag_buf_offs);
-    oc_aligned_free(ref_frame_data);
+    _ogg_free(ref_frame_data);
     return TH_EFAULT;
   }
   /*Set up the width, height and stride for the image buffers.*/
@@ -610,15 +599,15 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
     memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
      sizeof(_state->ref_frame_bufs[0]));
   }
-  _state->ref_frame_handle=ref_frame_data;
   /*Set up the data pointers for the image buffers.*/
   for(rfi=0;rfi<_nrefs;rfi++){
+    _state->ref_frame_data[rfi]=ref_frame_data;
     _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
-    ref_frame_data+=yplane_sz+align;
+    ref_frame_data+=yplane_sz;
     _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
     ref_frame_data+=cplane_sz;
     _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
-    ref_frame_data+=cplane_sz+(16-align);
+    ref_frame_data+=cplane_sz;
     /*Flip the buffer upside down.
       This allows us to decode Theora's bottom-up frames in their natural
        order, yet return a top-down buffer with a positive stride to the user.*/
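A minimal sketch of that flip: point the plane at its last row and read it with a negative stride (hypothetical helper, not library code):

  #include <stddef.h>

  /*Returns a pointer for reading a bottom-up plane top-down with stride
     -_width.*/
  static unsigned char *oc_flip_plane(unsigned char *_buf,int _width,
   int _height){
    return _buf+(ptrdiff_t)_width*(_height-1);
  }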
@@ -628,7 +617,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   _state->ref_ystride[0]=-yhstride;
   _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
   /*Initialize the fragment buffer offsets.*/
-  ref_frame_data=_state->ref_frame_bufs[0][0].data;
+  ref_frame_data=_state->ref_frame_data[0];
   fragi=0;
   for(pli=0;pli<3;pli++){
     th_img_plane      *iplane;
@@ -654,25 +643,19 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
       vpix+=stride<<3;
     }
   }
-  /*Initialize the reference frame pointers and indices.*/
+  /*Initialize the reference frame indices.*/
   _state->ref_frame_idx[OC_FRAME_GOLD]=
    _state->ref_frame_idx[OC_FRAME_PREV]=
    _state->ref_frame_idx[OC_FRAME_GOLD_ORIG]=
    _state->ref_frame_idx[OC_FRAME_PREV_ORIG]=
    _state->ref_frame_idx[OC_FRAME_SELF]=
    _state->ref_frame_idx[OC_FRAME_IO]=-1;
-  _state->ref_frame_data[OC_FRAME_GOLD]=
-   _state->ref_frame_data[OC_FRAME_PREV]=
-   _state->ref_frame_data[OC_FRAME_GOLD_ORIG]=
-   _state->ref_frame_data[OC_FRAME_PREV_ORIG]=
-   _state->ref_frame_data[OC_FRAME_SELF]=
-   _state->ref_frame_data[OC_FRAME_IO]=NULL;
   return 0;
 }
 
 static void oc_state_ref_bufs_clear(oc_theora_state *_state){
   _ogg_free(_state->frag_buf_offs);
-  oc_aligned_free(_state->ref_frame_handle);
+  _ogg_free(_state->ref_frame_data[0]);
 }
 
 
@@ -707,8 +690,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
      how it is specified in the bitstream, because the Y axis is flipped in
      the bitstream.
     The displayable frame must fit inside the encoded frame.
-    The color space must be one known by the encoder.
-    The framerate ratio must not contain a zero value.*/
+    The color space must be one known by the encoder.*/
   if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
    _info->frame_width<=0||_info->frame_width>=0x100000||
    _info->frame_height<=0||_info->frame_height>=0x100000||
@@ -721,8 +703,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
       but there are a number of compilers which will mis-optimize this.
      It's better to live with the spurious warnings.*/
    _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
-   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS||
-   _info->fps_numerator<1||_info->fps_denominator<1){
+   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){
     return TH_EINVAL;
   }
   memset(_state,0,sizeof(*_state));
@@ -961,7 +942,7 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            refi;
+  int            mb_mode;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -980,14 +961,18 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  refi=_state->frags[_fragi].refi;
+  mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
-  if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA){
+    oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
+  }
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2(_state,

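The reconstruction above now resolves its reference in two steps: OC_FRAME_FOR_MODE maps the macroblock mode to an OC_FRAME_* slot, ref_frame_idx maps that slot to a physical buffer, and ref_frame_data holds the buffer's base pointer. A hedged sketch of the lookup (types and macros from the surrounding file):

  /*Sketch of the two-level reference lookup in oc_state_frag_recon_c.*/
  static unsigned char *oc_frag_ref_base(const oc_theora_state *_state,
   int _mb_mode){
    int frame;
    frame=OC_FRAME_FOR_MODE(_mb_mode);
    return _state->ref_frame_data[_state->ref_frame_idx[frame]];
  }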
+ 4 - 12
love/src/jni/libtheora-1.2.0alpha1/lib/state.h

@@ -173,8 +173,6 @@ typedef struct oc_theora_state          oc_theora_state;
 # define OC_FRAME_PREV      (1)
 /*The current frame.*/
 # define OC_FRAME_SELF      (2)
-/*Used to mark uncoded fragments (for DC prediction).*/
-# define OC_FRAME_NONE      (3)
 
 /*The input or output buffer.*/
 # define OC_FRAME_IO        (3)
@@ -304,9 +302,7 @@ struct oc_fragment{
     There are no fragments outside the coded frame by construction.*/
   unsigned   invalid:1;
   /*The index of the quality index used for this fragment's AC coefficients.*/
-  unsigned   qii:4;
-  /*The index of the reference frame this fragment is predicted from.*/
-  unsigned   refi:2;
+  unsigned   qii:6;
   /*The mode of the macroblock this fragment belongs to.*/
   unsigned   mb_mode:3;
   /*The index of the associated border information for fragments which lie
@@ -427,16 +423,12 @@ struct oc_theora_state{
   ptrdiff_t           ncoded_fragis[3];
   /*The total number of coded fragments.*/
   ptrdiff_t           ntotal_coded_fragis;
-  /*The actual buffers used for the reference frames.*/
-  th_ycbcr_buffer     ref_frame_bufs[6];
   /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
   int                 ref_frame_idx[6];
-  /*The storage for the reference frame buffers.
-    This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
-     for faster look-up.*/
+  /*The actual buffers used for the reference frames.*/
+  th_ycbcr_buffer     ref_frame_bufs[6];
+  /*The storage for the reference frame buffers.*/
   unsigned char      *ref_frame_data[6];
-  /*The handle used to allocate the reference frame buffers.*/
-  unsigned char      *ref_frame_handle;
   /*The strides for each plane in the reference frames.*/
   int                 ref_ystride[3];
   /*The number of unique border patterns.*/

+ 40 - 31
love/src/jni/libtheora-1.2.0alpha1/lib/tokenize.c

@@ -454,10 +454,9 @@ struct oc_quant_token{
 
 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
    dequantizes and de-zig-zags the result.
-  The AC coefficients of _idct must be pre-initialized to zero.*/
+  The DC coefficient is not preserved; it should be restored by the caller.*/
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_idct,const ogg_int16_t *_qdct,
- const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
   oc_token_checkpoint *stack;
   ogg_int64_t          zflags;
@@ -502,7 +501,7 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
     qc=_qdct[zzi];
     s=-(qc<0);
     qc_m=qc+s^s;
-    c=_dct[zzi];
+    c=_dct[OC_FZIG_ZAG[zzi]];
     /*The hard case: try a zero run.*/
     if(qc_m<=1){
       ogg_uint32_t sum_d2;
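The _dct argument is now in natural (raster) order, so each coefficient at zig-zag index zzi is fetched through the forward zig-zag table, as in _dct[OC_FZIG_ZAG[zzi]] above. A small sketch of walking such a block in zig-zag order (the ogg types and OC_FZIG_ZAG table come from the library headers):

  /*Sums the absolute AC values of a natural-order block in zig-zag order;
     a sketch only.*/
  static ogg_int32_t oc_sum_abs_ac(const ogg_int16_t _dct[64]){
    ogg_int32_t sum;
    int         zzi;
    sum=0;
    for(zzi=1;zzi<64;zzi++){
      int c;
      c=_dct[OC_FZIG_ZAG[zzi]];
      sum+=c<0?-c:c;
    }
    return sum;
  }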
@@ -566,7 +565,7 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
               /*Try a +/- 1 combo token.*/
               token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1];
               eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-val_s];
-              e=_dct[zzj]-(_dequant[zzj]+val_s^val_s);
+              e=_dct[OC_FZIG_ZAG[zzj]]-(_dequant[zzj]+val_s^val_s);
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               bits=oc_token_bits(_enc,huffi,zzi,token);
               cost=d2+_lambda*bits+tokens[zzk][tk].cost;
@@ -586,7 +585,7 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
               bits=oc_token_bits(_enc,huffi,zzi,token);
               val=2+(val>2);
               sval=val+val_s^val_s;
-              e=_dct[zzj]-_dequant[zzj]*sval;
+              e=_dct[OC_FZIG_ZAG[zzj]]-_dequant[zzj]*sval;
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               cost=d2+_lambda*bits+tokens[zzk][tk].cost;
               if(cost<=best_cost){
@@ -702,6 +701,9 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
+  /*We blow away the first entry here so that things vectorize better.
+    The DC coefficient is not actually stored in the array yet.*/
+  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   zzi=1;
   ti=best_flags>>1&1;
@@ -735,7 +737,7 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
     zzj=(next>>1)-1&63;
     /*TODO: It may be worth saving the dequantized coefficient in the trellis
        above; we had to compute it to measure the error anyway.*/
-    _idct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+    _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
     zzi=next>>1;
     ti=next&1;
   }
@@ -745,15 +747,16 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
 }
 
 /*Simplistic R/D tokenizer.
-  The AC coefficients of _idct must be pre-initialized to zero.
   This could be made more accurate by using more sophisticated
    rate predictions for zeros.
   It could be made faster by switching from R/D decisions to static
    lambda-derived rounding biases.*/
 int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_idct,const ogg_int16_t *_qdct,
- const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
+  /*Note that gcc will not always respect this alignment.
+    In this case it doesn't matter terribly much.*/
+  OC_ALIGN16(ogg_int16_t  coef[64]);
   const unsigned char *dct_fzig_zag;
   ogg_uint16_t        *eob_run;
   oc_token_checkpoint *stack;
@@ -776,7 +779,9 @@ int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
   eob_run=_enc->eob_run[_pli];
   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
-  for(zzj=zzi=1;zzj<_zzi&&!_qdct[zzj];zzj++);
+  memcpy(coef,_qdct,_zzi*sizeof(*coef));
+  for(zzj=0;zzj<64;zzj++)_qdct[zzj]=0;
+  for(zzj=zzi=1;zzj<_zzi&&!coef[zzj];zzj++);
   while(zzj<_zzi){
     int v;
     int d0;
@@ -792,10 +797,10 @@ int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
     int eob_bits;
     int dct_fzig_zzj;
     dct_fzig_zzj=dct_fzig_zag[zzj];
-    v=_dct[zzj];
-    d0=_qdct[zzj];
+    v=_dct[OC_FZIG_ZAG[zzj]];
+    d0=coef[zzj];
     eob=eob_run[zzi];
-    for(zzk=zzj+1;zzk<_zzi&&!_qdct[zzk];zzk++);
+    for(zzk=zzj+1;zzk<_zzi&&!coef[zzk];zzk++);
     next_zero=zzk-zzj+62>>6;
     dq0=d0*_dequant[zzj];
     dd0=dq0-v;
@@ -835,7 +840,7 @@ int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
         cost=dd1+zr[next_zero];
       }
       if((dd0+(best_bits+eob_bits)*_lambda)>cost){
-        _idct[dct_fzig_zzj]=dq1;
+        _qdct[dct_fzig_zzj]=dq1;
         if(d1==0){
           zzj=zzk;
           continue;
@@ -846,7 +851,7 @@ int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
       }
       else{
         best_eb=*(OC_DCT_VALUE_EB_PTR+d0);
-        _idct[dct_fzig_zzj]=dq0;
+        _qdct[dct_fzig_zzj]=dq0;
       }
       oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
       if(eob>0){
@@ -922,6 +927,7 @@ int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
       }
       best_cost=dd0+(best_bits+eob_bits)*_lambda;
       if(d1==0&&(dd1+zr[2+next_zero])<=best_cost){
+        _qdct[dct_fzig_zzj]=0;
         zzj=zzk;
         continue;
       }
@@ -930,9 +936,9 @@ int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
         best_token=best_token1;
         best_eb=best_eb1;
         d=d1;
-        _idct[dct_fzig_zzj]=dq1;
+        _qdct[dct_fzig_zzj]=dq1;
       }
-      else _idct[dct_fzig_zzj]=dq0;
+      else _qdct[dct_fzig_zzj]=dq0;
       oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
       if(eob){
         oc_enc_eob_log(_enc,_pli,zzi,eob);
@@ -997,10 +1003,10 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
          predictor for the same reference frame.*/
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         if(frags[fragi].coded){
-          int refi;
-          refi=frags[fragi].refi;
-          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[refi]);
-          pred_last[refi]=frags[fragi].dc;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
+          pred_last[ref]=frags[fragi].dc;
         }
       }
     }
@@ -1012,24 +1018,27 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
       u_frags=frags-nhfrags;
       l_ref=-1;
       ul_ref=-1;
-      u_ref=u_frags[fragi].refi;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
         int ur_ref;
         if(fragx+1>=nhfrags)ur_ref=-1;
-        else ur_ref=u_frags[fragi+1].refi;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
         if(frags[fragi].coded){
           int pred;
-          int refi;
-          refi=frags[fragi].refi;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
           /*We break out a separate case based on which of our neighbors use
              the same reference frames.
             This is somewhat faster than trying to make a generic case which
              handles all of them, since it reduces lots of poorly predicted
              jumps to one switch statement, and also lets a number of the
              multiplications be optimized out by strength reduction.*/
-          switch((l_ref==refi)|(ul_ref==refi)<<1|
-           (u_ref==refi)<<2|(ur_ref==refi)<<3){
-            default:pred=pred_last[refi];break;
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
             case  1:
             case  3:pred=frags[fragi-1].dc;break;
             case  2:pred=u_frags[fragi-1].dc;break;
@@ -1063,8 +1072,8 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
             }break;
           }
           frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
-          pred_last[refi]=frags[fragi].dc;
-          l_ref=refi;
+          pred_last[ref]=frags[fragi].dc;
+          l_ref=ref;
         }
         else l_ref=-1;
         ul_ref=u_ref;

+ 904 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxencfrag.c

@@ -0,0 +1,904 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ptrdiff_t ystride3;
+  ptrdiff_t ret;
+  __asm__ __volatile__(
+    /*Load the first 4 rows of each block.*/
+    "movq (%[src]),%%mm0\n\t"
+    "movq (%[ref]),%%mm1\n\t"
+    "movq (%[src],%[ystride]),%%mm2\n\t"
+    "movq (%[ref],%[ystride]),%%mm3\n\t"
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    "movq (%[src],%[ystride],2),%%mm4\n\t"
+    "movq (%[ref],%[ystride],2),%%mm5\n\t"
+    "movq (%[src],%[ystride3]),%%mm6\n\t"
+    "movq (%[ref],%[ystride3]),%%mm7\n\t"
+    /*Compute their SADs and add them in %%mm0*/
+    "psadbw %%mm1,%%mm0\n\t"
+    "psadbw %%mm3,%%mm2\n\t"
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "lea (%[ref],%[ystride],4),%[ref]\n\t"
+    /*Load the next 3 rows as registers become available.*/
+    "movq (%[src]),%%mm2\n\t"
+    "movq (%[ref]),%%mm3\n\t"
+    "psadbw %%mm5,%%mm4\n\t"
+    "psadbw %%mm7,%%mm6\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "movq (%[ref],%[ystride]),%%mm5\n\t"
+    "movq (%[src],%[ystride]),%%mm4\n\t"
+    "paddw %%mm6,%%mm0\n\t"
+    "movq (%[ref],%[ystride],2),%%mm7\n\t"
+    "movq (%[src],%[ystride],2),%%mm6\n\t"
+    /*Start adding their SADs to %%mm0*/
+    "psadbw %%mm3,%%mm2\n\t"
+    "psadbw %%mm5,%%mm4\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "psadbw %%mm7,%%mm6\n\t"
+    /*Load last row as registers become available.*/
+    "movq (%[src],%[ystride3]),%%mm2\n\t"
+    "movq (%[ref],%[ystride3]),%%mm3\n\t"
+    /*And finish adding up their SADs.*/
+    "paddw %%mm4,%%mm0\n\t"
+    "psadbw %%mm3,%%mm2\n\t"
+    "paddw %%mm6,%%mm0\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "movd %%mm0,%[ret]\n\t"
+    :[ret]"=a"(ret),[src]"+%r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+  );
+  return (unsigned)ret;
+}
+
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  /*Early termination is for suckers.*/
+  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+}
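For reference, the 8x8 sum of absolute differences that the psadbw sequence above computes, in plain C (a sketch, not part of the library):

  static unsigned oc_frag_sad_c(const unsigned char *_src,
   const unsigned char *_ref,int _ystride){
    unsigned sad;
    int      i;
    sad=0;
    for(i=0;i<8;i++){
      int j;
      for(j=0;j<8;j++){
        int d;
        d=_src[j]-_ref[j];
        sad+=d<0?-d:d;
      }
      _src+=_ystride;
      _ref+=_ystride;
    }
    return sad;
  }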
+
+/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
+   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
+  We pre-load the next two rows of data as registers become available.*/
+#define OC_SAD2_LOOP \
+ "#OC_SAD2_LOOP\n\t" \
+ /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
+    pavgb computes (%%mm0+%%mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb. \
+   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
+    schedules better; currently, however, this function is unused.*/ \
+ "movq %%mm0,%%mm6\n\t" \
+ "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
+ "pxor %%mm1,%%mm0\n\t" \
+ "pavgb %%mm1,%%mm6\n\t" \
+ "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pand %%mm7,%%mm0\n\t" \
+ "pavgb %%mm3,%%mm2\n\t" \
+ "pxor %%mm3,%%mm1\n\t" \
+ "movq (%[ref2],%[ystride]),%%mm3\n\t" \
+ "psubb %%mm0,%%mm6\n\t" \
+ "movq (%[ref1]),%%mm0\n\t" \
+ "pand %%mm7,%%mm1\n\t" \
+ "psadbw %%mm6,%%mm4\n\t" \
+ "movd %[ret],%%mm6\n\t" \
+ "psubb %%mm1,%%mm2\n\t" \
+ "movq (%[ref2]),%%mm1\n\t" \
+ "lea (%[src],%[ystride],2),%[src]\n\t" \
+ "psadbw %%mm2,%%mm5\n\t" \
+ "movq (%[ref1],%[ystride]),%%mm2\n\t" \
+ "paddw %%mm4,%%mm5\n\t" \
+ "movq (%[src]),%%mm4\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movq (%[src],%[ystride]),%%mm5\n\t" \
+ "movd %%mm6,%[ret]\n\t" \
+
+/*Same as above, but does not pre-load the next two rows.*/
+#define OC_SAD2_TAIL \
+ "#OC_SAD2_TAIL\n\t" \
+ "movq %%mm0,%%mm6\n\t" \
+ "pavgb %%mm1,%%mm0\n\t" \
+ "pxor %%mm1,%%mm6\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pand %%mm7,%%mm6\n\t" \
+ "pavgb %%mm3,%%mm2\n\t" \
+ "pxor %%mm3,%%mm1\n\t" \
+ "psubb %%mm6,%%mm0\n\t" \
+ "pand %%mm7,%%mm1\n\t" \
+ "psadbw %%mm0,%%mm4\n\t" \
+ "psubb %%mm1,%%mm2\n\t" \
+ "movd %[ret],%%mm6\n\t" \
+ "psadbw %%mm2,%%mm5\n\t" \
+ "paddw %%mm4,%%mm5\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movd %%mm6,%[ret]\n\t" \
+
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  ptrdiff_t ret;
+  __asm__ __volatile__(
+    "movq (%[ref1]),%%mm0\n\t"
+    "movq (%[ref2]),%%mm1\n\t"
+    "movq (%[ref1],%[ystride]),%%mm2\n\t"
+    "movq (%[ref2],%[ystride]),%%mm3\n\t"
+    "xor %[ret],%[ret]\n\t"
+    "movq (%[src]),%%mm4\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "pcmpeqb %%mm6,%%mm6\n\t"
+    "movq (%[src],%[ystride]),%%mm5\n\t"
+    "psubb %%mm6,%%mm7\n\t"
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL
+    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+  );
+  return (unsigned)ret;
+}
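The pxor/pand/psubb correction in OC_SAD2_LOOP relies on the identity (a+b)>>1==pavgb(a,b)-((a^b)&1). A scalar sketch of the same computation:

  /*Truncating byte average without overflow, matching the correction the
     macros above apply to pavgb.*/
  static unsigned char oc_avg_down(unsigned char _a,unsigned char _b){
    unsigned char pavg;
    pavg=(unsigned char)(_a+_b+1>>1);  /*What pavgb computes.*/
    return (unsigned char)(pavg-((_a^_b)&1));
  }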
+
+/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
+   16-bit difference in %%mm0...%%mm7.*/
+#define OC_LOAD_SUB_8x4(_off) \
+ "#OC_LOAD_SUB_8x4\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm4\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "movd "#_off"(%[src]),%%mm2\n\t" \
+ "movd "#_off"(%[ref]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
+ "punpcklbw %%mm4,%%mm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%mm4,%%mm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movd "#_off"(%[src]),%%mm4\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
+ "punpcklbw %%mm5,%%mm1\n\t" \
+ "punpcklbw %%mm5,%%mm5\n\t" \
+ "psubw %%mm5,%%mm1\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm2\n\t" \
+ "punpcklbw %%mm7,%%mm7\n\t" \
+ "psubw %%mm7,%%mm2\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
+ "punpcklbw %%mm6,%%mm3\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%mm6,%%mm6\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movd "#_off"(%[src]),%%mm6\n\t" \
+ "punpcklbw %%mm0,%%mm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%mm0,%%mm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "psubw %%mm0,%%mm4\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
+ "punpcklbw %%mm7,%%mm5\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%mm7,%%mm7\n\t" \
+ "psubw %%mm7,%%mm5\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
+ "punpcklbw %%mm0,%%mm6\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%mm0,%%mm0\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "psubw %%mm0,%%mm6\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
+ "lea (%[src],%[src_ystride],8),%[src]\n\t" \
+ "punpcklbw %%mm0,%%mm7\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%mm0,%%mm0\n\t" \
+ "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
+ "psubw %%mm0,%%mm7\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
+
+/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
+#define OC_LOAD_8x4(_off) \
+ "#OC_LOAD_8x4\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
+ "pxor %%mm7,%%mm7\n\t" \
+ "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
+ "punpcklbw %%mm7,%%mm0\n\t" \
+ "movd "#_off"(%[src4]),%%mm4\n\t" \
+ "punpcklbw %%mm7,%%mm1\n\t" \
+ "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm2\n\t" \
+ "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm3\n\t" \
+ "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
+ "punpcklbw %%mm4,%%mm4\n\t" \
+ "punpcklbw %%mm5,%%mm5\n\t" \
+ "psrlw $8,%%mm4\n\t" \
+ "psrlw $8,%%mm5\n\t" \
+ "punpcklbw %%mm6,%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm7\n\t" \
+ "psrlw $8,%%mm6\n\t" \
+ "psrlw $8,%%mm7\n\t" \
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x4 \
+ "#OC_HADAMARD_AB_8x4\n\t" \
+ /*Stage A: \
+   Outputs 0-3 are swapped with 4-7 here.*/ \
+ "paddw %%mm1,%%mm5\n\t" \
+ "paddw %%mm2,%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "psubw %%mm5,%%mm1\n\t" \
+ "psubw %%mm6,%%mm2\n\t" \
+ "paddw %%mm3,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ "psubw %%mm7,%%mm3\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ /*Stage B:*/ \
+ "paddw %%mm2,%%mm0\n\t" \
+ "paddw %%mm3,%%mm1\n\t" \
+ "paddw %%mm6,%%mm4\n\t" \
+ "paddw %%mm7,%%mm5\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ "psubw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm3\n\t" \
+ "psubw %%mm4,%%mm6\n\t" \
+ "psubw %%mm5,%%mm7\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x4 \
+ "#OC_HADAMARD_C_8x4\n\t" \
+ /*Stage C:*/ \
+ "paddw %%mm1,%%mm0\n\t" \
+ "paddw %%mm3,%%mm2\n\t" \
+ "paddw %%mm5,%%mm4\n\t" \
+ "paddw %%mm7,%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ "psubw %%mm0,%%mm1\n\t" \
+ "psubw %%mm2,%%mm3\n\t" \
+ "psubw %%mm4,%%mm5\n\t" \
+ "psubw %%mm6,%%mm7\n\t" \
+
+/*Performs an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x4 \
+ OC_HADAMARD_AB_8x4 \
+ OC_HADAMARD_C_8x4 \
+
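Ignoring the in-place register swaps and sign flips, the stages above are a standard 8-point Hadamard transform: stage A pairs lanes (i,i+4), stage B pairs (i,i+2), and stage C pairs (i,i+1). A plain-C reference sketch:

  static void oc_hadamard8(int _t[8]){
    int s;
    for(s=4;s>0;s>>=1){
      int i;
      for(i=0;i<8;i+=s<<1){
        int j;
        for(j=i;j<i+s;j++){
          int a;
          int b;
          a=_t[j];
          b=_t[j+s];
          _t[j]=a+b;
          _t[j+s]=a-b;
        }
      }
    }
  }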
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, %%mm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+ /*We use the fact that \
+     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+    to merge the final butterfly with the abs and the first stage of \
+    accumulation. \
+   Thus we can avoid using pabsw, which is not available until SSSE3. \
+   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+    registers). \
+   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+   This implementation is only 26 (+4 for spilling registers).*/ \
+ "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
+ "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
+ "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
+ /*mm7={0x7FFF}x4 \
+   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+ "pcmpeqb %%mm7,%%mm7\n\t" \
+ "movq %%mm0,%%mm6\n\t" \
+ "psrlw $1,%%mm7\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "pmaxsw %%mm1,%%mm0\n\t" \
+ "paddsw %%mm7,%%mm6\n\t" \
+ "psubw %%mm6,%%mm0\n\t" \
+ /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm4,%%mm1\n\t" \
+ "pmaxsw %%mm3,%%mm2\n\t" \
+ "pmaxsw %%mm5,%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+ "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
+ "paddsw %%mm7,%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
+ "paddsw %%mm7,%%mm1\n\t" \
+ "psubw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm4\n\t" \
+ /*mm7={1}x4 (needed for the horizontal add that follows) \
+   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+ "movq %%mm3,%%mm6\n\t" \
+ "pmaxsw %%mm5,%%mm3\n\t" \
+ "paddw %%mm2,%%mm0\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "paddw %%mm4,%%mm0\n\t" \
+ "paddsw %%mm7,%%mm6\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "psrlw $14,%%mm7\n\t" \
+ "psubw %%mm6,%%mm0\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into mm0.
+  This is the only portion of SATD which requires MMXEXT (we could use plain
+   MMX, but it takes 4 instructions and an extra register to work around the
+   lack of a pmaxsw, which is a pretty serious penalty).*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
+ OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+ OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into mm0.
+  Note that mm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
+ OC_HADAMARD_AB_8x4 \
+ OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
+
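The merged butterfly/abs trick above rests on the identity (|a+b|+|a-b|)/2==max(|a|,|b|). A scalar sketch (valid as long as the sums do not overflow int):

  #include <stdlib.h>

  /*Computes max(abs(_a),abs(_b)) via the butterfly identity used above.*/
  static int oc_max_abs(int _a,int _b){
    return abs(_a+_b)+abs(_a-_b)>>1;
  }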
+/*Performs two 4x4 transposes (mostly) in place.
+  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
+   contains rows {a,b,c,d}.
+  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
+   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
+#define OC_TRANSPOSE_4x4x2(_off) \
+ "#OC_TRANSPOSE_4x4x2\n\t" \
+ /*First 4x4 transpose:*/ \
+ "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
+ /*mm0 = e3 e2 e1 e0 \
+   mm1 = f3 f2 f1 f0 \
+   mm2 = g3 g2 g1 g0 \
+   mm3 = h3 h2 h1 h0*/ \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm3,%%mm2\n\t" \
+ "punpckhwd %%mm3,%%mm5\n\t" \
+ "movq %%mm0,%%mm3\n\t" \
+ "punpcklwd %%mm1,%%mm0\n\t" \
+ "punpckhwd %%mm1,%%mm3\n\t" \
+ /*mm0 = f1 e1 f0 e0 \
+   mm3 = f3 e3 f2 e2 \
+   mm2 = h1 g1 h0 g0 \
+   mm5 = h3 g3 h2 g2*/ \
+ "movq %%mm0,%%mm1\n\t" \
+ "punpckldq %%mm2,%%mm0\n\t" \
+ "punpckhdq %%mm2,%%mm1\n\t" \
+ "movq %%mm3,%%mm2\n\t" \
+ "punpckhdq %%mm5,%%mm3\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ /*mm0 = h0 g0 f0 e0 \
+   mm1 = h1 g1 f1 e1 \
+   mm2 = h2 g2 f2 e2 \
+   mm3 = h3 g3 f3 e3*/ \
+ "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
+ /*Second 4x4 transpose:*/ \
+ /*mm4 = a3 a2 a1 a0 \
+   mm5 = b3 b2 b1 b0 \
+   mm6 = c3 c2 c1 c0 \
+   mm7 = d3 d2 d1 d0*/ \
+ "movq %%mm6,%%mm0\n\t" \
+ "punpcklwd %%mm7,%%mm6\n\t" \
+ "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
+ "punpckhwd %%mm7,%%mm0\n\t" \
+ "movq %%mm4,%%mm7\n\t" \
+ "punpcklwd %%mm5,%%mm4\n\t" \
+ "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
+ "punpckhwd %%mm5,%%mm7\n\t" \
+ /*mm4 = b1 a1 b0 a0 \
+   mm7 = b3 a3 b2 a2 \
+   mm6 = d1 c1 d0 c0 \
+   mm0 = d3 c3 d2 c2*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "punpckldq %%mm6,%%mm4\n\t" \
+ "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
+ "punpckhdq %%mm6,%%mm5\n\t" \
+ "movq %%mm7,%%mm6\n\t" \
+ "punpckhdq %%mm0,%%mm7\n\t" \
+ "punpckldq %%mm0,%%mm6\n\t" \
+ /*mm4 = d0 c0 b0 a0 \
+   mm5 = d1 c1 b1 a1 \
+   mm6 = d2 c2 b2 a2 \
+   mm7 = d3 c3 b3 a3*/ \
+
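Each half of the macro above is an ordinary 4x4 transpose of 16-bit words built from punpck interleaves. In plain C (ogg types as in the surrounding file):

  /*Plain-C 4x4 16-bit transpose, equivalent to one half of the macro above.*/
  static void oc_transpose4x4(ogg_int16_t _m[4][4]){
    int i;
    for(i=0;i<4;i++){
      int j;
      for(j=i+1;j<4;j++){
        ogg_int16_t t;
        t=_m[i][j];
        _m[i][j]=_m[j][i];
        _m[j][i]=t;
      }
    }
  }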
+static unsigned oc_int_frag_satd_mmxext(unsigned *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  unsigned ret;
+  unsigned ret2;
+  unsigned dc;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_SUB_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    "movd %%mm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    "pmaddwd %%mm7,%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
+    "movd %%mm4,%[ret2]\n\t"
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    "pmaddwd %%mm7,%%mm0\n\t"
+    /*Compute abs(dc).*/
+    "movsx %w[dc],%[ret]\n\t"
+    "cdq\n\t"
+    "add %[ret2],%[ret2]\n\t"
+    "add %[dc],%[ret]\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "xor %[ret],%[dc]\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "sub %[dc],%[ret2]\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
+       and %[ret2] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+       constraints, otherwise if gcc can prove they're equal it will allocate
+       them to the same register (which is bad); _src and _ref face a similar
+       problem, though those are never actually the same.*/
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=d"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
+    :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+    /*We have to use neg, so we actually clobber the condition codes for once
+       (not to mention cmp, sub, and add).*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
+}
+
+/*Our internal implementation of frag_copy2 takes an extra stride parameter so
+   we can share code with oc_enc_frag_satd2_mmxext().*/
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
+  __asm__ __volatile__(
+    /*Load the first 3 rows.*/
+    "movq (%[src1]),%%mm0\n\t"
+    "movq (%[src2]),%%mm1\n\t"
+    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
+    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
+    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
+    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "movq (%[src1]),%%mm4\n\t"
+    "pcmpeqb %%mm6,%%mm6\n\t"
+    "movq (%[src2]),%%mm5\n\t"
+    /*mm7={1}x8.*/
+    "psubb %%mm6,%%mm7\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
+    "movq %%mm2,%%mm1\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm1\n\t"
+    /*%%mm3 is free.*/
+    "psubb %%mm0,%%mm6\n\t"
+    /*%%mm0 is free, start loading the next row.*/
+    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
+    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
+    "movq %%mm4,%%mm3\n\t"
+    /*%%mm6 (row 0) is done; write it out.*/
+    "movq %%mm6,(%[dst])\n\t"
+    "pand %%mm7,%%mm1\n\t"
+    "pavgb %%mm5,%%mm4\n\t"
+    "psubb %%mm1,%%mm2\n\t"
+    /*%%mm1 is free, continue loading the next row.*/
+    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
+    "pxor %%mm5,%%mm3\n\t"
+    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
+    /*%%mm2 (row 1) is done; write it out.*/
+    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1]),%%mm2\n\t"
+    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+    "psubb %%mm3,%%mm4\n\t"
+    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
+    /*%%mm4 (row 2) is done; write it out.*/
+    "movq %%mm4,(%[dst])\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2]),%%mm3\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
+    "movq %%mm2,%%mm1\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
+    "pxor %%mm3,%%mm1\n\t"
+    /*%%mm3 is free.*/
+    "psubb %%mm0,%%mm6\n\t"
+    /*%%mm0 is free, start loading the next row.*/
+    "movq (%[src1]),%%mm0\n\t"
+    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
+    "movq %%mm4,%%mm3\n\t"
+    /*%%mm6 (row 3) is done; write it out.*/
+    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
+    "pand %%mm7,%%mm1\n\t"
+    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
+    "pavgb %%mm5,%%mm4\n\t"
+    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+    "psubb %%mm1,%%mm2\n\t"
+    /*%%mm1 is free; continue loading the next row.*/
+    "movq (%[src2]),%%mm1\n\t"
+    "pxor %%mm5,%%mm3\n\t"
+    /*%%mm2 (row 4) is done; write it out.*/
+    "movq %%mm2,(%[dst])\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
+    "psubb %%mm3,%%mm4\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
+    /*%%mm4 (row 5) is done; write it out.*/
+    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
+    "movq %%mm2,%%mm4\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm4\n\t"
+    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+    "psubb %%mm0,%%mm6\n\t"
+    "pand %%mm7,%%mm4\n\t"
+    /*%%mm6 (row 6) is done, write it out.*/
+    "movq %%mm6,(%[dst])\n\t"
+    "psubb %%mm4,%%mm2\n\t"
+    /*%%mm2 (row 7) is done, write it out.*/
+    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
+    :[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2)
+    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
+     [src_ystride]"r"((ptrdiff_t)_src_ystride)
+    :"memory"
+  );
+}
+
+unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  OC_ALIGN8(unsigned char ref[64]);
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
+}
+
+unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+ const unsigned char *_src,int _ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  unsigned ret;
+  unsigned ret2;
+  unsigned dc;
+  __asm__ __volatile__(
+    OC_LOAD_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    "movd %%mm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    "pmaddwd %%mm7,%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    "pmaddwd %%mm7,%%mm0\n\t"
+    /*We assume that the DC coefficient is always positive (which is true,
+       because the input to the INTRA transform was not a difference).*/
+    "movzx %w[dc],%[dc]\n\t"
+    "add %[ret],%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movd %%mm4,%[ret2]\n\t"
+    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
+    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
+       and %[ret2] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf] (which is also
+       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
+    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+    /*We have to use sub, so we actually clobber the condition codes for once
+       (not to mention add).*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  int i;
+  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*mm0=[src]*/
+      "movq (%[src]),%%mm0\n\t"
+      /*mm1=[ref]*/
+      "movq (%[ref]),%%mm1\n\t"
+      /*mm4=[src+ystride]*/
+      "movq (%[src],%[ystride]),%%mm4\n\t"
+      /*mm5=[ref+ystride]*/
+      "movq (%[ref],%[ystride]),%%mm5\n\t"
+      /*Compute [src]-[ref].*/
+      "movq %%mm0,%%mm2\n\t"
+      "punpcklbw %%mm7,%%mm0\n\t"
+      "movq %%mm1,%%mm3\n\t"
+      "punpckhbw %%mm7,%%mm2\n\t"
+      "punpcklbw %%mm7,%%mm1\n\t"
+      "punpckhbw %%mm7,%%mm3\n\t"
+      "psubw %%mm1,%%mm0\n\t"
+      "psubw %%mm3,%%mm2\n\t"
+      /*Compute [src+ystride]-[ref+ystride].*/
+      "movq %%mm4,%%mm1\n\t"
+      "punpcklbw %%mm7,%%mm4\n\t"
+      "movq %%mm5,%%mm3\n\t"
+      "punpckhbw %%mm7,%%mm1\n\t"
+      "lea (%[src],%[ystride],2),%[src]\n\t"
+      "punpcklbw %%mm7,%%mm5\n\t"
+      "lea (%[ref],%[ystride],2),%[ref]\n\t"
+      "punpckhbw %%mm7,%%mm3\n\t"
+      "psubw %%mm5,%%mm4\n\t"
+      "psubw %%mm3,%%mm1\n\t"
+      /*Write the answer out.*/
+      "movq %%mm0,0x00(%[residue])\n\t"
+      "movq %%mm2,0x08(%[residue])\n\t"
+      "movq %%mm4,0x10(%[residue])\n\t"
+      "movq %%mm1,0x18(%[residue])\n\t"
+      "lea 0x20(%[residue]),%[residue]\n\t"
+      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
+      :[ystride]"r"((ptrdiff_t)_ystride)
+      :"memory"
+    );
+  }
+}
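The scalar equivalent of the loop above is an 8x8 widening subtraction (a reference sketch, using the ogg types from the surrounding file):

  static void oc_frag_sub_c(ogg_int16_t _residue[64],
   const unsigned char *_src,const unsigned char *_ref,int _ystride){
    int i;
    for(i=0;i<8;i++){
      int j;
      for(j=0;j<8;j++)_residue[(i<<3)+j]=(ogg_int16_t)(_src[j]-_ref[j]);
      _src+=_ystride;
      _ref+=_ystride;
    }
  }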
+
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){
+  ptrdiff_t ystride3;
+  __asm__ __volatile__(
+    /*mm0=[src]*/
+    "movq (%[src]),%%mm0\n\t"
+    /*mm1=[src+ystride]*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*mm6={-1}x4*/
+    "pcmpeqw %%mm6,%%mm6\n\t"
+    /*mm2=[src+2*ystride]*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*[ystride3]=3*[ystride]*/
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    /*mm6={0x8000}x4*/
+    "psllw $15,%%mm6\n\t"
+    /*mm3=[src+3*ystride]*/
+    "movq (%[src],%[ystride3]),%%mm3\n\t"
+    /*mm6={128}x4*/
+    "psrlw $8,%%mm6\n\t"
+    /*mm7=0*/
+    "pxor %%mm7,%%mm7\n\t"
+    /*[src]=[src]+4*[ystride]*/
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    /*Compute [src]-128 and [src+ystride]-128*/
+    "movq %%mm0,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm0\n\t"
+    "movq %%mm1,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm0\n\t"
+    "punpcklbw %%mm7,%%mm1\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm0,0x00(%[residue])\n\t"
+    "movq %%mm4,0x08(%[residue])\n\t"
+    "movq %%mm1,0x10(%[residue])\n\t"
+    "movq %%mm5,0x18(%[residue])\n\t"
+    /*mm0=[src+4*ystride]*/
+    "movq (%[src]),%%mm0\n\t"
+    /*mm1=[src+5*ystride]*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    "movq %%mm2,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "punpcklbw %%mm7,%%mm3\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm2,0x20(%[residue])\n\t"
+    "movq %%mm4,0x28(%[residue])\n\t"
+    "movq %%mm3,0x30(%[residue])\n\t"
+    "movq %%mm5,0x38(%[residue])\n\t"
+    /*mm2=[src+6*ystride]*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*mm3=[src+7*ystride]*/
+    "movq (%[src],%[ystride3]),%%mm3\n\t"
+    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
+    "movq %%mm0,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm0\n\t"
+    "movq %%mm1,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm0\n\t"
+    "punpcklbw %%mm7,%%mm1\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm0,0x40(%[residue])\n\t"
+    "movq %%mm4,0x48(%[residue])\n\t"
+    "movq %%mm1,0x50(%[residue])\n\t"
+    "movq %%mm5,0x58(%[residue])\n\t"
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    "movq %%mm2,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "punpcklbw %%mm7,%%mm3\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm2,0x60(%[residue])\n\t"
+    "movq %%mm4,0x68(%[residue])\n\t"
+    "movq %%mm3,0x70(%[residue])\n\t"
+    "movq %%mm5,0x78(%[residue])\n\t"
+    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
+    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
+    :"memory"
+  );
+}
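The pcmpeqw/psllw/psrlw prologue above synthesizes the {128}x4 constant entirely in registers, avoiding a memory load. The same bit trick on a single 16-bit lane (a sketch):

  /*Builds 128 from all ones by shifting, one lane of the sequence above.*/
  static unsigned short oc_make_128(void){
    unsigned short v;
    v=0xFFFF;  /*pcmpeqw: all ones.*/
    v<<=15;    /*psllw $15: 0x8000.*/
    v>>=8;     /*psrlw $8: 0x0080=128.*/
    return v;
  }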
+
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
+}
+
+#endif

+ 665 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxfdct.c

@@ -0,0 +1,665 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+/*MMX fDCT implementation for x86_32*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+# define OC_FDCT_STAGE1_8x4 \
+ "#OC_FDCT_STAGE1_8x4\n\t" \
+ /*Stage 1:*/ \
+ /*mm0=t7'=t0-t7*/ \
+ "psubw %%mm7,%%mm0\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*mm1=t6'=t1-t6*/ \
+ "psubw %%mm6,%%mm1\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ /*mm2=t5'=t2-t5*/ \
+ "psubw %%mm5,%%mm2\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*mm3=t4'=t3-t4*/ \
+ "psubw %%mm4,%%mm3\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ /*mm7=t0'=t0+t7*/ \
+ "paddw %%mm0,%%mm7\n\t" \
+ /*mm6=t1'=t1+t6*/ \
+ "paddw %%mm1,%%mm6\n\t" \
+ /*mm5=t2'=t2+t5*/ \
+ "paddw %%mm2,%%mm5\n\t" \
+ /*mm4=t3'=t3+t4*/ \
+ "paddw %%mm3,%%mm4\n\t" \
+
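Stage 1 above takes sums and differences of the mirrored pairs (t0,t7), (t1,t6), (t2,t5) and (t3,t4), leaving the differences in mm0..mm3 and the sums in mm4..mm7. A plain-C sketch for one column (ogg types as in the surrounding file):

  static void oc_fdct_stage1(ogg_int16_t _t[8]){
    int i;
    for(i=0;i<4;i++){
      ogg_int16_t a;
      ogg_int16_t b;
      a=_t[i];
      b=_t[7-i];
      _t[i]=(ogg_int16_t)(a+b);    /*t0'..t3': sums.*/
      _t[7-i]=(ogg_int16_t)(a-b);  /*t7'..t4': differences.*/
    }
  }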
+# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
+ "#OC_FDCT8x4\n\t" \
+ /*Stage 2:*/ \
+ /*mm7=t3''=t0'-t3'*/ \
+ "psubw %%mm4,%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ /*mm6=t2''=t1'-t2'*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "movq %%mm7,"_r6"(%[y])\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*mm1=t5''=t6'-t5'*/ \
+ "psubw %%mm2,%%mm1\n\t" \
+ "movq %%mm6,"_r2"(%[y])\n\t" \
+ /*mm4=t0''=t0'+t3'*/ \
+ "paddw %%mm7,%%mm4\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ /*mm5=t1''=t1'+t2'*/ \
+ "movq %%mm4,"_r0"(%[y])\n\t" \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*mm2=t6''=t6'+t5'*/ \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq %%mm5,"_r4"(%[y])\n\t" \
+ /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+ /*mm4, mm5, mm6, mm7 are free.*/ \
+ /*Stage 3:*/ \
+ /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+ "mov $0x5A806A0A,%[a]\n\t" \
+ "pcmpeqb %%mm6,%%mm6\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psrlw $15,%%mm6\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ /*mm0=0, mm2={-1}x4 \
+   mm5:mm4=t5''*27146+0xB500*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm0,"_r7"(%[y])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqb %%mm2,%%mm2\n\t" \
+ /*mm2=t6'', mm1=t5''+(t5''!=0) \
+   mm4=(t5''*27146+0xB500>>16)*/ \
+ "pcmpeqw %%mm1,%%mm0\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "psubw %%mm2,%%mm0\n\t" \
+ "movq "_r3"(%[y]),%%mm2\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "paddw %%mm0,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+ "paddw %%mm1,%%mm4\n\t" \
+ "movq "_r7"(%[y]),%%mm0\n\t" \
+ "psraw $1,%%mm4\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ /*mm3=t4''=t4'+s*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*mm1=t5'''=t4'-s*/ \
+ "psubw %%mm4,%%mm1\n\t" \
+ /*mm1=0, mm3={-1}x4 \
+   mm5:mm4=t6''*27146+0xB500*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm1,"_r5"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm3,"_r1"(%[y])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "pxor %%mm1,%%mm1\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm1\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ /*mm1=t1'' \
+   mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+ "paddw %%mm2,%%mm4\n\t" \
+ "movq "_r4"(%[y]),%%mm1\n\t" \
+ "psraw $1,%%mm4\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ /*mm7={54491-0x7FFF,0x7FFF}x2 \
+   mm0=t7''=t7'+s*/ \
+ "paddw %%mm4,%%mm0\n\t" \
+ /*mm2=t6'''=t7'-s*/ \
+ "psubw %%mm4,%%mm2\n\t" \
+ /*Stage 4:*/ \
+ /*mm0=0, mm2=t0'' \
+   mm5:mm4=t1''*27146+0xB500*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq "_r0"(%[y]),%%mm2\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movq %%mm0,"_r7"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ /*mm7={27146,0x4000>>1}x2 \
+   mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "mov $0x20006A0A,%[a]\n\t" \
+ "pcmpeqw %%mm1,%%mm0\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "paddw %%mm1,%%mm0\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm4,%%mm0\n\t" \
+ /*mm6={0x00000E3D}x2 \
+   mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "mov $0x0E3D,%[a]\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pxor %%mm1,%%mm1\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ "pcmpeqw %%mm2,%%mm1\n\t" \
+ /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movq "_r5"(%[y]),%%mm1\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
+   The naive implementation could cause overflow, so we use \
+    u=(r&s)+((r^s)>>1).*/ \
+ "movq "_r3"(%[y]),%%mm2\n\t" \
+ "movq %%mm0,%%mm7\n\t" \
+ "pxor %%mm4,%%mm0\n\t" \
+ "pand %%mm4,%%mm7\n\t" \
+ "psraw $1,%%mm0\n\t" \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "paddw %%mm7,%%mm0\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ /*mm7={54491-0x7FFF,0x7FFF}x2 \
+   mm4=_y[4]=v=r-u*/ \
+ "psubw %%mm0,%%mm4\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "movq %%mm4,"_r4"(%[y])\n\t" \
+ /*mm0=0, mm7={36410}x4 \
+   mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "mov $0x8E3A8E3A,%[a]\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm0,"_r0"(%[y])\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm0=0 \
+   mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "paddw %%mm2,%%mm1\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm3={-1}x4, mm6={1}x4 \
+   mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ /*mm1=t7'', mm7={26568,0x3400}x2 \
+   mm2=s=t6'''-(36410*u>>16)*/ \
+ "movq %%mm4,%%mm1\n\t" \
+ "mov $0x340067C8,%[a]\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "movq %%mm1,"_r5"(%[y])\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm1,%%mm4\n\t" \
+ "movq "_r7"(%[y]),%%mm1\n\t" \
+ "psubw %%mm4,%%mm2\n\t" \
+ /*mm6={0x00007B1B}x2 \
+   mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x7B1B,%[a]\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ /*mm7={64277-0x7FFF,0x7FFF}x2 \
+   mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+ "psrad $17,%%mm4\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psrad $17,%%mm5\n\t" \
+ "mov $0x7FFF7B16,%[a]\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "paddw %%mm4,%%mm2\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm0=0, mm7={12785}x4 \
+   mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "movq "_r1"(%[y]),%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x31F131F1,%[a]\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm3={-1}x4, mm6={1}x4 \
+   mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ /*mm1=t3'', mm7={20539,0x3000}x2 \
+   mm4=s=(12785*u>>16)-t4''*/ \
+ "movq %%mm4,"_r1"(%[y])\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "mov $0x3000503B,%[a]\n\t" \
+ "movq "_r6"(%[y]),%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm2,%%mm4\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm6={0x00006CB7}x2 \
+   mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x6CB7,%[a]\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ /*mm7={60547-0x7FFF,0x7FFF}x2 \
+   mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+ "psrad $20,%%mm4\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psrad $20,%%mm5\n\t" \
+ "mov $0x7FFF6C84,%[a]\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "paddw %%mm4,%%mm2\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm0=0, mm7={25080}x4 \
+   mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "movq %%mm2,"_r7"(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "movq "_r2"(%[y]),%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x61F861F8,%[a]\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm1={-1}x4 \
+   mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "mov $0x28005460,%[a]\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm1,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+   mm4=s=(25080*u>>16)-t2''*/ \
+ "movq %%mm4,%%mm6\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "pxor %%mm5,%%mm5\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "psubw %%mm2,%%mm4\n\t" \
+ /*mm2=s+(s!=0) \
+   mm4:mm3=s*21600+0x2800*/ \
+ "movq %%mm4,%%mm3\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "punpckhwd %%mm5,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "psubw %%mm1,%%mm0\n\t" \
+ "punpcklwd %%mm5,%%mm3\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm3\n\t" \
+ /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+   mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+ "movq "_r4"(%[y]),%%mm0\n\t" \
+ "psrad $18,%%mm4\n\t" \
+ "movq "_r5"(%[y]),%%mm5\n\t" \
+ "psrad $18,%%mm3\n\t" \
+ "movq "_r7"(%[y]),%%mm1\n\t" \
+ "packssdw %%mm4,%%mm3\n\t" \
+ "movq "_r0"(%[y]),%%mm4\n\t" \
+ "paddw %%mm2,%%mm3\n\t" \
+
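+/*Editorial sketch, not part of the original commit: the overflow-free
+   average used by OC_FDCT8x4 above, in plain C (hypothetical helper).
+  Since r+s==2*(r&s)+(r^s), the shared bits plus half the differing bits
+   yield (r+s)>>1 even when the sum itself would overflow a 16-bit lane.*/
+static ogg_int16_t oc_avg_no_overflow(ogg_int16_t _r,ogg_int16_t _s){
+  return (ogg_int16_t)((_r&_s)+((_r^_s)>>1));
+}
+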
+/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
+  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
+   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
+# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
+ "#OC_TRANSPOSE8x4\n\t" \
+ /*First 4x4 transpose:*/ \
+ /*mm0 = e3 e2 e1 e0 \
+   mm5 = f3 f2 f1 f0 \
+   mm3 = g3 g2 g1 g0 \
+   mm1 = h3 h2 h1 h0*/ \
+ "movq %%mm0,%%mm2\n\t" \
+ "punpcklwd %%mm5,%%mm0\n\t" \
+ "punpckhwd %%mm5,%%mm2\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "punpcklwd %%mm1,%%mm3\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ /*mm0 = f1 e1 f0 e0 \
+   mm2 = f3 e3 f2 e2 \
+   mm3 = h1 g1 h0 g0 \
+   mm5 = h3 g3 h2 g2*/ \
+ "movq %%mm0,%%mm1\n\t" \
+ "punpckldq %%mm3,%%mm0\n\t" \
+ "movq %%mm0,"_r4"(%[y])\n\t" \
+ "punpckhdq %%mm3,%%mm1\n\t" \
+ "movq "_r1"(%[y]),%%mm0\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ "punpckhdq %%mm5,%%mm3\n\t" \
+ "movq "_r3"(%[y]),%%mm5\n\t" \
+ /*_y[4] = h0 g0 f0 e0 \
+    mm1  = h1 g1 f1 e1 \
+    mm2  = h2 g2 f2 e2 \
+    mm3  = h3 g3 f3 e3*/ \
+ /*Second 4x4 transpose:*/ \
+ /*mm4 = a3 a2 a1 a0 \
+   mm0 = b3 b2 b1 b0 \
+   mm6 = c3 c2 c1 c0 \
+   mm5 = d3 d2 d1 d0*/ \
+ "movq %%mm4,%%mm7\n\t" \
+ "punpcklwd %%mm0,%%mm4\n\t" \
+ "punpckhwd %%mm0,%%mm7\n\t" \
+ "movq %%mm6,%%mm0\n\t" \
+ "punpcklwd %%mm5,%%mm6\n\t" \
+ "punpckhwd %%mm5,%%mm0\n\t" \
+ /*mm4 = b1 a1 b0 a0 \
+   mm7 = b3 a3 b2 a2 \
+   mm6 = d1 c1 d0 c0 \
+   mm0 = d3 c3 d2 c2*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "punpckldq %%mm6,%%mm4\n\t" \
+ "punpckhdq %%mm6,%%mm5\n\t" \
+ "movq %%mm7,%%mm6\n\t" \
+ "punpckhdq %%mm0,%%mm7\n\t" \
+ "punpckldq %%mm0,%%mm6\n\t" \
+ /*mm4 = d0 c0 b0 a0 \
+   mm5 = d1 c1 b1 a1 \
+   mm6 = d2 c2 b2 a2 \
+   mm7 = d3 c3 b3 a3*/ \
+
+/*MMX implementation of the fDCT.*/
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a;
+  __asm__ __volatile__(
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add biases to correct for some systematic error that remains in
+       the full fDCT->iDCT round trip.*/
+    "movq 0x00(%[x]),%%mm0\n\t"
+    "movq 0x10(%[x]),%%mm1\n\t"
+    "movq 0x20(%[x]),%%mm2\n\t"
+    "movq 0x30(%[x]),%%mm3\n\t"
+    "pcmpeqb %%mm4,%%mm4\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "movq %%mm0,%%mm5\n\t"
+    "psllw $2,%%mm0\n\t"
+    "pcmpeqw %%mm7,%%mm5\n\t"
+    "movq 0x70(%[x]),%%mm7\n\t"
+    "psllw $2,%%mm1\n\t"
+    "psubw %%mm4,%%mm5\n\t"
+    "psllw $2,%%mm2\n\t"
+    "mov $1,%[a]\n\t"
+    "pslld $16,%%mm5\n\t"
+    "movd %[a],%%mm6\n\t"
+    "psllq $16,%%mm5\n\t"
+    "mov $0x10001,%[a]\n\t"
+    "psllw $2,%%mm3\n\t"
+    "movd %[a],%%mm4\n\t"
+    "punpckhwd %%mm6,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "movq 0x60(%[x]),%%mm6\n\t"
+    "paddw %%mm5,%%mm0\n\t"
+    "movq 0x50(%[x]),%%mm5\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "movq 0x40(%[x]),%%mm4\n\t"
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    "psllw $2,%%mm7\n\t"
+    "psubw %%mm7,%%mm0\n\t"
+    "psllw $2,%%mm6\n\t"
+    "paddw %%mm7,%%mm7\n\t"
+    /*mm1=t6'=t1-t6*/
+    "psllw $2,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psllw $2,%%mm4\n\t"
+    "paddw %%mm6,%%mm6\n\t"
+    /*mm2=t5'=t2-t5*/
+    "psubw %%mm5,%%mm2\n\t"
+    "paddw %%mm5,%%mm5\n\t"
+    /*mm3=t4'=t3-t4*/
+    "psubw %%mm4,%%mm3\n\t"
+    "paddw %%mm4,%%mm4\n\t"
+    /*mm7=t0'=t0+t7*/
+    "paddw %%mm0,%%mm7\n\t"
+    /*mm6=t1'=t1+t6*/
+    "paddw %%mm1,%%mm6\n\t"
+    /*mm5=t2'=t2+t5*/
+    "paddw %%mm2,%%mm5\n\t"
+    /*mm4=t3'=t3+t4*/
+    "paddw %%mm3,%%mm4\n\t"
+    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
+    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
+    /*Swap out this 8x4 block for the next one.*/
+    "movq 0x08(%[x]),%%mm0\n\t"
+    "movq %%mm7,0x30(%[y])\n\t"
+    "movq 0x78(%[x]),%%mm7\n\t"
+    "movq %%mm1,0x50(%[y])\n\t"
+    "movq 0x18(%[x]),%%mm1\n\t"
+    "movq %%mm6,0x20(%[y])\n\t"
+    "movq 0x68(%[x]),%%mm6\n\t"
+    "movq %%mm2,0x60(%[y])\n\t"
+    "movq 0x28(%[x]),%%mm2\n\t"
+    "movq %%mm5,0x10(%[y])\n\t"
+    "movq 0x58(%[x]),%%mm5\n\t"
+    "movq %%mm3,0x70(%[y])\n\t"
+    "movq 0x38(%[x]),%%mm3\n\t"
+    /*And increase its working precision, too.*/
+    "psllw $2,%%mm0\n\t"
+    "movq %%mm4,0x00(%[y])\n\t"
+    "psllw $2,%%mm7\n\t"
+    "movq 0x48(%[x]),%%mm4\n\t"
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    "psubw %%mm7,%%mm0\n\t"
+    "psllw $2,%%mm1\n\t"
+    "paddw %%mm7,%%mm7\n\t"
+    "psllw $2,%%mm6\n\t"
+    /*mm1=t6'=t1-t6*/
+    "psubw %%mm6,%%mm1\n\t"
+    "psllw $2,%%mm2\n\t"
+    "paddw %%mm6,%%mm6\n\t"
+    "psllw $2,%%mm5\n\t"
+    /*mm2=t5'=t2-t5*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $2,%%mm3\n\t"
+    "paddw %%mm5,%%mm5\n\t"
+    "psllw $2,%%mm4\n\t"
+    /*mm3=t4'=t3-t4*/
+    "psubw %%mm4,%%mm3\n\t"
+    "paddw %%mm4,%%mm4\n\t"
+    /*mm7=t0'=t0+t7*/
+    "paddw %%mm0,%%mm7\n\t"
+    /*mm6=t1'=t1+t6*/
+    "paddw %%mm1,%%mm6\n\t"
+    /*mm5=t2'=t2+t5*/
+    "paddw %%mm2,%%mm5\n\t"
+    /*mm4=t3'=t3+t4*/
+    "paddw %%mm3,%%mm4\n\t"
+    OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
+    OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place,
+       so we only have to do half the stores and loads.*/
+    "movq 0x00(%[y]),%%mm0\n\t"
+    "movq %%mm1,0x58(%[y])\n\t"
+    "movq 0x10(%[y]),%%mm1\n\t"
+    "movq %%mm2,0x68(%[y])\n\t"
+    "movq 0x20(%[y]),%%mm2\n\t"
+    "movq %%mm3,0x78(%[y])\n\t"
+    "movq 0x30(%[y]),%%mm3\n\t"
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
+    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
+    /*mm0={-2}x4*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    "paddw %%mm0,%%mm0\n\t"
+    /*Round the results.*/
+    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm0,%%mm2\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm0,%%mm3\n\t"
+    "movq %%mm1,0x18(%[y])\n\t"
+    "psraw $2,%%mm2\n\t"
+    "psubw %%mm0,%%mm4\n\t"
+    "movq 0x08(%[y]),%%mm1\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm0,%%mm5\n\t"
+    "psraw $2,%%mm4\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm0,%%mm7\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm0,%%mm1\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq 0x40(%[y]),%%mm0\n\t"
+    "psraw $2,%%mm1\n\t"
+    "movq %%mm7,0x30(%[y])\n\t"
+    "movq 0x78(%[y]),%%mm7\n\t"
+    "movq %%mm1,0x08(%[y])\n\t"
+    "movq 0x50(%[y]),%%mm1\n\t"
+    "movq %%mm6,0x20(%[y])\n\t"
+    "movq 0x68(%[y]),%%mm6\n\t"
+    "movq %%mm2,0x28(%[y])\n\t"
+    "movq 0x60(%[y]),%%mm2\n\t"
+    "movq %%mm5,0x10(%[y])\n\t"
+    "movq 0x58(%[y]),%%mm5\n\t"
+    "movq %%mm3,0x38(%[y])\n\t"
+    "movq 0x70(%[y]),%%mm3\n\t"
+    "movq %%mm4,0x00(%[y])\n\t"
+    "movq 0x48(%[y]),%%mm4\n\t"
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
+    OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
+    /*mm0={-2}x4*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    "paddw %%mm0,%%mm0\n\t"
+    /*Round the results.*/
+    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm0,%%mm2\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm0,%%mm3\n\t"
+    "movq %%mm1,0x58(%[y])\n\t"
+    "psraw $2,%%mm2\n\t"
+    "psubw %%mm0,%%mm4\n\t"
+    "movq 0x48(%[y]),%%mm1\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm0,%%mm5\n\t"
+    "movq %%mm2,0x68(%[y])\n\t"
+    "psraw $2,%%mm4\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "movq %%mm3,0x78(%[y])\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm0,%%mm7\n\t"
+    "movq %%mm4,0x40(%[y])\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm0,%%mm1\n\t"
+    "movq %%mm5,0x50(%[y])\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq %%mm6,0x60(%[y])\n\t"
+    "psraw $2,%%mm1\n\t"
+    "movq %%mm7,0x70(%[y])\n\t"
+    "movq %%mm1,0x48(%[y])\n\t"
+    :[a]"=&r"(a)
+    :[y]"r"(_y),[x]"r"(_x)
+    :"memory"
+  );
+}
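+
+/*Editorial sketch, not part of the original commit: the "Round the results"
+   steps above, in plain C (hypothetical helper).
+  The transform ran with two extra bits of precision (the psllw $2 at load
+   time), so each coefficient is rounded back with (y+2)>>2; subtracting the
+   {-2}x4 constant is how the +2 bias gets applied.*/
+static ogg_int16_t oc_fdct_round2(ogg_int16_t _y){
+  return (ogg_int16_t)(_y+2>>2);
+}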
+
+#endif

+ 368 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxfrag.c

@@ -0,0 +1,368 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+  Originally written by Rudolf Marek.
+  Additional optimization by Nils Pipenbrinck.
+  Note: Loops are unrolled for best performance.
+  The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    ptrdiff_t            ystride3; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm__ __volatile__( \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*ystride3=ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+}
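+
+/*Editorial sketch, not part of the original commit: a plain-C equivalent of
+   OC_FRAG_COPY_MMX (hypothetical helper), moving the same 64 bytes the MMX
+   version covers with eight pairs of movq instructions.*/
+static void oc_frag_copy_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  int i;
+  int j;
+  /*Copy 8 rows of 8 bytes, _ystride bytes apart.*/
+  for(i=0;i<8;i++){
+    for(j=0;j<8;j++)_dst[j]=_src[j];
+    _dst+=_ystride;
+    _src+=_ystride;
+  }
+}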
+
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+  }
+}
+
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+  __asm__ __volatile__(
+    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    /*#0 Load low residue.*/
+    "movq 0*8(%[residue]),%%mm1\n\t"
+    /*#0 Load high residue.*/
+    "movq 1*8(%[residue]),%%mm2\n\t"
+    /*Set mm0 to 0x8000800080008000.*/
+    "psllw $15,%%mm0\n\t"
+    /*#1 Load low residue.*/
+    "movq 2*8(%[residue]),%%mm3\n\t"
+    /*#1 Load high residue.*/
+    "movq 3*8(%[residue]),%%mm4\n\t"
+    /*Set mm0 to 0x0080008000800080.*/
+    "psrlw $8,%%mm0\n\t"
+    /*#2 Load low residue.*/
+    "movq 4*8(%[residue]),%%mm5\n\t"
+    /*#2 Load high residue.*/
+    "movq 5*8(%[residue]),%%mm6\n\t"
+    /*#0 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#0 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#0 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#1 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#1 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#1 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#2 Bias low  residue.*/
+    "paddsw %%mm0,%%mm5\n\t"
+    /*#2 Bias high residue.*/
+    "paddsw %%mm0,%%mm6\n\t"
+    /*#2 Pack to byte.*/
+    "packuswb %%mm6,%%mm5\n\t"
+    /*#0 Write row.*/
+    "movq %%mm1,(%[dst])\n\t"
+    /*#1 Write row.*/
+    "movq %%mm3,(%[dst],%[ystride])\n\t"
+    /*#2 Write row.*/
+    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
+    /*#3 Load low residue.*/
+    "movq 6*8(%[residue]),%%mm1\n\t"
+    /*#3 Load high residue.*/
+    "movq 7*8(%[residue]),%%mm2\n\t"
+    /*#4 Load low residue.*/
+    "movq 8*8(%[residue]),%%mm3\n\t"
+    /*#4 Load high residue.*/
+    "movq 9*8(%[residue]),%%mm4\n\t"
+    /*#5 Load low residue.*/
+    "movq 10*8(%[residue]),%%mm5\n\t"
+    /*#5 Load high residue.*/
+    "movq 11*8(%[residue]),%%mm6\n\t"
+    /*#3 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#3 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#3 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#4 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#4 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#4 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#5 Bias low  residue.*/
+    "paddsw %%mm0,%%mm5\n\t"
+    /*#5 Bias high residue.*/
+    "paddsw %%mm0,%%mm6\n\t"
+    /*#5 Pack to byte.*/
+    "packuswb %%mm6,%%mm5\n\t"
+    /*#3 Write row.*/
+    "movq %%mm1,(%[dst],%[ystride3])\n\t"
+    /*#4 Write row.*/
+    "movq %%mm3,(%[dst4])\n\t"
+    /*#5 Write row.*/
+    "movq %%mm5,(%[dst4],%[ystride])\n\t"
+    /*#6 Load low residue.*/
+    "movq 12*8(%[residue]),%%mm1\n\t"
+    /*#6 Load high residue.*/
+    "movq 13*8(%[residue]),%%mm2\n\t"
+    /*#7 Load low residue.*/
+    "movq 14*8(%[residue]),%%mm3\n\t"
+    /*#7 Load high residue.*/
+    "movq 15*8(%[residue]),%%mm4\n\t"
+    /*#6 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#6 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#6 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#7 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#7 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#7 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#6 Write row.*/
+    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
+    /*#7 Write row.*/
+    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
+    :
+    :[residue]"r"(_residue),
+     [dst]"r"(_dst),
+     [dst4]"r"(_dst+(_ystride<<2)),
+     [ystride]"r"((ptrdiff_t)_ystride),
+     [ystride3]"r"((ptrdiff_t)_ystride*3)
+    :"memory"
+  );
+}
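+
+/*Editorial sketch, not part of the original commit: what the bias-and-pack
+   sequence above computes, in plain C (hypothetical helper).
+  Adding 0x0080 with signed saturation and then packing with unsigned
+   saturation is equivalent to clamping residue+128 to [0,255].*/
+static void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+  int i;
+  int j;
+  for(i=0;i<8;i++){
+    for(j=0;j<8;j++){
+      int p;
+      p=_residue[i*8+j]+128;
+      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
+    }
+    _dst+=_ystride;
+  }
+}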
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*Zero mm0.*/
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*#0 Load source.*/
+      "movq (%[src]),%%mm3\n\t"
+      /*#1 Load source.*/
+      "movq (%[src],%[ystride]),%%mm7\n\t"
+      /*#0 Get copy of src.*/
+      "movq %%mm3,%%mm4\n\t"
+      /*#0 Expand high source.*/
+      "punpckhbw %%mm0,%%mm4\n\t"
+      /*#0 Expand low  source.*/
+      "punpcklbw %%mm0,%%mm3\n\t"
+      /*#0 Add residue high.*/
+      "paddsw 8(%[residue]),%%mm4\n\t"
+      /*#1 Get copy of src.*/
+      "movq %%mm7,%%mm2\n\t"
+      /*#0 Add residue low.*/
+      "paddsw (%[residue]), %%mm3\n\t"
+      /*#1 Expand high source.*/
+      "punpckhbw %%mm0,%%mm2\n\t"
+      /*#0 Pack final row pixels.*/
+      "packuswb %%mm4,%%mm3\n\t"
+      /*#1 Expand low  source.*/
+      "punpcklbw %%mm0,%%mm7\n\t"
+      /*#1 Add residue low.*/
+      "paddsw 16(%[residue]),%%mm7\n\t"
+      /*#1 Add residue high.*/
+      "paddsw 24(%[residue]),%%mm2\n\t"
+      /*Advance residue.*/
+      "lea 32(%[residue]),%[residue]\n\t"
+      /*#1 Pack final row pixels.*/
+      "packuswb %%mm2,%%mm7\n\t"
+      /*Advance src.*/
+      "lea (%[src],%[ystride],2),%[src]\n\t"
+      /*#0 Write row.*/
+      "movq %%mm3,(%[dst])\n\t"
+      /*#1 Write row.*/
+      "movq %%mm7,(%[dst],%[ystride])\n\t"
+      /*Advance dst.*/
+      "lea (%[dst],%[ystride],2),%[dst]\n\t"
+      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
+      :[ystride]"r"((ptrdiff_t)_ystride)
+      :"memory"
+    );
+  }
+}
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*Zero mm7.*/
+  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*#0 Load src1.*/
+      "movq (%[src1]),%%mm0\n\t"
+      /*#0 Load src2.*/
+      "movq (%[src2]),%%mm2\n\t"
+      /*#0 Copy src1.*/
+      "movq %%mm0,%%mm1\n\t"
+      /*#0 Copy src2.*/
+      "movq %%mm2,%%mm3\n\t"
+      /*#1 Load src1.*/
+      "movq (%[src1],%[ystride]),%%mm4\n\t"
+      /*#0 Unpack lower src1.*/
+      "punpcklbw %%mm7,%%mm0\n\t"
+      /*#1 Load src2.*/
+      "movq (%[src2],%[ystride]),%%mm5\n\t"
+      /*#0 Unpack higher src1.*/
+      "punpckhbw %%mm7,%%mm1\n\t"
+      /*#0 Unpack lower src2.*/
+      "punpcklbw %%mm7,%%mm2\n\t"
+      /*#0 Unpack higher src2.*/
+      "punpckhbw %%mm7,%%mm3\n\t"
+      /*Advance src1 ptr.*/
+      "lea (%[src1],%[ystride],2),%[src1]\n\t"
+      /*Advance src2 ptr.*/
+      "lea (%[src2],%[ystride],2),%[src2]\n\t"
+      /*#0 Lower src1+src2.*/
+      "paddsw %%mm2,%%mm0\n\t"
+      /*#0 Higher src1+src2.*/
+      "paddsw %%mm3,%%mm1\n\t"
+      /*#1 Copy src1.*/
+      "movq %%mm4,%%mm2\n\t"
+      /*#0 Build lo average.*/
+      "psraw $1,%%mm0\n\t"
+      /*#1 Copy src2.*/
+      "movq %%mm5,%%mm3\n\t"
+      /*#1 Unpack lower src1.*/
+      "punpcklbw %%mm7,%%mm4\n\t"
+      /*#0 Build hi average.*/
+      "psraw $1,%%mm1\n\t"
+      /*#1 Unpack higher src1.*/
+      "punpckhbw %%mm7,%%mm2\n\t"
+      /*#0 low+=residue.*/
+      "paddsw (%[residue]),%%mm0\n\t"
+      /*#1 Unpack lower src2.*/
+      "punpcklbw %%mm7,%%mm5\n\t"
+      /*#0 high+=residue.*/
+      "paddsw 8(%[residue]),%%mm1\n\t"
+      /*#1 Unpack higher src2.*/
+      "punpckhbw %%mm7,%%mm3\n\t"
+      /*#1 Lower src1+src2.*/
+      "paddsw %%mm4,%%mm5\n\t"
+      /*#0 Pack and saturate.*/
+      "packuswb %%mm1,%%mm0\n\t"
+      /*#1 Higher src1+src2.*/
+      "paddsw %%mm2,%%mm3\n\t"
+      /*#0 Write row.*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*#1 Build lo average.*/
+      "psraw $1,%%mm5\n\t"
+      /*#1 Build hi average.*/
+      "psraw $1,%%mm3\n\t"
+      /*#1 low+=residue.*/
+      "paddsw 16(%[residue]),%%mm5\n\t"
+      /*#1 high+=residue.*/
+      "paddsw 24(%[residue]),%%mm3\n\t"
+      /*#1 Pack and saturate.*/
+      "packuswb  %%mm3,%%mm5\n\t"
+      /*#1 Write row ptr.*/
+      "movq %%mm5,(%[dst],%[ystride])\n\t"
+      /*Advance residue ptr.*/
+      "add $32,%[residue]\n\t"
+      /*Advance dest ptr.*/
+      "lea (%[dst],%[ystride],2),%[dst]\n\t"
+     :[dst]"+r"(_dst),[residue]"+r"(_residue),
+      [src1]"+%r"(_src1),[src2]"+r"(_src2)
+     :[ystride]"r"((ptrdiff_t)_ystride)
+     :"memory"
+    );
+  }
+}
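+
+/*Editorial sketch, not part of the original commit: the bi-predicted path
+   above in plain C (hypothetical helper).
+  Each pixel is the truncating average of the two predictors plus the
+   residue, clamped to [0,255], matching the psraw $1 and packuswb steps.*/
+static void oc_frag_recon_inter2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t *_residue){
+  int i;
+  int j;
+  for(i=0;i<8;i++){
+    for(j=0;j<8;j++){
+      int p;
+      p=(_src1[j]+_src2[j]>>1)+_residue[i*8+j];
+      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
+    }
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}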
+
+void oc_restore_fpu_mmx(void){
+  __asm__ __volatile__("emms\n\t");
+}
+#endif

+ 562 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxidct.c

@@ -0,0 +1,562 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+/*MMX acceleration of Theora's iDCT.
+  Originally written by Rudolf Marek, based on code from On2's VP3.*/
+#include "x86int.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*These are offsets into the table of constants below.*/
+/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
+#define OC_COSINE_OFFSET (0)
+/*A row of 8's.*/
+#define OC_EIGHT_OFFSET  (56)
+
+
+
+/*38 cycles*/
+#define OC_IDCT_BEGIN(_y,_x) \
+  "#OC_IDCT_BEGIN\n\t" \
+  "movq "OC_I(3,_x)",%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "movq "OC_J(5,_x)",%%mm7\n\t" \
+  "pmulhw %%mm6,%%mm4\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm6\n\t" \
+  "movq %%mm1,%%mm5\n\t" \
+  "pmulhw %%mm2,%%mm1\n\t" \
+  "movq "OC_I(1,_x)",%%mm3\n\t" \
+  "pmulhw %%mm7,%%mm5\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
+  "paddw %%mm2,%%mm4\n\t" \
+  "paddw %%mm7,%%mm6\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "movq "OC_J(7,_x)",%%mm1\n\t" \
+  "paddw %%mm5,%%mm7\n\t" \
+  "movq %%mm0,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm0\n\t" \
+  "paddw %%mm7,%%mm4\n\t" \
+  "pmulhw %%mm1,%%mm5\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm3,%%mm0\n\t" \
+  "pmulhw %%mm7,%%mm3\n\t" \
+  "movq "OC_I(2,_x)",%%mm2\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "paddw %%mm1,%%mm5\n\t" \
+  "movq %%mm2,%%mm1\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
+  "psubw %%mm5,%%mm3\n\t" \
+  "movq "OC_J(6,_x)",%%mm5\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "movq %%mm5,%%mm7\n\t" \
+  "psubw %%mm4,%%mm0\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psubw %%mm6,%%mm3\n\t" \
+  "paddw %%mm7,%%mm5\n\t" \
+  "paddw %%mm6,%%mm6\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm6\n\t" \
+  "movq %%mm4,"OC_I(1,_y)"\n\t" \
+  "psubw %%mm5,%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+  "movq %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm4,%%mm3\n\t" \
+  "paddw %%mm2,%%mm7\n\t" \
+  "movq %%mm6,"OC_I(2,_y)"\n\t" \
+  "movq %%mm0,%%mm2\n\t" \
+  "movq "OC_I(0,_x)",%%mm6\n\t" \
+  "pmulhw %%mm4,%%mm0\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "movq "OC_J(4,_x)",%%mm3\n\t" \
+  "psubw %%mm1,%%mm5\n\t" \
+  "paddw %%mm0,%%mm2\n\t" \
+  "psubw %%mm3,%%mm6\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "pmulhw %%mm4,%%mm6\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm0,%%mm3\n\t" \
+  "paddw %%mm5,%%mm1\n\t" \
+  "pmulhw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm6\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "movq "OC_I(1,_y)",%%mm0\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "paddw %%mm3,%%mm4\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "#end OC_IDCT_BEGIN\n\t" \
+
+/*38+8=46 cycles.*/
+#define OC_ROW_IDCT(_y,_x) \
+  "#OC_ROW_IDCT\n" \
+  OC_IDCT_BEGIN(_y,_x) \
+  /*r3=D'*/ \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*Save R1.*/ \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  "#end OC_ROW_IDCT\n\t" \
+
+/*The following macro does two 4x4 transposes in place.
+  At entry, we assume:
+    r0 = a3 a2 a1 a0
+  I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
+
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
+
+  At exit, we have:
+  I(0) = d0 c0 b0 a0
+  I(1) = d1 c1 b1 a1
+  I(2) = d2 c2 b2 a2
+  I(3) = d3 c3 b3 a3
+
+  J(4) = h0 g0 f0 e0
+  J(5) = h1 g1 f1 e1
+  J(6) = h2 g2 f2 e2
+  J(7) = h3 g3 f3 e3
+
+  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+  J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
+
+  Since r1 is free at entry, we calculate the Js first.
+  (A plain-C sketch of the 4x4 transpose follows the macro.)*/
+/*19 cycles.*/
+#define OC_TRANSPOSE(_y) \
+  "#OC_TRANSPOSE\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "punpcklwd %%mm5,%%mm4\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
+  "punpckhwd %%mm5,%%mm1\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "punpcklwd %%mm7,%%mm6\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "punpckldq %%mm6,%%mm4\n\t" \
+  "punpckhdq %%mm6,%%mm5\n\t" \
+  "movq %%mm1,%%mm6\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
+  "punpckhwd %%mm7,%%mm0\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
+  "punpckhdq %%mm0,%%mm6\n\t" \
+  "movq "OC_I(0,_y)",%%mm4\n\t" \
+  "punpckldq %%mm0,%%mm1\n\t" \
+  "movq "OC_I(1,_y)",%%mm5\n\t" \
+  "movq %%mm4,%%mm0\n\t" \
+  "movq %%mm6,"OC_J(7,_y)"\n\t" \
+  "punpcklwd %%mm5,%%mm0\n\t" \
+  "movq %%mm1,"OC_J(6,_y)"\n\t" \
+  "punpckhwd %%mm5,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklwd %%mm3,%%mm2\n\t" \
+  "movq %%mm0,%%mm1\n\t" \
+  "punpckldq %%mm2,%%mm0\n\t" \
+  "punpckhdq %%mm2,%%mm1\n\t" \
+  "movq %%mm4,%%mm2\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
+  "punpckhwd %%mm3,%%mm5\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
+  "punpckhdq %%mm5,%%mm4\n\t" \
+  "punpckldq %%mm5,%%mm2\n\t" \
+  "movq %%mm4,"OC_I(3,_y)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
+  "#end OC_TRANSPOSE\n\t" \
+
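+/*Editorial sketch, not part of the original commit: the net data movement of
+   each half of OC_TRANSPOSE, in plain C (hypothetical helper; rows are eight
+   ogg_int16_t wide and _row0/_col0 select one 4x4 sub-block).*/
+static void oc_transpose4x4_c(ogg_int16_t *_y,int _row0,int _col0){
+  int i;
+  int j;
+  for(i=0;i<4;i++)for(j=i+1;j<4;j++){
+    ogg_int16_t t;
+    t=_y[(_row0+i)*8+_col0+j];
+    _y[(_row0+i)*8+_col0+j]=_y[(_row0+j)*8+_col0+i];
+    _y[(_row0+j)*8+_col0+i]=t;
+  }
+}
+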
+/*38+19=57 cycles.*/
+#define OC_COLUMN_IDCT(_y) \
+  "#OC_COLUMN_IDCT\n" \
+  OC_IDCT_BEGIN(_y,_y) \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r2=NR2*/ \
+  "psraw $4,%%mm2\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=NR1*/ \
+  "psraw $4,%%mm1\n\t" \
+  /*r3=D'*/ \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*Store NR2 at I(2).*/ \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*Store NR1 at I(1).*/ \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
+  /*r3=D'+D'*/ \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r4=NR4*/ \
+  "psraw $4,%%mm4\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*r3=NR3*/ \
+  "psraw $4,%%mm3\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
+  /*r5=B''+B''*/ \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r6=NR6*/ \
+  "psraw $4,%%mm6\n\t" \
+  /*Store NR4 at J(4).*/ \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
+  /*r5=NR5*/ \
+  "psraw $4,%%mm5\n\t" \
+  /*Store NR3 at I(3).*/ \
+  "movq %%mm3,"OC_I(3,_y)"\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
+  /*r0=C'+C'*/ \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  /*r7=NR7*/ \
+  "psraw $4,%%mm7\n\t" \
+  /*Store NR6 at J(6).*/ \
+  "movq %%mm6,"OC_J(6,_y)"\n\t" \
+  /*r0=NR0*/ \
+  "psraw $4,%%mm0\n\t" \
+  /*Store NR5 at J(5).*/ \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
+  /*Store NR7 at J(7).*/ \
+  "movq %%mm7,"OC_J(7,_y)"\n\t" \
+  /*Store NR0 at I(0).*/ \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
+  "#end OC_COLUMN_IDCT\n\t" \
+
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  /*This routine accepts an 8x8 matrix, but in partially transposed form.
+    Every 4x4 block is transposed.*/
+  __asm__ __volatile__(
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
+#undef  OC_I
+#undef  OC_J
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
+  );
+  if(_x!=_y){
+    int i;
+    __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+    for(i=0;i<4;i++){
+      __asm__ __volatile__(
+        "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+        :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+      );
+    }
+  }
+}
+
+/*25 cycles.*/
+#define OC_IDCT_BEGIN_10(_y,_x) \
+ "#OC_IDCT_BEGIN_10\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
+ "nop\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
+ "pmulhw %%mm6,%%mm4\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
+ "pmulhw %%mm2,%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm0\n\t" \
+ "movq %%mm5,%%mm1\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm5,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm3\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
+ "pmulhw %%mm4,%%mm0\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "movq %%mm6,%%mm4\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "nop\n\t" \
+ "#end OC_IDCT_BEGIN_10\n\t" \
+
+/*25+8=33 cycles.*/
+#define OC_ROW_IDCT_10(_y,_x) \
+ "#OC_ROW_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10(_y,_x) \
+ /*r3=D'*/ \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*Save R1.*/ \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ "#end OC_ROW_IDCT_10\n\t" \
+
+/*25+19=44 cycles.*/
+#define OC_COLUMN_IDCT_10(_y) \
+ "#OC_COLUMN_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r2=NR2*/ \
+ "psraw $4,%%mm2\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=NR1*/ \
+ "psraw $4,%%mm1\n\t" \
+ /*r3=D'*/ \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*Store NR2 at I(2).*/ \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*Store NR1 at I(1).*/ \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
+ /*r3=D'+D'*/ \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r4=NR4*/ \
+ "psraw $4,%%mm4\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*r3=NR3*/ \
+ "psraw $4,%%mm3\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
+ /*r5=B''+B''*/ \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r6=NR6*/ \
+ "psraw $4,%%mm6\n\t" \
+ /*Store NR4 at J(4).*/ \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
+ /*r5=NR5*/ \
+ "psraw $4,%%mm5\n\t" \
+ /*Store NR3 at I(3).*/ \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
+ /*r0=C'+C'*/ \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ /*r7=NR7*/ \
+ "psraw $4,%%mm7\n\t" \
+ /*Store NR6 at J(6).*/ \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
+ /*r0=NR0*/ \
+ "psraw $4,%%mm0\n\t" \
+ /*Store NR5 at J(5).*/ \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
+ /*Store NR7 at J(7).*/ \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
+ /*Store NR0 at I(0).*/ \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
+ "#end OC_COLUMN_IDCT_10\n\t" \
+
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  __asm__ __volatile__(
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
+    /*Done with dequant, descramble, and partial transpose.
+      Now do the iDCT itself.*/
+    OC_ROW_IDCT_10(y,x)
+    OC_TRANSPOSE(y)
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
+#undef  OC_I
+#undef  OC_J
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
+  );
+  if(_x!=_y){
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+    );
+  }
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Then perform the iDCT.*/
+  if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+  else oc_idct8x8_slow_mmx(_y,_x);
+}
+
+#endif

+ 318 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxloop.h

@@ -0,0 +1,318 @@
+#if !defined(_x86_mmxloop_H)
+# define _x86_mmxloop_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
+#define OC_LOOP_FILTER8_MMX \
+  "#OC_LOOP_FILTER8_MMX\n\t" \
+  /*mm7=0*/ \
+  "pxor %%mm7,%%mm7\n\t" \
+  /*mm6:mm0={a0,...,a7}*/ \
+  "movq %%mm0,%%mm6\n\t" \
+  "punpcklbw %%mm7,%%mm0\n\t" \
+  "punpckhbw %%mm7,%%mm6\n\t" \
+  /*mm3:mm5={d0,...,d7}*/ \
+  "movq %%mm3,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm3\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  "psubw %%mm3,%%mm0\n\t" \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*mm3:mm1={b0,...,b7}*/ \
+  "movq %%mm1,%%mm3\n\t" \
+  "punpcklbw %%mm7,%%mm1\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm3\n\t" \
+  /*mm5:mm4={c0,...,c7}*/ \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  "pcmpeqw %%mm7,%%mm7\n\t" \
+  "psubw %%mm1,%%mm4\n\t" \
+  "psrlw $14,%%mm7\n\t" \
+  "psubw %%mm3,%%mm5\n\t" \
+  /*Scale by 3.*/ \
+  "pmullw %%mm7,%%mm4\n\t" \
+  "pmullw %%mm7,%%mm5\n\t" \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  "psrlw $1,%%mm7\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psllw $2,%%mm7\n\t" \
+  "movq (%[ll]),%%mm0\n\t" \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm7,%%mm5\n\t" \
+  "psraw $3,%%mm4\n\t" \
+  "psraw $3,%%mm5\n\t" \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "packsswb %%mm5,%%mm4\n\t" \
+  "pxor %%mm6,%%mm6\n\t" \
+  "pxor %%mm7,%%mm4\n\t" \
+  "packuswb %%mm3,%%mm1\n\t" \
+  /*Now compute lflim of -mm4, cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  "pcmpgtb %%mm4,%%mm6\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "pxor %%mm6,%%mm4\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "psubb %%mm6,%%mm4\n\t" \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  "paddusb %%mm4,%%mm7\n\t" \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  "paddusb %%mm7,%%mm4\n\t" \
+  "psubusb %%mm7,%%mm4\n\t" \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  "movq %%mm4,%%mm5\n\t" \
+  "pand %%mm6,%%mm4\n\t" \
+  "pandn %%mm5,%%mm6\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm4,%%mm1\n\t" \
+  "psubusb %%mm4,%%mm2\n\t" \
+  "psubusb %%mm6,%%mm1\n\t" \
+  "paddusb %%mm6,%%mm2\n\t" \
+
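+/*Editorial sketch, not part of the original commit: the per-pixel filter
+   response applied above, in plain C (hypothetical helper).
+  R=(a-d+3*(c-b)+4)>>3, and lflim() folds R back toward zero once it exceeds
+   the limit L; then b+=lflim(R,L) and c-=lflim(R,L).*/
+static int oc_lflim_c(int _r,int _l){
+  int ar;
+  int m;
+  ar=_r<0?-_r:_r;
+  /*m=max(2*L-|R|,0)*/
+  m=2*_l-ar;
+  if(m<0)m=0;
+  /*m=min(|R|,m)*/
+  if(ar<m)m=ar;
+  /*Reapply the sign of R.*/
+  return _r<0?-m:m;
+}
+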
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
+  All other MMX registers are clobbered.*/
+#define OC_LOOP_FILTER8_MMXEXT \
+  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
+  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
+     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
+  /*This first part is based on the transformation \
+      f = -(3*(c-b)+a-d+4>>3) \
+        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
+        = -(3*(c+~b)+(a+~d)-1016>>3) \
+        = 127-(3*(c+~b)+(a+~d)>>3) \
+        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
+    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
+     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
+    Using this, the last expression above can be computed in 8 bits of working \
+     precision via: \
+      u = ~pavgb(~b,c); \
+      v = pavgb(b,~c); \
+      This mask is 0 or 0xFF, and controls whether t is biased up or down: \
+      m = u-v; \
+      t = m^pavgb(m^~a,m^d); \
+      f = 128+pavgb(pavgb(t,u),v); \
+    This required some careful analysis to ensure that carries are propagated \
+     correctly in all cases, but has been checked exhaustively.*/ \
+  /*input (a, b, c, d, ., ., ., .)*/ \
+  /*ff=0xFF; \
+    u=b; \
+    v=c; \
+    ll=255-2*L;*/ \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "movq %%mm1,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "movq (%[ll]),%%mm6\n\t" \
+  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u^=ff; \
+    v^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm5\n\t" \
+  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u=pavgb(u,c); \
+    v=pavgb(v,b);*/ \
+  "pavgb %%mm2,%%mm4\n\t" \
+  "pavgb %%mm1,%%mm5\n\t" \
+  /*u^=ff; \
+    a^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm0\n\t" \
+  /*m=u-v;*/ \
+  "psubb %%mm5,%%mm4\n\t" \
+  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
+  /*a^=m; \
+    d^=m;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "pxor %%mm4,%%mm3\n\t" \
+  /*t=pavgb(a,d);*/ \
+  "pavgb %%mm3,%%mm0\n\t" \
+  "psllw $7,%%mm7\n\t" \
+  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
+  /*t^=m; \
+    u=m+v;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "paddb %%mm5,%%mm4\n\t" \
+  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
+  /*f=pavgb(f,u); \
+    of=128;*/ \
+  "pavgb %%mm4,%%mm0\n\t" \
+  "packsswb %%mm7,%%mm7\n\t" \
+  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
+  /*f=pavgb(f,v);*/ \
+  "pavgb %%mm5,%%mm0\n\t" \
+  "movq %%mm7,%%mm3\n\t" \
+  "movq %%mm6,%%mm4\n\t" \
+  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
+  /*Now compute lflim of R_i=-(128+mm0), cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but staying in 8 bits gives much better parallelism).*/ \
+  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
+    This is the same number of instructions as computing a mask and splitting \
+     after the lflim computation, but has shorter dependency chains.*/ \
+  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
+    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
+  "psubusb %%mm0,%%mm3\n\t" \
+  "psubusb %%mm7,%%mm0\n\t" \
+  /*mm6=255-max(2*L-abs(R_i<0),0) \
+    mm4=255-max(2*L-abs(R_i>0),0)*/ \
+  "paddusb %%mm3,%%mm4\n\t" \
+  "paddusb %%mm0,%%mm6\n\t" \
+  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
+    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
+  "paddusb %%mm4,%%mm3\n\t" \
+  "paddusb %%mm6,%%mm0\n\t" \
+  "psubusb %%mm4,%%mm3\n\t" \
+  "psubusb %%mm6,%%mm0\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm3,%%mm1\n\t" \
+  "psubusb %%mm3,%%mm2\n\t" \
+  "psubusb %%mm0,%%mm1\n\t" \
+  "paddusb %%mm0,%%mm2\n\t" \
+
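+/*Editorial sketch, not part of the original commit: the byte-average
+   identities the derivation above relies on, in plain C (hypothetical
+   helper).
+  pavgb biases the average up; complementing both inputs and the result
+   biases it down: (unsigned char)~oc_pavgb_c(~a&0xFF,~b&0xFF)==(a+b)>>1.*/
+static unsigned char oc_pavgb_c(unsigned char _a,unsigned char _b){
+  /*pavgb(a,b)=(a+b+1)>>1, computed without overflow in the int promotion.*/
+  return (unsigned char)(_a+_b+1>>1);
+}
+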
+#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
+  do{ \
+    ptrdiff_t ystride3__; \
+    __asm__ __volatile__( \
+      /*mm0={a0,...,a7}*/ \
+      "movq (%[pix]),%%mm0\n\t" \
+      /*ystride3=_ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*mm3={d0,...,d7}*/ \
+      "movq (%[pix],%[ystride3]),%%mm3\n\t" \
+      /*mm1={b0,...,b7}*/ \
+      "movq (%[pix],%[ystride]),%%mm1\n\t" \
+      /*mm2={c0,...,c7}*/ \
+      "movq (%[pix],%[ystride],2),%%mm2\n\t" \
+      _filter \
+      /*Write it back out.*/ \
+      "movq %%mm1,(%[pix],%[ystride])\n\t" \
+      "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
+      :[ystride3]"=&r"(ystride3__) \
+      :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
+       [ll]"r"(_ll) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
+  do{ \
+    unsigned char *pix__; \
+    ptrdiff_t      ystride3__; \
+    ptrdiff_t      d__; \
+    pix__=(_pix)-2; \
+    __asm__ __volatile__( \
+      /*x x x x d0 c0 b0 a0*/ \
+      "movd (%[pix]),%%mm0\n\t" \
+      /*x x x x d1 c1 b1 a1*/ \
+      "movd (%[pix],%[ystride]),%%mm1\n\t" \
+      /*ystride3=_ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*x x x x d2 c2 b2 a2*/ \
+      "movd (%[pix],%[ystride],2),%%mm2\n\t" \
+      /*x x x x d3 c3 b3 a3*/ \
+      "lea (%[pix],%[ystride],4),%[d]\n\t" \
+      "movd (%[pix],%[ystride3]),%%mm3\n\t" \
+      /*x x x x d4 c4 b4 a4*/ \
+      "movd (%[d]),%%mm4\n\t" \
+      /*x x x x d5 c5 b5 a5*/ \
+      "movd (%[d],%[ystride]),%%mm5\n\t" \
+      /*x x x x d6 c6 b6 a6*/ \
+      "movd (%[d],%[ystride],2),%%mm6\n\t" \
+      /*x x x x d7 c7 b7 a7*/ \
+      "movd (%[d],%[ystride3]),%%mm7\n\t" \
+      /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+      "punpcklbw %%mm1,%%mm0\n\t" \
+      /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
+      "punpcklbw %%mm3,%%mm2\n\t" \
+      /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+      "movq %%mm0,%%mm3\n\t" \
+      /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+      "punpcklwd %%mm2,%%mm0\n\t" \
+      /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+      "punpckhwd %%mm2,%%mm3\n\t" \
+      /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+      "movq %%mm0,%%mm1\n\t" \
+      /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+      "punpcklbw %%mm5,%%mm4\n\t" \
+      /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
+      "punpcklbw %%mm7,%%mm6\n\t" \
+      /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+      "movq %%mm4,%%mm5\n\t" \
+      /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
+      "punpcklwd %%mm6,%%mm4\n\t" \
+      /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
+      "punpckhwd %%mm6,%%mm5\n\t" \
+      /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+      "movq %%mm3,%%mm2\n\t" \
+      /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
+      "punpckldq %%mm4,%%mm0\n\t" \
+      /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
+      "punpckhdq %%mm4,%%mm1\n\t" \
+      /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
+      "punpckldq %%mm5,%%mm2\n\t" \
+      /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
+      "punpckhdq %%mm5,%%mm3\n\t" \
+      _filter \
+      /*mm0={b0+R_0'',...,b7+R_7''}*/ \
+      "movq %%mm1,%%mm0\n\t" \
+      /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
+      "punpcklbw %%mm2,%%mm1\n\t" \
+      /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
+      "punpckhbw %%mm2,%%mm0\n\t" \
+      /*[d]=c1 b1 c0 b0*/ \
+      "movd %%mm1,%[d]\n\t" \
+      "movw %w[d],1(%[pix])\n\t" \
+      "psrlq $32,%%mm1\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride])\n\t" \
+      /*[d]=c3 b3 c2 b2*/ \
+      "movd %%mm1,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+      "lea (%[pix],%[ystride],4),%[pix]\n\t" \
+      /*[d]=c5 b5 c4 b4*/ \
+      "movd %%mm0,%[d]\n\t" \
+      "movw %w[d],1(%[pix])\n\t" \
+      "psrlq $32,%%mm0\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride])\n\t" \
+      /*[d]=c7 b7 c6 b6*/ \
+      "movd %%mm0,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+      :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+# endif
+#endif

+ 228 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/mmxstate.c

@@ -0,0 +1,228 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+/*MMX acceleration of complete fragment reconstruction algorithm.
+  Originally written by Rudolf Marek.*/
+#include <string.h>
+#include "x86int.h"
+#include "mmxloop.h"
+
+#if defined(OC_X86_ASM)
+
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            mb_mode;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm__ block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    int          i;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill _dct_coeffs with p.*/
+    __asm__ __volatile__(
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpckldq %%mm0,%%mm0\n\t"
+      :
+      :[p]"r"((unsigned)p)
+    );
+    for(i=0;i<4;i++){
+      __asm__ __volatile__(
+        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+      );
+    }
+  }
+  else{
+    /*Dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  mb_mode=_state->frags[_fragi].mb_mode;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+/*We copy these entire functions to inline the actual MMX routines so that we
+   use only a single indirect call.*/
+
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,_flimit,8);
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  OC_ALIGN8(unsigned char   ll[8]);
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+#endif
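
The _last_zzi<2 branch of oc_state_frag_recon_mmx above is a vectorized form of a very simple scalar computation: when only the DC coefficient survives, the iDCT of the block is constant, so a single rounded dequantization can be replicated into all 64 residuals. A sketch of that scalar equivalent, using a hypothetical helper name for illustration only (not part of the diff):

#include <ogg/os_types.h>

/*Hypothetical scalar equivalent of the DC-only fill above.*/
static void oc_dc_only_fill(ogg_int16_t _res[64],ogg_int16_t _dc,
 ogg_uint16_t _dc_quant){
  ogg_int16_t p;
  int         i;
  /*The single +15>>5 rounding stands in for the rounding the full iDCT path
     would otherwise perform.*/
  p=(ogg_int16_t)(_dc*(ogg_int32_t)_dc_quant+15>>5);
  for(i=0;i<64;i++)_res[i]=p;
}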

+ 498 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2encfrag.c

@@ -0,0 +1,498 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+#include "sse2trans.h"
+
+#if defined(OC_X86_ASM)
+
+/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
+   16-bit differences.
+  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
+  xmm4 and xmm5 are clobbered.*/
+#define OC_LOAD_SUB_4x8(_m0) \
+ "#OC_LOAD_SUB_4x8\n\t" \
+ /*Load the first three rows.*/ \
+ "movq (%[src]),"_m0"\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
+ /*Unpack and subtract.*/ \
+ "punpcklbw %%xmm4,"_m0"\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm4,"_m0"\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+ /*Load the last row.*/ \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
+ /*Unpack, subtract, and advance the pointers.*/ \
+ "punpcklbw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ystride],4),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm3\n\t" \
+
+/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
+  On output, _m0 contains the sum of two of the rows, and the other two are
+   added to xmm7.*/
+#define OC_SSD_4x8(_m0) \
+ "pmaddwd "_m0","_m0"\n\t" \
+ "pmaddwd %%xmm1,%%xmm1\n\t" \
+ "pmaddwd %%xmm2,%%xmm2\n\t" \
+ "pmaddwd %%xmm3,%%xmm3\n\t" \
+ "paddd %%xmm1,"_m0"\n\t" \
+ "paddd %%xmm3,%%xmm2\n\t" \
+ "paddd %%xmm2,%%xmm7\n\t" \
+
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_4x8("%%xmm7")
+    OC_SSD_4x8("%%xmm7")
+    OC_LOAD_SUB_4x8("%%xmm0")
+    OC_SSD_4x8("%%xmm0")
+    "paddd %%xmm0,%%xmm7\n\t"
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    :[ret]"=a"(ret)
+    :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
+     [ystride3]"r"((ptrdiff_t)_ystride*3)
+  );
+  return ret;
+}
+
+static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
+  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
+};
+
+/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
+   horizontal sums as well as their 16-bit differences subject to a mask.
+  %%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/
+#define OC_LOAD_SUB_MASK_2x8 \
+ "#OC_LOAD_SUB_MASK_2x8\n\t" \
+ /*Start the loads and expand the next 8 bits of the mask.*/ \
+ "shl $8,%[m]\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "movq (%[ref]),%%xmm2\n\t" \
+ "movd %[m],%%xmm4\n\t" \
+ "shr $8,%[m]\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ /*Perform the masking.*/ \
+ "pand %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm2\n\t" \
+ /*Finish the loads while unpacking the first set of rows, and expand the next
+    8 bits of the mask.*/ \
+ "movd %[m],%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm0\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm2\n\t" \
+ /*Mask and unpack the second set of rows.*/ \
+ "pand %%xmm4,%%xmm1\n\t" \
+ "pand %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm2,%%xmm0\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  ptrdiff_t ystride;
+  unsigned  ret;
+  int       i;
+  ystride=_ystride;
+  __asm__ __volatile__(
+    "pxor %%xmm7,%%xmm7\n\t"
+    "movq %[c],%%xmm6\n\t"
+    :
+    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
+  );
+  for(i=0;i<4;i++){
+    unsigned m;
+    m=_mask&0xFFFF;
+    _mask>>=16;
+    if(m){
+      __asm__ __volatile__(
+        OC_LOAD_SUB_MASK_2x8
+        "pmaddwd %%xmm0,%%xmm0\n\t"
+        "pmaddwd %%xmm1,%%xmm1\n\t"
+        "paddd %%xmm0,%%xmm7\n\t"
+        "paddd %%xmm1,%%xmm7\n\t"
+        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
+      );
+    }
+    _src+=2*ystride;
+    _ref+=2*ystride;
+  }
+  __asm__ __volatile__(
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    :[ret]"=a"(ret)
+  );
+  return ret;
+}
+
+
+/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
+   16-bit differences in %%xmm0...%%xmm7.*/
+#define OC_LOAD_SUB_8x8 \
+ "#OC_LOAD_SUB_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "movq (%[src]),%%xmm2\n\t" \
+ "movq (%[ref]),%%xmm7\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
+ "punpcklbw %%xmm4,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm0\n\t" \
+ "movq (%[src]),%%xmm4\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm5,%%xmm1\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psubw %%xmm5,%%xmm1\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm2\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm6,%%xmm3\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm3\n\t" \
+ "movq (%[src]),%%xmm6\n\t" \
+ "punpcklbw %%xmm0,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm7,%%xmm5\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm5\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm6\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "psubw %%xmm0,%%xmm6\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
+ "punpcklbw %%xmm0,%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+
+/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
+#define OC_LOAD_8x8 \
+ "#OC_LOAD_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "pxor %%xmm7,%%xmm7\n\t" \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "punpcklbw %%xmm7,%%xmm0\n\t" \
+ "movq (%[src4]),%%xmm4\n\t" \
+ "punpcklbw %%xmm7,%%xmm1\n\t" \
+ "movq (%[src4],%[ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm3\n\t" \
+ "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psrlw $8,%%xmm4\n\t" \
+ "psrlw $8,%%xmm5\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psrlw $8,%%xmm6\n\t" \
+ "psrlw $8,%%xmm7\n\t" \
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x8 \
+ "#OC_HADAMARD_AB_8x8\n\t" \
+ /*Stage A:*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm1,%%xmm5\n\t" \
+ "psubw %%xmm2,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm3\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "paddw %%xmm4,%%xmm4\n\t" \
+ "psubw %%xmm3,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*Stage B:*/ \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm5\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm6\n\t" \
+ "psubw %%xmm5,%%xmm7\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x8 \
+ "#OC_HADAMARD_C_8x8\n\t" \
+ /*Stage C:*/ \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm1\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm1\n\t" \
+ "psubw %%xmm2,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm5\n\t" \
+ "psubw %%xmm6,%%xmm7\n\t" \
+
+/*Performs an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_8x8 \
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, %%xmm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ /*We use the fact that \
+     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+    to merge the final butterfly with the abs and the first stage of \
+    accumulation. \
+   Thus we can avoid using pabsw, which is not available until SSSE3. \
+   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
+    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+    registers). \
+   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+   This implementation is only 26 (+4 for spilling registers).*/ \
+ "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm7={0x7FFF}x4 \
+   xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
+ "pcmpeqb %%xmm7,%%xmm7\n\t" \
+ "movdqa %%xmm4,%%xmm6\n\t" \
+ "psrlw $1,%%xmm7\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm4\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm4\n\t" \
+ /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
+   xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
+ "movdqa %%xmm2,%%xmm6\n\t" \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ "pmaxsw %%xmm3,%%xmm2\n\t" \
+ "pmaxsw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+ "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddsw %%xmm7,%%xmm1\n\t" \
+ "psubw %%xmm6,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm0\n\t" \
+ /*xmm7={1}x4 (needed for the horizontal add that follows) \
+   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm3\n\t" \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm3,%%xmm0\n\t" \
+ "psrlw $14,%%xmm7\n\t" \
+ "psubw %%xmm6,%%xmm0\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into xmm0.*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into xmm0.
+  Note that xmm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_8x8
+
+static unsigned oc_int_frag_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  unsigned ret;
+  unsigned dc;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_8x8
+    OC_HADAMARD_8x8
+    OC_TRANSPOSE_8x8
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x8
+    OC_HADAMARD_C_ABS_ACCUM_A_8x8
+    "movd %%xmm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x8
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
+       latency of pmaddwd by computing abs(dc) here.*/
+    "pmaddwd %%xmm7,%%xmm0\n\t"
+    "movsx %w[dc],%[ret]\n\t"
+    "cdq\n\t"
+    "movdqa %%xmm0,%%xmm1\n\t"
+    "punpckhqdq %%xmm0,%%xmm0\n\t"
+    "add %[dc],%[ret]\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "pshufd $1,%%xmm0,%%xmm1\n\t"
+    "xor %[ret],%[dc]\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "movd %%xmm0,%[ret]\n\t"
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    "lea -64(%[ret],%[ret]),%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[dc] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+       constraints, otherwise if gcc can prove they're equal it will allocate
+       them to the same register (which is bad); _src and _ref face a similar
+       problem.
+      All four are destructively modified, but if we list them as output
+       constraints, gcc can't alias them with other outputs.*/
+    :[ret]"=a"(ret),[dc]"=d"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+    /*We have to use neg, so we actually clobber the condition codes for once
+       (not to mention sub, and add).*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
+}
+
+unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  OC_ALIGN8(unsigned char ref[64]);
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
+}
+
+unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _ystride){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  unsigned ret;
+  unsigned dc;
+  __asm__ __volatile__(
+    OC_LOAD_8x8
+    OC_HADAMARD_8x8
+    OC_TRANSPOSE_8x8
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x8
+    OC_HADAMARD_C_ABS_ACCUM_A_8x8
+    "movd %%xmm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x8
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.*/
+    "pmaddwd %%xmm7,%%xmm0\n\t"
+    /*We assume that the DC coefficient is always positive (which is true,
+       because the input to the INTRA transform was not a difference).*/
+    "movzx %w[dc],%[dc]\n\t"
+    "movdqa %%xmm0,%%xmm1\n\t"
+    "punpckhqdq %%xmm0,%%xmm0\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "pshufd $1,%%xmm0,%%xmm1\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "movd %%xmm0,%[ret]\n\t"
+    "lea -64(%[ret],%[ret]),%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[dc] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    :[ret]"=a"(ret),[dc]"=r"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+    /*We have to use sub, so we actually clobber the condition codes for once.*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+#endif
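
OC_HADAMARD_C_ABS_ACCUM_A_8x8 above leans on the identity (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)) to fold the final butterfly into the absolute-value accumulation and avoid emulating pabsw. A brute-force scalar check of that identity (illustrative only, not part of the diff):

#include <assert.h>
#include <stdlib.h>

int main(void){
  int a;
  int b;
  for(a=-256;a<=256;a++)for(b=-256;b<=256;b++){
    int m;
    m=abs(a)>abs(b)?abs(a):abs(b);
    /*abs(a+b)+abs(a-b) is always even, so the halving is exact.*/
    assert((abs(a+b)+abs(a-b))/2==m);
  }
  return 0;
}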

+ 449 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2fdct.c

@@ -0,0 +1,449 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+/*SSE2 fDCT implementation for x86_64.*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include <stddef.h>
+#include "x86enc.h"
+#include "sse2trans.h"
+
+#if defined(OC_X86_64_ASM)
+
+# define OC_FDCT_8x8 \
+ /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
+ "#OC_FDCT_8x8\n\t" \
+ /*Stage 1:*/ \
+ "movdqa %%xmm0,%%xmm11\n\t" \
+ "movdqa %%xmm1,%%xmm10\n\t" \
+ "movdqa %%xmm2,%%xmm9\n\t" \
+ "movdqa %%xmm3,%%xmm8\n\t" \
+ /*xmm11=t7'=t0-t7*/ \
+ "psubw %%xmm7,%%xmm11\n\t" \
+ /*xmm10=t6'=t1-t6*/ \
+ "psubw %%xmm6,%%xmm10\n\t" \
+ /*xmm9=t5'=t2-t5*/ \
+ "psubw %%xmm5,%%xmm9\n\t" \
+ /*xmm8=t4'=t3-t4*/ \
+ "psubw %%xmm4,%%xmm8\n\t" \
+ /*xmm0=t0'=t0+t7*/ \
+ "paddw %%xmm7,%%xmm0\n\t" \
+ /*xmm1=t1'=t1+t6*/ \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ /*xmm5=t2'=t2+t5*/ \
+ "paddw %%xmm2,%%xmm5\n\t" \
+ /*xmm4=t3'=t3+t4*/ \
+ "paddw %%xmm3,%%xmm4\n\t" \
+ /*xmm2,3,6,7 are now free.*/ \
+ /*Stage 2:*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ "mov $0x5A806A0A,%[a]\n\t" \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "movdqa %%xmm10,%%xmm6\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ /*xmm2=t2''=t1'-t2'*/ \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ /*xmm3=t3''=t0'-t3'*/ \
+ "psubw %%xmm4,%%xmm3\n\t" \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ /*xmm10=t5''=t6'-t5'*/ \
+ "psubw %%xmm9,%%xmm10\n\t" \
+ "paddw %%xmm12,%%xmm12\n\t" \
+ /*xmm4=t0''=t0'+t3'*/ \
+ "paddw %%xmm0,%%xmm4\n\t" \
+ /*xmm1=t1''=t1'+t2'*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ /*xmm6=t6''=t6'+t5'*/ \
+ "paddw %%xmm9,%%xmm6\n\t" \
+ /*xmm0,xmm5,xmm9 are now free.*/ \
+ /*Stage 3:*/ \
+ /*xmm10:xmm5=t5''*27146+0xB500 \
+   xmm0=t5''*/ \
+ "movdqa %%xmm10,%%xmm5\n\t" \
+ "movdqa %%xmm10,%%xmm0\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm5\n\t" \
+ "pmaddwd %%xmm13,%%xmm5\n\t" \
+ /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
+ "psrad $16,%%xmm10\n\t" \
+ "psrad $16,%%xmm5\n\t" \
+ "packssdw %%xmm10,%%xmm5\n\t" \
+ "paddw %%xmm0,%%xmm5\n\t" \
+ /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+ "pcmpeqw %%xmm15,%%xmm0\n\t" \
+ "psubw %%xmm14,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm0\n\t" \
+ "movdqa %%xmm8,%%xmm5\n\t" \
+ "psraw $1,%%xmm0\n\t" \
+ /*xmm5=t5'''=t4'-s*/ \
+ "psubw %%xmm0,%%xmm5\n\t" \
+ /*xmm8=t4''=t4'+s*/ \
+ "paddw %%xmm0,%%xmm8\n\t" \
+ /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
+ /*xmm7:xmm9=t6''*27146+0xB500*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm6,%%xmm9\n\t" \
+ "punpckhwd %%xmm12,%%xmm7\n\t" \
+ "pmaddwd %%xmm13,%%xmm7\n\t" \
+ "punpcklwd %%xmm12,%%xmm9\n\t" \
+ "pmaddwd %%xmm13,%%xmm9\n\t" \
+ /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
+ "psrad $16,%%xmm7\n\t" \
+ "psrad $16,%%xmm9\n\t" \
+ "packssdw %%xmm7,%%xmm9\n\t" \
+ "paddw %%xmm6,%%xmm9\n\t" \
+ /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+ "pcmpeqw %%xmm15,%%xmm6\n\t" \
+ "psubw %%xmm14,%%xmm6\n\t" \
+ "paddw %%xmm6,%%xmm9\n\t" \
+ "movdqa %%xmm11,%%xmm7\n\t" \
+ "psraw $1,%%xmm9\n\t" \
+ /*xmm7=t6'''=t7'-s*/ \
+ "psubw %%xmm9,%%xmm7\n\t" \
+ /*xmm9=t7''=t7'+s*/ \
+ "paddw %%xmm11,%%xmm9\n\t" \
+ /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
+ /*Stage 4:*/ \
+ /*xmm10:xmm0=t1''*27146+0xB500*/ \
+ "movdqa %%xmm1,%%xmm0\n\t" \
+ "movdqa %%xmm1,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm0\n\t" \
+ "pmaddwd %%xmm13,%%xmm0\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
+ "psrad $16,%%xmm0\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "mov $0x20006A0A,%[a]\n\t" \
+ "packssdw %%xmm10,%%xmm0\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ /*xmm10:xmm4=t0''*27146+0x4000*/ \
+ "movdqa %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm4,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm4\n\t" \
+ "pmaddwd %%xmm13,%%xmm4\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
+ "psrad $16,%%xmm4\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "mov $0x6CB7,%[a]\n\t" \
+ "packssdw %%xmm10,%%xmm4\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "paddw %%xmm1,%%xmm4\n\t" \
+ /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "mov $0x7FFF6C84,%[a]\n\t" \
+ "paddw %%xmm1,%%xmm4\n\t" \
+ /*xmm0=_y[0]=u=r+s>>1 \
+   The naive implementation could cause overflow, so we use \
+    u=(r&s)+((r^s)>>1).*/ \
+ "movdqa %%xmm0,%%xmm6\n\t" \
+ "pxor %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm6\n\t" \
+ "psraw $1,%%xmm0\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm6,%%xmm0\n\t" \
+ /*xmm4=_y[4]=v=r-u*/ \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
+ /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
+ "movdqa %%xmm3,%%xmm10\n\t" \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "punpcklwd %%xmm3,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x61F861F8,%[a]\n\t" \
+ "punpckhwd %%xmm3,%%xmm6\n\t" \
+ "pmaddwd %%xmm13,%%xmm6\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm6\n\t" \
+ /*xmm1:xmm2=25080*t2'' \
+   xmm12=t2''*/ \
+ "movdqa %%xmm2,%%xmm11\n\t" \
+ "movdqa %%xmm2,%%xmm12\n\t" \
+ "pmullw %%xmm13,%%xmm2\n\t" \
+ "pmulhw %%xmm13,%%xmm11\n\t" \
+ "movdqa %%xmm2,%%xmm1\n\t" \
+ "punpcklwd %%xmm11,%%xmm2\n\t" \
+ "punpckhwd %%xmm11,%%xmm1\n\t" \
+ /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+ "paddd %%xmm2,%%xmm10\n\t" \
+ "paddd %%xmm1,%%xmm6\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm3\n\t" \
+ "psrad $16,%%xmm6\n\t" \
+ "psubw %%xmm14,%%xmm3\n\t" \
+ "packssdw %%xmm6,%%xmm10\n\t" \
+ "paddw %%xmm3,%%xmm10\n\t" \
+ /*xmm2=_y[2]=u \
+   xmm10=s=(25080*u>>16)-t2''*/ \
+ "movdqa %%xmm10,%%xmm2\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "psubw %%xmm12,%%xmm10\n\t" \
+ /*xmm1:xmm6=s*21600+0x2800*/ \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "mov $0x28005460,%[a]\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "movdqa %%xmm10,%%xmm6\n\t" \
+ "movdqa %%xmm10,%%xmm1\n\t" \
+ "punpcklwd %%xmm12,%%xmm6\n\t" \
+ "pmaddwd %%xmm13,%%xmm6\n\t" \
+ "mov $0x0E3D,%[a]\n\t" \
+ "punpckhwd %%xmm12,%%xmm1\n\t" \
+ "pmaddwd %%xmm13,%%xmm1\n\t" \
+ /*xmm6=(s*21600+0x2800>>18)+s*/ \
+ "psrad $18,%%xmm6\n\t" \
+ "psrad $18,%%xmm1\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "packssdw %%xmm1,%%xmm6\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm10,%%xmm6\n\t" \
+ /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "pcmpeqw %%xmm15,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddw %%xmm10,%%xmm6\n\t " \
+ /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
+ /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
+ "movdqa %%xmm5,%%xmm10\n\t" \
+ "movdqa %%xmm5,%%xmm11\n\t" \
+ "punpcklwd %%xmm5,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x8E3A8E3A,%[a]\n\t" \
+ "punpckhwd %%xmm5,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ /*xmm7:xmm12=36410*t6''' \
+   xmm1=t6'''*/ \
+ "movdqa %%xmm7,%%xmm3\n\t" \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ "pmulhw %%xmm13,%%xmm3\n\t" \
+ "pmullw %%xmm13,%%xmm7\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ "movdqa %%xmm7,%%xmm12\n\t" \
+ "punpckhwd %%xmm3,%%xmm7\n\t" \
+ "punpcklwd %%xmm3,%%xmm12\n\t" \
+ /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "paddd %%xmm7,%%xmm11\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm5\n\t" \
+ "psrad $16,%%xmm11\n\t" \
+ "psubw %%xmm14,%%xmm5\n\t" \
+ "packssdw %%xmm11,%%xmm10\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm5,%%xmm10\n\t" \
+ /*xmm5=_y[5]=u \
+   xmm1=s=t6'''-(36410*u>>16)*/ \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "movdqa %%xmm10,%%xmm5\n\t" \
+ "mov $0x340067C8,%[a]\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm5,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm10,%%xmm1\n\t" \
+ /*xmm11:xmm3=s*26568+0x3400*/ \
+ "movdqa %%xmm1,%%xmm3\n\t" \
+ "movdqa %%xmm1,%%xmm11\n\t" \
+ "punpcklwd %%xmm12,%%xmm3\n\t" \
+ "pmaddwd %%xmm13,%%xmm3\n\t" \
+ "mov $0x7B1B,%[a]\n\t" \
+ "punpckhwd %%xmm12,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ /*xmm3=(s*26568+0x3400>>17)+s*/ \
+ "psrad $17,%%xmm3\n\t" \
+ "psrad $17,%%xmm11\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "packssdw %%xmm11,%%xmm3\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+ "mov $0x7FFF7B16,%[a]\n\t" \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t " \
+ /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
+ /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
+ "movdqa %%xmm9,%%xmm10\n\t" \
+ "movdqa %%xmm9,%%xmm11\n\t" \
+ "punpcklwd %%xmm9,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x31F131F1,%[a]\n\t" \
+ "punpckhwd %%xmm9,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ /*xmm12:xmm7=12785*t4''*/ \
+ "movdqa %%xmm8,%%xmm7\n\t" \
+ "movdqa %%xmm8,%%xmm1\n\t" \
+ "pmullw %%xmm13,%%xmm7\n\t" \
+ "pmulhw %%xmm13,%%xmm1\n\t" \
+ "movdqa %%xmm7,%%xmm12\n\t" \
+ "punpcklwd %%xmm1,%%xmm7\n\t" \
+ "punpckhwd %%xmm1,%%xmm12\n\t" \
+ /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+ "paddd %%xmm7,%%xmm10\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm9\n\t" \
+ "psrad $16,%%xmm11\n\t" \
+ "psubw %%xmm14,%%xmm9\n\t" \
+ "packssdw %%xmm11,%%xmm10\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm9,%%xmm10\n\t" \
+ /*xmm1=_y[1]=u \
+   xmm10=s=(12785*u>>16)-t4''*/ \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "movdqa %%xmm10,%%xmm1\n\t" \
+ "mov $0x3000503B,%[a]\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm8,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ /*xmm8:xmm7=s*20539+0x3000*/ \
+ "movdqa %%xmm10,%%xmm7\n\t" \
+ "movdqa %%xmm10,%%xmm8\n\t" \
+ "punpcklwd %%xmm12,%%xmm7\n\t" \
+ "pmaddwd %%xmm13,%%xmm7\n\t" \
+ "punpckhwd %%xmm12,%%xmm8\n\t" \
+ "pmaddwd %%xmm13,%%xmm8\n\t" \
+ /*xmm7=(s*20539+0x3000>>20)+s*/ \
+ "psrad $20,%%xmm7\n\t" \
+ "psrad $20,%%xmm8\n\t" \
+ "packssdw %%xmm8,%%xmm7\n\t" \
+ "paddw %%xmm10,%%xmm7\n\t" \
+ /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm10\n\t" \
+ "psubw %%xmm14,%%xmm10\n\t" \
+ "paddw %%xmm10,%%xmm7\n\t " \
+
+/*SSE2 implementation of the fDCT for x86-64 only.
+  Because of the 8 extra XMM registers on x86-64, this version can operate
+   without any temporary stack access at all.*/
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a;
+  __asm__ __volatile__(
+    /*Load the input.*/
+    "movdqa 0x00(%[x]),%%xmm0\n\t"
+    "movdqa 0x10(%[x]),%%xmm1\n\t"
+    "movdqa 0x20(%[x]),%%xmm2\n\t"
+    "movdqa 0x30(%[x]),%%xmm3\n\t"
+    "movdqa 0x40(%[x]),%%xmm4\n\t"
+    "movdqa 0x50(%[x]),%%xmm5\n\t"
+    "movdqa 0x60(%[x]),%%xmm6\n\t"
+    "movdqa 0x70(%[x]),%%xmm7\n\t"
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add a few biases to correct for some systematic error that
+       remains in the full fDCT->iDCT round trip.*/
+    /*xmm15={0}x8*/
+    "pxor %%xmm15,%%xmm15\n\t"
+    /*xmm14={-1}x8*/
+    "pcmpeqb %%xmm14,%%xmm14\n\t"
+    "psllw $2,%%xmm0\n\t"
+    /*xmm8=xmm0*/
+    "movdqa %%xmm0,%%xmm8\n\t"
+    "psllw $2,%%xmm1\n\t"
+    /*xmm8={_x[7...0]==0}*/
+    "pcmpeqw %%xmm15,%%xmm8\n\t"
+    "psllw $2,%%xmm2\n\t"
+    /*xmm8={_x[7...0]!=0}*/
+    "psubw %%xmm14,%%xmm8\n\t"
+    "psllw $2,%%xmm3\n\t"
+    /*%[a]=1*/
+    "mov $1,%[a]\n\t"
+    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
+    "pslld $16,%%xmm8\n\t"
+    "psllw $2,%%xmm4\n\t"
+    /*xmm9={0,0,0,0,0,0,0,1}*/
+    "movd %[a],%%xmm9\n\t"
+    /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
+    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
+    "psllw $2,%%xmm5\n\t"
+    /*%[a]={1}x2*/
+    "mov $0x10001,%[a]\n\t"
+    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
+    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
+    "psllw $2,%%xmm6\n\t"
+    /*xmm10={0,0,0,0,0,0,1,1}*/
+    "movd %[a],%%xmm10\n\t"
+    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
+    "paddw %%xmm8,%%xmm0\n\t"
+    "psllw $2,%%xmm7\n\t"
+    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
+    "paddw %%xmm10,%%xmm0\n\t"
+    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
+    "psubw %%xmm9,%%xmm1\n\t"
+    /*Transform columns.*/
+    OC_FDCT_8x8
+    /*Transform rows.*/
+    OC_TRANSPOSE_8x8
+    OC_FDCT_8x8
+    /*TODO: zig-zag ordering?*/
+    OC_TRANSPOSE_8x8
+    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
+    "paddw %%xmm14,%%xmm14\n\t"
+    "psubw %%xmm14,%%xmm0\n\t"
+    "psubw %%xmm14,%%xmm1\n\t"
+    "psraw $2,%%xmm0\n\t"
+    "psubw %%xmm14,%%xmm2\n\t"
+    "psraw $2,%%xmm1\n\t"
+    "psubw %%xmm14,%%xmm3\n\t"
+    "psraw $2,%%xmm2\n\t"
+    "psubw %%xmm14,%%xmm4\n\t"
+    "psraw $2,%%xmm3\n\t"
+    "psubw %%xmm14,%%xmm5\n\t"
+    "psraw $2,%%xmm4\n\t"
+    "psubw %%xmm14,%%xmm6\n\t"
+    "psraw $2,%%xmm5\n\t"
+    "psubw %%xmm14,%%xmm7\n\t"
+    "psraw $2,%%xmm6\n\t"
+    "psraw $2,%%xmm7\n\t"
+    /*Store the result.*/
+    "movdqa %%xmm0,0x00(%[y])\n\t"
+    "movdqa %%xmm1,0x10(%[y])\n\t"
+    "movdqa %%xmm2,0x20(%[y])\n\t"
+    "movdqa %%xmm3,0x30(%[y])\n\t"
+    "movdqa %%xmm4,0x40(%[y])\n\t"
+    "movdqa %%xmm5,0x50(%[y])\n\t"
+    "movdqa %%xmm6,0x60(%[y])\n\t"
+    "movdqa %%xmm7,0x70(%[y])\n\t"
+    :[a]"=&r"(a)
+    :[y]"r"(_y),[x]"r"(_x)
+    :"memory"
+  );
+}
+#endif
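
The u=(r&s)+((r^s)>>1) trick used to form _y[0] above computes the floored average of two values without the wider intermediate a plain r+s>>1 would need, which matters in 16-bit SIMD lanes. A scalar check, assuming the usual arithmetic right shift of signed values (illustrative only, not part of the diff):

#include <assert.h>

int main(void){
  int r;
  int s;
  for(r=-32768;r<32768;r+=251)for(s=-32768;s<32768;s+=257){
    /*r+s==2*(r&s)+(r^s), so the two halves can be formed separately.*/
    assert((r&s)+((r^s)>>1)==(r+s>>1));
  }
  return 0;
}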

+ 460 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2idct.c

@@ -0,0 +1,460 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*SSE2 acceleration of Theora's iDCT.*/
+#include "x86enc.h"
+#include "sse2trans.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*A table of constants used by the MMX and SSE2 iDCT routines.*/
+const short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+        8,      8,      8,      8,      8,      8,      8,      8,
+  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
+  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
+  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
+  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
+  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
+  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
+  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
+};
+
+
+/*Performs the first three stages of the iDCT.
+  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
+   (accessed in that order).
+  The remaining rows must be in _x at their corresponding locations.
+  On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.*/
+#define OC_IDCT_8x8_ABC(_x) \
+  "#OC_IDCT_8x8_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
+  "movdqa %%xmm1,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm1\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm6,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  "paddw %%xmm4,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
+  "movdqa %%xmm4,%%xmm2\n\t" \
+  "movdqa %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm4\n\t" \
+  "pmulhw %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm6\n\t" \
+  "pmulhw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm6,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  "psubw %%xmm4,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
+  "movdqa %%xmm3,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm5,%%xmm3\n\t" \
+  "pmulhw %%xmm5,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm7\n\t" \
+  "psubw %%xmm4,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
+  "paddw %%xmm7,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm7\n\t" \
+  "psubw %%xmm6,%%xmm7\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm3\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "movdqa %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm5\n\t" \
+  "paddw %%xmm7,%%xmm5\n\t" \
+  "movdqa %%xmm0,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+/*Performs the last stage of the iDCT.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.
+  On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D \
+  "#OC_IDCT_8x8_D\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+
+/*Performs the last stage of the iDCT.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.
+  On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D_STORE \
+  "#OC_IDCT_8x8_D_STORE\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm4,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm5\n\t" \
+  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "psraw $4,%%xmm0\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
+  "psraw $4,%%xmm1\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
+  "psraw $4,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
+  "psraw $4,%%xmm3\n\t" \
+  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
+  "psraw $4,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "psraw $4,%%xmm5\n\t" \
+  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
+  "psraw $4,%%xmm6\n\t" \
+  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
+  "psraw $4,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
+
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  /*This routine accepts an 8x8 matrix pre-transposed.*/
+  __asm__ __volatile__(
+    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
+    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+    OC_IDCT_8x8_ABC(x)
+    OC_IDCT_8x8_D
+    OC_TRANSPOSE_8x8
+    /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
+    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+    OC_IDCT_8x8_ABC(y)
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
+  );
+  if(_x!=_y){
+    int i;
+    __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+    /*Clear input data for next block (decoder only).*/
+    for(i=0;i<2;i++){
+      __asm__ __volatile__(
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+        "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+        :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+      );
+    }
+  }
+}
+
+/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
+   need to work with four columns at a time.
+  Doing this in MMX is faster on processors with a 64-bit data path.*/
+#define OC_IDCT_8x8_10_MMX \
+  "#OC_IDCT_8x8_10_MMX\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
+  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
+  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
+  "pmulhw %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "paddw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
+  "pmulhw %%mm1,%%mm3\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+  "movq %%mm3,%%mm6\n\t" \
+  "paddw %%mm1,%%mm7\n\t" \
+  /*0-1 butterfly. \
+    mm4=C4, mm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
+    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
+  "psubw %%mm5,%%mm3\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "pmulhw %%mm0,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "movq %%mm7,%%mm0\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "paddw %%mm2,%%mm0\n\t" \
+  "psubw %%mm2,%%mm7\n\t" \
+  "movq %%mm1,%%mm2\n\t" \
+  "pmulhw %%mm6,%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm2\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
+  "paddw %%mm7,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
+    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
+    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "paddw %%mm4,%%mm7\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm6,%%mm5\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
+    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
+    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
+    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "psubw %%mm1,%%mm6\n\t" \
+  "psubw %%mm2,%%mm5\n\t" \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "paddw %%mm5,%%mm2\n\t" \
+  "paddw %%mm4,%%mm3\n\t" \
+
+#define OC_IDCT_8x8_10_ABC \
+  "#OC_IDCT_8x8_10_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
+  "paddw %%xmm6,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
+  "pmulhw %%xmm3,%%xmm5\n\t" \
+  "pmulhw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
+  "paddw %%xmm3,%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
+  "pmulhw %%xmm1,%%xmm3\n\t" \
+  "pmulhw %%xmm1,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=C4, xmm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
+  "psubw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm1\n\t" \
+  "pmulhw %%xmm0,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm4\n\t" \
+  "movdqa %%xmm7,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  /*This routine accepts an 8x8 matrix pre-transposed.*/
+  __asm__ __volatile__(
+    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
+    OC_IDCT_8x8_10_MMX
+    OC_TRANSPOSE_8x4_MMX2SSE
+    OC_IDCT_8x8_10_ABC
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
+  );
+  if(_x!=_y){
+    /*Clear input data for next block (decoder only).*/
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+    );
+  }
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to orthonormal
+   version of the transform.*/
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Then perform the iDCT.*/
+  if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+  else oc_idct8x8_slow_sse2(_y,_x);
+}
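A minimal caller sketch for this dispatch (illustrative only; assumes the same 16-byte-aligned buffers used throughout this file, plus <string.h> for memset):

/*A pure zero run of length 63 leaves _last_zzi==1 even though 64
   coefficients were decoded, so a DC-only block takes the small path.*/
OC_ALIGN16(ogg_int16_t x[64]);
OC_ALIGN16(ogg_int16_t y[64]);
memset(x,0,sizeof(x));
x[0]=4;                  /*Predicted DC coefficient (hypothetical value).*/
oc_idct8x8_sse2(y,x,1);  /*_last_zzi<=10, so oc_idct8x8_10_sse2 runs.*/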
+
+#endif

+ 243 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/sse2trans.h

@@ -0,0 +1,243 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2trans_H)
+# define _x86_sse2trans_H (1)
+# include "../encint.h"
+# include "x86enc.h"
+
+# if defined(OC_X86_64_ASM)
+/*On x86-64 we can transpose in-place without spilling registers.
+  By clever choices of the order to apply the butterflies and the order of
+   their outputs, we can take the rows in order and output the columns in order
+   without any extra operations and using just one temporary register.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm8 is free.*/ \
+
+# else
+/*Otherwise, we need to spill some values to %[buf] temporarily.
+  Again, the butterflies are carefully arranged to get the columns to come out
+   in order, minimizing register spills and maximizing the delay between a load
+   and when the value loaded is actually used.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm2,%%xmm0\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm0\n\t" \
+ /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
+ /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm6,%%xmm2\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm2\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm7\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm3,%%xmm5\n\t" \
+ /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm3\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm2,%%xmm7\n\t" \
+ /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
+ /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm3,%%xmm1\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm0,%%xmm3\n\t" \
+ /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm0,%%xmm1\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm4,%%xmm0\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm0\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm5,%%xmm6\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm6\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm0,%%xmm1\n\t" \
+ /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+ /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm7,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm7,%%xmm4\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm0,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm0,%%xmm7\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
+
+# endif
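For reference, a scalar statement of what both OC_TRANSPOSE_8x8 variants compute on the 8x8 matrix held in rows a..h (a sketch for clarity, not part of the library):

/*Scalar 8x8 transpose: swap element (i,j) with (j,i).*/
static void oc_transpose8x8_c(ogg_int16_t _t[8][8]){
  int i;
  int j;
  for(i=0;i<8;i++)for(j=0;j<i;j++){
    ogg_int16_t tmp;
    tmp=_t[i][j];
    _t[i][j]=_t[j][i];
    _t[j][i]=tmp;
  }
}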
+
+/*Transpose 4 values in each of 8 MMX registers into 8 values in each of the
+   first four SSE registers.
+  No need to be clever here; we have plenty of room.*/
+#  define OC_TRANSPOSE_8x4_MMX2SSE \
+ "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
+ "movq2dq %%mm0,%%xmm0\n\t" \
+ "movq2dq %%mm1,%%xmm1\n\t" \
+ /*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ "movq2dq %%mm2,%%xmm3\n\t" \
+ "movq2dq %%mm3,%%xmm2\n\t" \
+ /*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm2,%%xmm3\n\t" \
+ "movq2dq %%mm4,%%xmm4\n\t" \
+ "movq2dq %%mm5,%%xmm5\n\t" \
+ /*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ "movq2dq %%mm6,%%xmm7\n\t" \
+ "movq2dq %%mm7,%%xmm6\n\t" \
+ /*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm3,%%xmm0\n\t" \
+ /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm3,%%xmm2\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm7,%%xmm4\n\t" \
+ /*xmm3 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm4,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm2,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm5,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm5,%%xmm3\n\t" \
+
+#endif

+ 182 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86cpu.c

@@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+#else
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "cpuid\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# else
+/*On x86-32, not so much.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   "cpuid\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# endif
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+  return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.*/
+  __asm__ __volatile__(
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "popfl\n\t"
+   :[a]"=r"(eax),[b]"=r"(ebx)
+   :
+   :"cc"
+  );
+  /*No cpuid.*/
+  if(eax==ebx)return 0;
+# endif
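The check above toggles bit 21 of EFLAGS (the ID flag, 0x200000): on processors without cpuid that bit cannot be changed, so reading back an unchanged value means the instruction is absent.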
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=(eax>>4)&0xF;
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  else{
+    /*Implement me.*/
+    flags=0;
+  }
+  return flags;
+}
+#endif
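A small test driver for the detection above (hypothetical, not part of the library):

#include <stdio.h>
#include "x86cpu.h"

int main(void){
  ogg_uint32_t flags;
  flags=oc_cpu_flags_get();
  /*Each OC_CPU_X86_* constant is a single bit in the returned word.*/
  printf("MMX:%d MMXEXT:%d SSE2:%d SSE4.1:%d\n",
   !!(flags&OC_CPU_X86_MMX),!!(flags&OC_CPU_X86_MMXEXT),
   !!(flags&OC_CPU_X86_SSE2),!!(flags&OC_CPU_X86_SSE4_1));
  return 0;
}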

+ 36 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86cpu.h

@@ -0,0 +1,36 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif

+ 61 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86enc.c

@@ -0,0 +1,61 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  if(cpu_flags&OC_CPU_X86_MMX){
+    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
+    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
+    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
+  }
+  if(cpu_flags&OC_CPU_X86_MMXEXT){
+    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
+    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
+    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
+    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+  }
+  if(cpu_flags&OC_CPU_X86_SSE2){
+#  if defined(OC_X86_64_ASM)
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+#  endif
+    _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
+    _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
+    _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
+# endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+# if defined(OC_ENC_USE_VTABLE)
+  }
+# endif
+}
+#endif

+ 114 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86enc.h

@@ -0,0 +1,114 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86enc_H)
+# define _x86_x86enc_H (1)
+# include "x86int.h"
+
+# if defined(OC_X86_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
+  oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
+#   define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
+  oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  oc_enc_enquant_table_init_x86(_enquant,_dequant)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_enc_fdct8x8(_enc,_y,_x) \
+  oc_enc_fdct8x8_x86_64sse2(_y,_x)
+#  else
+#   define OC_ENC_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../encint.h"
+
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+# if defined(OC_X86_64_ASM)
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+# endif
+
+#endif

+ 257 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86enquant.c

@@ -0,0 +1,257 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+
+ ********************************************************************/
+
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+
+
+/*The default enquant table is not quite suitable for SIMD purposes.
+  First, the m and l parameters need to be separated so that an entire row full
+   of m's or l's can be loaded at a time.
+  Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
+   emulate one with a multiply.
+  Therefore we translate the shift count into a scale factor.*/
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  ogg_int16_t *m;
+  ogg_int16_t *l;
+  int          zzi;
+  m=(ogg_int16_t *)_enquant;
+  l=m+64;
+  for(zzi=0;zzi<64;zzi++){
+    oc_iquant q;
+    oc_iquant_init(&q,_dequant[zzi]);
+    m[zzi]=q.m;
+    /*q.l must be at least 2 for this to work; fortunately, once all the scale
+       factors are baked in, the minimum quantizer is much larger than that.*/
+    l[zzi]=1<<16-q.l;
+  }
+}
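The scale-factor trick checks out in scalar code: multiplying by 1<<(16-l) and keeping the signed high 16 bits (what pmulhw computes) equals an arithmetic right shift by l. A self-contained sketch, assuming the common arithmetic behavior of >> on negative values:

#include <assert.h>

/*The per-element operation pmulhw performs.*/
static short mulhw(short _a,short _b){
  return (short)(_a*_b>>16);
}

static void check_shift_emulation(void){
  int l;
  int v;
  /*l==1 would need 1<<15, which does not fit a signed 16-bit word; hence
     the "at least 2" requirement above.*/
  for(l=2;l<=15;l++){
    for(v=-32768;v<32768;v++){
      assert(mulhw((short)v,(short)(1<<16-l))==v>>l);
    }
  }
}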
+
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
+  int pli;
+  int qii;
+  int qti;
+  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+    ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
+     ((ogg_int16_t *)_enquant[pli][0][qti])[0];
+    ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
+     ((ogg_int16_t *)_enquant[pli][0][qti])[64];
+  }
+}
+
+/*Convert DCT coefficients in %[dct] from natural order into zig-zag scan order
+   and store them in %[qdct].
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    C 17 24 32 25  18 11  4  5 D
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D**/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
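A scalar reference for the reorder, transcribing the index matrix from the comment above (the table name here is ours, not the library's):

static const unsigned char OC_ZZ_IDX[64]={
   0, 1, 8,16, 9, 2, 3,10,
  17,24,32,25,18,11, 4, 5,
  12,19,26,33,40,48,41,34,
  27,20,13, 6, 7,14,21,28,
  35,42,49,56,57,50,43,36,
  29,22,15,23,30,37,44,51,
  58,59,52,45,38,31,39,46,
  53,60,61,54,47,55,62,63
};

static void oc_zig_zag_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64]){
  int zzi;
  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=_dct[OC_ZZ_IDX[zzi]];
}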
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant){
+  ptrdiff_t r;
+  __asm__ __volatile__(
+    /*Put the input in zig-zag order.*/
+    OC_ZIG_ZAG_MMXEXT
+    "xor %[r],%[r]\n\t"
+    /*Loop through two rows at a time.*/
+    ".p2align 4\n\t"
+    "0:\n\t"
+    /*Load the first two rows of the data and the quant matrices.*/
+    "movdqa 0x00(%[qdct],%[r]),%%xmm0\n\t"
+    "movdqa 0x10(%[qdct],%[r]),%%xmm1\n\t"
+    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
+    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
+    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
+    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
+    /*Double the input and propagate its sign to the rounding factor.
+      Using SSSE3's psignw would help here, but we need the mask later anyway.*/
+    "movdqa %%xmm0,%%xmm6\n\t"
+    "psraw $15,%%xmm0\n\t"
+    "movdqa %%xmm1,%%xmm7\n\t"
+    "paddw %%xmm6,%%xmm6\n\t"
+    "psraw $15,%%xmm1\n\t"
+    "paddw %%xmm7,%%xmm7\n\t"
+    "paddw %%xmm0,%%xmm2\n\t"
+    "paddw %%xmm1,%%xmm3\n\t"
+    "pxor %%xmm0,%%xmm2\n\t"
+    "pxor %%xmm1,%%xmm3\n\t"
+    /*Add the rounding factor and perform the first multiply.*/
+    "paddw %%xmm2,%%xmm6\n\t"
+    "paddw %%xmm3,%%xmm7\n\t"
+    "pmulhw %%xmm6,%%xmm4\n\t"
+    "pmulhw %%xmm7,%%xmm5\n\t"
+    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
+    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
+    "paddw %%xmm4,%%xmm6\n\t"
+    "paddw %%xmm5,%%xmm7\n\t"
+    /*Emulate an element-wise right-shift via a second multiply.*/
+    "pmulhw %%xmm2,%%xmm6\n\t"
+    "pmulhw %%xmm3,%%xmm7\n\t"
+    "add $32,%[r]\n\t"
+    "cmp $96,%[r]\n\t"
+    /*Correct for the sign.*/
+    "psubw %%xmm0,%%xmm6\n\t"
+    "psubw %%xmm1,%%xmm7\n\t"
+    /*Save the result.*/
+    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
+    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
+    "jle 0b\n\t"
+    /*Now find the location of the last non-zero value.*/
+    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pxor %%xmm0,%%xmm0\n\t"
+    "mov $-1,%k[dq]\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
+    "mov $32,%[r]\n\t"
+    /*We have to use xor here instead of not in order to set the flags.*/
+    "xor %k[dq],%k[q]\n\t"
+    "jnz 1f\n\t"
+    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
+    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
+    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
+    "xor %[r],%[r]\n\t"
+    "not %k[q]\n\t"
+    "or $1,%k[q]\n\t"
+    "1:\n\t"
+    "bsr %k[q],%k[q]\n\t"
+    "add %k[q],%k[r]\n\t"
+    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
+    :[dct]"r"(_dct),[qdct]"r"(_qdct)
+    :"cc","memory"
+  );
+  return (int)r;
+}
+
+#endif

+ 124 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86int.h

@@ -0,0 +1,124 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86int_H)
+# define _x86_x86int_H (1)
+# include "../internal.h"
+
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_mmx(_dst,_src,_ystride)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+   _fragis,_nfragis,_frag_buf_offs)
+#   define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  oc_idct8x8_sse2(_y,_x,_last_zzi)
+#   define oc_state_frag_recon oc_state_frag_recon_mmx
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_mmxext(_bv,_flimit)
+#   define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
+#   define oc_restore_fpu(_state) \
+  oc_restore_fpu_mmx()
+#  else
+#   define OC_STATE_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../state.h"
+# include "x86cpu.h"
+
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*Memory operands do not always include an offset.
+  To avoid warnings, we force an offset with %H (which adds 8).*/
+# if defined(__GNUC_PREREQ)
+#  if __GNUC_PREREQ(4,0)
+#   define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs-8+%H[_name])
+#  endif
+# endif
+/*If your gcc version doesn't support %H, then you get to suffer the warnings.
+  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
+   whole offset, instead of substituting in 0 for the missing operand to +.*/
+# if !defined(OC_MEM_OFFS)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs+%[_name])
+# endif
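A worked expansion (illustration): with %H support, OC_MEM_OFFS(0x10,buf) stringizes to "0x10-8+%H[buf]"; since %H prints the operand with 8 added to its offset, the assembler still sees buf+0x10. The fallback form yields "0x10+%[buf]".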
+
+/*Declare an array operand with an exact size.
+  This tells gcc we're going to clobber this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
+    array_addr__; \
+  }))
+
+/*Declare an array operand with an exact size.
+  This tells gcc we're going to clobber this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    const struct{_type array_value__[(_size)];} *array_addr__= \
+     (const void *)(_ptr); \
+    array_addr__; \
+  }))
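Usage sketch, mirroring oc_idct8x8_10_sse2 earlier in this diff (illustrative only; the asm body is elided):

OC_ALIGN16(ogg_int16_t buf[16]);
__asm__ __volatile__(
  ""  /*...instructions that spill to OC_MEM_OFFS(0x00,buf), etc....*/
  :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
);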
+
+extern const short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
+
+void oc_state_accel_init_x86(oc_theora_state *_state);
+
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_mmx(void);
+
+#endif

+ 95 - 0
love/src/jni/libtheora-1.2.0alpha1/lib/x86/x86state.c

@@ -0,0 +1,95 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_SSE2[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+
+void oc_state_accel_init_x86(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  if(_state->cpu_flags&OC_CPU_X86_MMX){
+    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
+    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
+  }
+  if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmxext;
+  }
+  if(_state->cpu_flags&OC_CPU_X86_SSE2){
+    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
+# endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
+# if defined(OC_STATE_USE_VTABLE)
+  }
+# endif
+}
+#endif

+ 5 - 4
love/src/jni/libvorbis-1.3.5/Android.mk

@@ -1,15 +1,14 @@
 LOCAL_PATH:= $(call my-dir)
 
-# libogg
+# libvorbis
 include $(CLEAR_VARS)
 
 LOCAL_MODULE    := libvorbis
 LOCAL_CFLAGS    := -fexceptions -g -Dlinux -DHAVE_GCC_DESTRUCTOR=1 -DOPT_GENERIC -DREAL_IS_FLOAT
 LOCAL_CPPFLAGS  := ${LOCAL_CFLAGS}
 
-LOCAL_C_INCLUDES  :=  \
-	${LOCAL_PATH}/../libogg-1.3.2/include \
-	${LOCAL_PATH}/include
+LOCAL_C_INCLUDES := ${LOCAL_PATH}/include
+LOCAL_EXPORT_C_INCLUDES := ${LOCAL_C_INCLUDES}
 		
 LOCAL_SRC_FILES := \
 	$(filter-out lib/psytune.c lib/vorbisenc.c, $(subst $(LOCAL_PATH)/,,\
@@ -18,4 +17,6 @@ LOCAL_SRC_FILES := \
 # $(info libvorbis: include dirs $(LOCAL_C_INCLUDES))
 # $(info libvorbis: src files $(LOCAL_SRC_FILES))
 
+LOCAL_STATIC_LIBRARIES := libogg
+
 include $(BUILD_STATIC_LIBRARY)

+ 2 - 11
love/src/jni/love/Android.mk

@@ -23,17 +23,8 @@ LOCAL_C_INCLUDES  :=  \
 	${LOCAL_PATH}/src/libraries/ \
 	${LOCAL_PATH}/src/libraries/enet/libenet/include \
 	${LOCAL_PATH}/src/libraries/physfs \
-	${LOCAL_PATH}/src/libraries/glslang/glslang/Include \
-	${LOCAL_PATH}/../SDL2-2.0.9/include \
-	${LOCAL_PATH}/../freetype2-android/include \
-	${LOCAL_PATH}/../freetype2-android/src \
-	${LOCAL_PATH}/../mpg123-1.17.0/src/libmpg123 \
-	${LOCAL_PATH}/../libmodplug-0.8.8.4/src \
-	${LOCAL_PATH}/../libvorbis-1.3.5/include \
-	${LOCAL_PATH}/../LuaJIT-2.1/src \
-	${LOCAL_PATH}/../libogg-1.3.2/include \
-	${LOCAL_PATH}/../libtheora-1.2.0alpha1/include 
-		
+	${LOCAL_PATH}/src/libraries/glslang/glslang/Include
+
 LOCAL_SRC_FILES := \
 	$(filter-out \
 	  src/modules/graphics/opengl/GLee.* \

+ 2 - 3
love/src/jni/mpg123-1.17.0/Android.mk

@@ -7,9 +7,8 @@ LOCAL_MODULE    := libmpg123
 LOCAL_CFLAGS    := -fexceptions -g -Dlinux -DHAVE_GCC_DESTRUCTOR=1 -DOPT_GENERIC -DREAL_IS_FLOAT
 LOCAL_CPPFLAGS  := ${LOCAL_CFLAGS}
 
-LOCAL_C_INCLUDES  :=  \
-	${LOCAL_PATH}/src
-
+LOCAL_C_INCLUDES := ${LOCAL_PATH}/src
+LOCAL_EXPORT_C_INCLUDES := ${LOCAL_PATH}/src/libmpg123
 LOCAL_SRC_FILES := \
 	$(filter-out \
 	  src/libmpg123/dct64_altivec.c \

+ 1 - 1
love/src/jni/openal-soft/Android.mk

@@ -108,7 +108,7 @@ LOCAL_SRC_FILES := \
 	Alc/backends/opensl.c \
 	Alc/backends/wave.c
 
-LOCAL_LDLIBS := -lOpenSLES
+LOCAL_LDLIBS := -lOpenSLES -llog
 LOCAL_STATIC_LIBRARIES :=
 
 # CPU-specific instructions
