
Merged branch develop into master

Mark Sibly 8 years ago
parent
commit
e37b2442fc
100 changed files with 31,983 additions and 138 deletions
  1. Binary
      bananas/theoratest/konqi.ogv
  2. 105 0
      bananas/theoratest/theoratest.monkey2
  3. Binary
      bananas/zipfiletest/blah.zip
  4. 24 0
      bananas/zipfiletest/zipfiletest.monkey2
  5. Binary
      bin/mx2cc_linux
  6. Binary
      bin/mx2cc_macos
  7. Binary
      bin/mx2cc_windows.exe
  8. 8 0
      modules/mojo/app/window.monkey2
  9. 9 0
      modules/mojo/graphics/image.monkey2
  10. 1 1
      modules/monkey/native/bbarray.h
  11. 0 2
      modules/std/geom/affinemat3.monkey2
  12. 64 27
      modules/std/geom/affinemat4.monkey2
  13. 21 0
      modules/std/geom/box.monkey2
  14. 121 41
      modules/std/geom/mat3.monkey2
  15. 119 14
      modules/std/geom/mat4.monkey2
  16. 32 52
      modules/std/geom/quat.monkey2
  17. 6 0
      modules/std/geom/vec3.monkey2
  18. 20 0
      modules/std/geom/vec4.monkey2
  19. 36 1
      modules/std/misc/zipfile.monkey2
  20. 154 0
      modules/theoraplayer/makefile.monkey2
  21. 8 0
      modules/theoraplayer/module.json
  22. 27 0
      modules/theoraplayer/native/LICENSE
  23. 235 0
      modules/theoraplayer/native/OpenAL_AudioInterface.cpp
  24. 77 0
      modules/theoraplayer/native/OpenAL_AudioInterface.h
  25. 33 0
      modules/theoraplayer/native/monkey2_glue.cpp
  26. 21 0
      modules/theoraplayer/native/monkey2_glue.h
  27. 28 0
      modules/theoraplayer/native/ogg/COPYING
  28. 11 0
      modules/theoraplayer/native/ogg/include/ogg/config_types.h
  29. 25 0
      modules/theoraplayer/native/ogg/include/ogg/config_types.h.in
  30. 210 0
      modules/theoraplayer/native/ogg/include/ogg/ogg.h
  31. 147 0
      modules/theoraplayer/native/ogg/include/ogg/os_types.h
  32. 15 0
      modules/theoraplayer/native/ogg/libtheoraplayer-readme.txt
  33. 857 0
      modules/theoraplayer/native/ogg/src/bitwise.c
  34. 2111 0
      modules/theoraplayer/native/ogg/src/framing.c
  35. 28 0
      modules/theoraplayer/native/theora/COPYING
  36. 606 0
      modules/theoraplayer/native/theora/include/theora/codec.h
  37. 786 0
      modules/theoraplayer/native/theora/include/theora/theora.h
  38. 329 0
      modules/theoraplayer/native/theora/include/theora/theoradec.h
  39. 548 0
      modules/theoraplayer/native/theora/include/theora/theoraenc.h
  40. 53 0
      modules/theoraplayer/native/theora/lib/Version_script
  41. 82 0
      modules/theoraplayer/native/theora/lib/Version_script-dec
  42. 43 0
      modules/theoraplayer/native/theora/lib/Version_script-enc
  43. 2712 0
      modules/theoraplayer/native/theora/lib/analyze.c
  44. 166 0
      modules/theoraplayer/native/theora/lib/apiwrapper.c
  45. 54 0
      modules/theoraplayer/native/theora/lib/apiwrapper.h
  46. 304 0
      modules/theoraplayer/native/theora/lib/arm/arm2gnu.pl
  47. 231 0
      modules/theoraplayer/native/theora/lib/arm/armbits.asm
  48. 32 0
      modules/theoraplayer/native/theora/lib/arm/armbits.h
  49. 230 0
      modules/theoraplayer/native/theora/lib/arm/armbits.s
  50. 116 0
      modules/theoraplayer/native/theora/lib/arm/armcpu.c
  51. 29 0
      modules/theoraplayer/native/theora/lib/arm/armcpu.h
  52. 57 0
      modules/theoraplayer/native/theora/lib/arm/armenc.c
  53. 51 0
      modules/theoraplayer/native/theora/lib/arm/armenc.h
  54. 220 0
      modules/theoraplayer/native/theora/lib/arm/armencfrag.s
  55. 162 0
      modules/theoraplayer/native/theora/lib/arm/armenquant.s
  56. 656 0
      modules/theoraplayer/native/theora/lib/arm/armfrag.asm
  57. 655 0
      modules/theoraplayer/native/theora/lib/arm/armfrag.s
  58. 1854 0
      modules/theoraplayer/native/theora/lib/arm/armidct.asm
  59. 1853 0
      modules/theoraplayer/native/theora/lib/arm/armidct.s
  60. 126 0
      modules/theoraplayer/native/theora/lib/arm/armint.h
  61. 677 0
      modules/theoraplayer/native/theora/lib/arm/armloop.asm
  62. 676 0
      modules/theoraplayer/native/theora/lib/arm/armloop.s
  63. 39 0
      modules/theoraplayer/native/theora/lib/arm/armopts-gnu.s
  64. 39 0
      modules/theoraplayer/native/theora/lib/arm/armopts.s
  65. 39 0
      modules/theoraplayer/native/theora/lib/arm/armopts.s.in
  66. 219 0
      modules/theoraplayer/native/theora/lib/arm/armstate.c
  67. 236 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armbits.asm
  68. 32 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armbits.h
  69. 127 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.c
  70. 29 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.h
  71. 57 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armenc.c
  72. 51 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armenc.h
  73. 668 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armfrag.asm
  74. 1886 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armidct.asm
  75. 126 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armint.h
  76. 691 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armloop.asm
  77. 219 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armstate.c
  78. 114 0
      modules/theoraplayer/native/theora/lib/bitpack.c
  79. 76 0
      modules/theoraplayer/native/theora/lib/bitpack.h
  80. 153 0
      modules/theoraplayer/native/theora/lib/c64x/c64xdec.c
  81. 33 0
      modules/theoraplayer/native/theora/lib/c64x/c64xdec.h
  82. 447 0
      modules/theoraplayer/native/theora/lib/c64x/c64xfrag.c
  83. 415 0
      modules/theoraplayer/native/theora/lib/c64x/c64xidct.c
  84. 67 0
      modules/theoraplayer/native/theora/lib/c64x/c64xint.h
  85. 39 0
      modules/theoraplayer/native/theora/lib/c64x/c64xstate.c
  86. 974 0
      modules/theoraplayer/native/theora/lib/collect.c
  87. 109 0
      modules/theoraplayer/native/theora/lib/collect.h
  88. 31 0
      modules/theoraplayer/native/theora/lib/dct.h
  89. 193 0
      modules/theoraplayer/native/theora/lib/decapiwrapper.c
  90. 250 0
      modules/theoraplayer/native/theora/lib/decinfo.c
  91. 186 0
      modules/theoraplayer/native/theora/lib/decint.h
  92. 2992 0
      modules/theoraplayer/native/theora/lib/decode.c
  93. 27 0
      modules/theoraplayer/native/theora/lib/defexp.awk
  94. 182 0
      modules/theoraplayer/native/theora/lib/dequant.c
  95. 27 0
      modules/theoraplayer/native/theora/lib/dequant.h
  96. 168 0
      modules/theoraplayer/native/theora/lib/encapiwrapper.c
  97. 379 0
      modules/theoraplayer/native/theora/lib/encfrag.c
  98. 121 0
      modules/theoraplayer/native/theora/lib/encinfo.c
  99. 845 0
      modules/theoraplayer/native/theora/lib/encint.h
  100. 1836 0
      modules/theoraplayer/native/theora/lib/encode.c

Binary
bananas/theoratest/konqi.ogv


+ 105 - 0
bananas/theoratest/theoratest.monkey2

@@ -0,0 +1,105 @@
+Namespace myapp
+
+#Import "<std>"
+#Import "<mojo>"
+#Import "<theoraplayer>"
+
+#Import "konqi.ogv"
+
+Using std..
+Using mojo..
+Using theoraplayer..
+
+Class MyWindow Extends Window
+
+	Field audiofactory:AudioInterfaceFactory
+	
+	Field vidman:VideoManager
+	
+	Field data:DataBuffer
+	
+	Field vidclip:VideoClip
+	
+	Field image:Image
+	
+	Field time:Double
+	
+	Field gain:float=1
+
+	Method New( title:String="Simple mojo app",width:Int=640,height:Int=480,flags:WindowFlags=WindowFlags.Resizable )
+
+		Super.New( title,width,height,flags )
+		
+		vidman=VideoManager.getInstance()
+		
+		audiofactory=New OpenAL_AudioInterfaceFactory
+		
+		vidman.setAudioInterfaceFactory( audiofactory )
+		
+		data=DataBuffer.Load( "asset::konqi.ogv" )
+		
+		vidclip=vidman.createVideoClip( data.Data,data.Length )
+'		vidclip=vidman.createVideoClip( AssetsDir()+"konqi.ogv" )
+		If Not vidclip Print "Can't load vidclip!"
+		
+		image=New Image( vidclip.getWidth(),vidclip.getHeight(),PixelFormat.RGB24,TextureFlags.Dynamic )
+		
+		vidclip.play()
+		
+		time=Now()
+		
+	End
+
+	Method OnRender( canvas:Canvas ) Override
+	
+		App.RequestRender()
+		
+		Local now:=Now()
+		
+		Local elapsed:=now-time
+		
+		time=now
+		
+		vidman.update( elapsed )
+	
+'		vidclip.updateTimerToNextFrame()	'play full speed...
+		
+		Local frame:=vidclip.fetchNextFrame()
+		
+		If frame
+		
+			Local pixmap:=New Pixmap( vidclip.getWidth(),vidclip.getHeight(),PixelFormat.RGB24,frame.getBuffer(),vidclip.getWidth()*3 )
+			
+			image.Texture.PastePixmap( pixmap,0,0 )
+			
+			vidclip.popFrame()
+		
+		Endif
+		
+		If Keyboard.KeyPressed( Key.Up )
+			gain=Min( gain+.125,1.0 )
+			vidclip.setAudioGain( gain )
+		Else If Keyboard.KeyPressed( Key.Down )
+			gain=Max( gain-.125,0.0 )
+			vidclip.setAudioGain( gain )
+		Endif
+		
+		canvas.BlendMode=BlendMode.Opaque
+		
+		canvas.DrawRect( 0,0,Width,Height,image )
+		
+		canvas.DrawText( "Time position="+vidclip.getTimePosition()+", duration="+vidclip.getDuration(),0,0 )
+		
+		canvas.DrawText( "Audio gain="+vidclip.getAudioGain(),0,16 )
+	End
+	
+End
+
+Function Main()
+
+	New AppInstance
+	
+	New MyWindow
+	
+	App.Run()
+End

Binary
bananas/zipfiletest/blah.zip


+ 24 - 0
bananas/zipfiletest/zipfiletest.monkey2

@@ -0,0 +1,24 @@
+
+'Note: ZipFile is VERY WIP!
+'
+#Import "<std>"
+
+#Import "blah.zip@/"
+
+Using std..
+
+Function Main()
+
+	Local zip:=ZipFile.Open( "asset::blah.zip" )
+	
+	For Local file:=Eachin zip.Files
+		Print file
+	Next
+	
+	Local str:=zip.ExtractData( "geom/affinemat3.monkey2" ).PeekString( 0 )
+	
+	Print str
+	
+	zip.Close()
+
+End

Binary
bin/mx2cc_linux


Binary
bin/mx2cc_macos


Binary
bin/mx2cc_windows.exe


+ 8 - 0
modules/mojo/app/window.monkey2

@@ -89,10 +89,18 @@ Class Window Extends View
 	End
 	
 	#rem monkeydoc Window fullscreen state.
+	
+	Note: The setter for this property is deprecated! Please use BeginFullscreen/EndFullscreen instead.
+	
 	#end
 	Property Fullscreen:Bool()
 	
 		Return Cast<SDL_WindowFlags>( SDL_GetWindowFlags( _sdlWindow ) ) & SDL_WINDOW_FULLSCREEN
+		
+	Setter( fullscreen:Bool )
+		If fullscreen=Fullscreen Return
+	
+		If fullscreen BeginFullscreen() Else EndFullscreen()
 	End
 
 	#rem monkeydoc Window maximized state.

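A minimal usage sketch (not part of this commit) for the calls the new note recommends, written as it might appear inside a Window subclass such as the MyWindow banana above; Keyboard.KeyHit and Key.F11 are assumed here and are not shown in this diff:

	' Toggle fullscreen with the preferred BeginFullscreen/EndFullscreen calls
	' instead of assigning to the deprecated Fullscreen setter.
	If Keyboard.KeyHit( Key.F11 )	'Keyboard.KeyHit and Key.F11 assumed
		If Fullscreen EndFullscreen() Else BeginFullscreen()
	Endif
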
+ 9 - 0
modules/mojo/graphics/image.monkey2

@@ -72,6 +72,15 @@ Class Image Extends Resource
 		AddDependancy( texture )
 	End
 
+	Method New( width:Int,height:Int,format:PixelFormat,textureFlags:TextureFlags=Null,shader:Shader=Null )
+	
+		Local texture:=New Texture( width,height,format,textureFlags )
+		
+		Init( texture,texture.Rect,shader )
+		
+		AddDependancy( texture )
+	End
+
 	Method New( image:Image )
 	
 		Init( image._textures[0],image._rect,image._shader )

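A short sketch (not part of this commit) exercising the new width/height Image constructor, following the same streaming pattern as the theoratest banana above; the 320x240 size and the empty DataBuffer stand in for real pixel data:

	' Create a dynamic RGB image and upload raw pixels into its texture.
	Local image:=New Image( 320,240,PixelFormat.RGB24,TextureFlags.Dynamic )

	Local buf:=New DataBuffer( 320*240*3 )	'raw RGB24 pixels would go here
	Local pixmap:=New Pixmap( 320,240,PixelFormat.RGB24,buf.Data,320*3 )

	image.Texture.PastePixmap( pixmap,0,0 )
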
+ 1 - 1
modules/monkey/native/bbarray.h

@@ -190,7 +190,7 @@ template<class T,int D> bbString bbDBType( bbArray<T,D> *p ){
 
 template<class T,int D> bbString bbDBValue( bbArray<T,D> *p ){
 	char buf[64];
-	sprintf( buf,"@%p",p->_rep );
+	sprintf( buf,"@%p",*(void**)(&p->_rep) );
 	return buf;
 }
 

+ 0 - 2
modules/std/geom/affinemat3.monkey2

@@ -144,8 +144,6 @@ Struct AffineMat3<T>
 		Return New AffineMat3( sv.x,0,0,sv.y,0,0 )
 	End
 
-	#rem monkeydoc @hidden
-	#end
 	Function Ortho:AffineMat3( left:T,right:T,bottom:T,top:T )
 
 		Local w:=right-left,h:=top-bottom

+ 64 - 27
modules/std/geom/affinemat4.monkey2

@@ -15,33 +15,33 @@ Affine 4x4 matrices are often used for 3d transformations such as scaling, rotat
 Struct AffineMat4<T>
 
 	Field m:Mat3<T>
-	Field v:Vec3<T>
+	Field t:Vec3<T>
 	
 	Method New()
 		m.i.x=1; m.j.y=1; m.k.z=1
 	End
 	
-	Method New( m:Mat3<T>,v:Vec3<T> )
-		Self.m=m; Self.v=v
+	Method New( m:Mat3<T>,t:Vec3<T> )
+		Self.m=m; Self.t=t
 	End
 	
 	Method New( m:Mat3<T> )
 		Self.m=m
 	End
 	
-	Method New( v:Vec3<T> )
-		m.i.x=1; m.j.y=1; m.k.z=1 ; Self.v=v
+	Method New( t:Vec3<T> )
+		m.i.x=1; m.j.y=1; m.k.z=1 ; Self.t=t
 	End
 	
-	Method New( i:Vec3<T>,j:Vec3<T>,k:Vec3<T>,v:Vec3<T> )
-		m.i=i; m.j=j; m.k=k; Self.v=v
+	Method New( i:Vec3<T>,j:Vec3<T>,k:Vec3<T>,t:Vec3<T> )
+		m.i=i; m.j=j; m.k=k; Self.t=t
 	End
 
 	Method New( ix:T,iy:T,iz:T,jx:T,jy:T,jz:T,kx:T,ky:T,kz:T,vx:T,vy:T,vz:T )
 		m.i.x=ix; m.i.y=iy; m.i.z=iz
 		m.j.x=jx; m.j.y=jy; m.j.z=jz
 		m.k.x=kx; m.k.y=ky; m.k.z=kz
-		v.x=vx; v.y=vy; v.z=vz
+		t.x=vx; t.y=vy; t.z=vz
 	End
 	
 	#rem monkeydoc Converts the matrix to a matrix of a different type.
@@ -53,72 +53,109 @@ Struct AffineMat4<T>
 	#rem monkeydoc Converts the matrix to a printable string.
 	#end
 	Operator To:String()
-		Return "AffineMat4("+m+","+v+")"
+		Return "AffineMat4("+m+","+t+")"
 	End
 	
 	#rem monkeydoc Returns the transpose of the matrix.
 	#End
 	Operator~:AffineMat4()
 		Local i:=~m
-		Return New AffineMat4( i,i*-v )
+		Return New AffineMat4( i,i*-t )
 	End
 	
 	#rem monkeydoc Returns the inverse of the matrix.
 	#end
 	Operator-:AffineMat4()
 		Local i:=-m
-		Return New AffineMat4( i,i*-v )
+		Return New AffineMat4( i,i*-t )
 	End
 	
 	#rem monkeydoc Multiplies the matrix by another matrix and returns the result.
 	#end
 	Operator*:AffineMat4( q:AffineMat4 )
-		Return New AffineMat4( m*q.m,m*q.v+v )
+		Return New AffineMat4( m*q.m,m*q.t+t )
 	End
 	
 	#rem monkeydoc Multiplies a vector by the matrix and returns the result.
 	#end
 	Operator*:Vec3<T>( v:Vec3<T> )
 		Return New Vec3<T>( 
-			m.i.x*v.x+m.j.x*v.y+m.k.x*v.z+v.x,
-			m.i.y*v.x+m.j.y*v.y+m.k.y*v.z+v.y,
-			m.i.z*v.x+m.j.z*v.y+m.k.z*v.z+v.z )
+			m.i.x*v.x+m.j.x*v.y+m.k.x*v.z+t.x,
+			m.i.y*v.x+m.j.y*v.y+m.k.y*v.z+t.y,
+			m.i.z*v.x+m.j.z*v.y+m.k.z*v.z+t.z )
 	End
 
 	#rem monkeydoc Applies a translation transformation to the matrix and returns the result.
 	#end
-	Method Translate:AffineMat4( tv:Vec3<T> )
-		Return Self * TranslationMatrix( tv )
+	Method Translate:AffineMat4( tx:T,ty:T,tz:T )
+		Return Self * Translation( tx,ty,tz )
 	End
 	
+	Method Translate:AffineMat4( tv:Vec3<T> )
+		Return Self * Translation( tv )
+	End
+
 	#rem monkeydoc Applies a rotation transformation to the matrix and returns the result.
 	#end
-	Method Rotate:AffineMat4( rv:Vec3<T> )
-		Return Self * RotationMatrix( rv )
+	Method Rotate:AffineMat4( rx:Double,ry:Double,rz:Double )
+		Return Self * Rotation( rx,ry,rz )
+	End
+
+	Method Rotate:AffineMat4( rv:Vec3<Double> )
+		Return Self * Rotation( rv )
 	End
 	
 	#rem monkeydoc Applies a scaling transformation to the matrix and returns the result.
 	#end
-	Method Scale:AffineMat4( rv:Vec3<T> )
-		Return Self * ScalingMatrix( rv )
+	Method Scale:AffineMat4( sx:T,sy:T,sz:T )
+		Return Self * Scaling( sx,sy,sz )
+	End
+	
+	Method Scale:AffineMat4( sv:Vec3<T> )
+		Return Self * Scaling( sv )
 	End
 	
 	#rem monkeydoc Creates a translation matrix.
 	#end
-	Function TranslationMatrix:AffineMat4( tv:Vec3<T> )
+	Function Translation:AffineMat4( tv:Vec3<T> )
 		Return New AffineMat4( tv )
 	End
 	
-	#rem monkeydoc Creates a rotation matrix.
+	Function Translation:AffineMat4( tx:T,ty:T,tz:T )
+		Return New AffineMat4( New Vec3<T>( tx,ty,tz ) )
+	End
+
+	#rem monkeydoc Creates a rotation matrix from a quaternion.
 	#end
-	Function RotationMatrix:AffineMat4( rv:Vec3<T> )
-		Return New AffineMat4( Mat3<T>.RotationMatrix( rv ) )
+	Function Rotation:AffineMat4( quat:Quat<T> )
+		Return New AffineMat4( Mat3<T>.Rotation( quat ) )
+	End
+	
+	#rem monkeydoc Creates a rotation matrix from euler angles.
+	
+	Order of rotation is Yaw * Pitch * Roll.
+	
+	#end
+	Function Rotation:AffineMat4( rv:Vec3<Double> )
+		Return New AffineMat4( Mat3<T>.Rotation( rv ) )
+	End
+	
+	Function Rotation:AffineMat4( rx:Double,ry:Double,rz:Double )
+		Return New AffineMat4( Mat3<T>.Rotation( rx,ry,rz ) )
 	End
 	
 	#rem monkeydoc Creates a scaling matrix.
 	#end
-	Function ScalingMatrix:AffineMat4( sv:Vec3<T> )
-		Return New AffineMat4( Mat3<T>.ScalingMatrix( sv ) )
+	Function Scaling:AffineMat4( sv:Vec3<T> )
+		Return New AffineMat4( Mat3<T>.Scaling( sv ) )
+	End
+	
+	Function Scaling:AffineMat4( sx:T,sy:T,sz:T )
+		Return New AffineMat4( Mat3<T>.Scaling( sx,sy,sz ) )
+	End
+	
+	Function Scaling:AffineMat4( t:T )
+		Return Scaling( t,t,t )
 	End
 	
 End

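A small sketch (not part of this commit) composing a transform with the renamed factory functions and applying it to a point; the Float instantiation and the values are arbitrary:

	' Translate, then yaw by Pi/2, then scale uniformly by 2.
	Local m:=AffineMat4<Float>.Translation( 10,0,0 ) * AffineMat4<Float>.Rotation( 0,Pi/2,0 ) * AffineMat4<Float>.Scaling( 2 )

	Local p:=m * New Vec3<Float>( 1,0,0 )	'transform a point
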
+ 21 - 0
modules/std/geom/box.monkey2

@@ -9,12 +9,21 @@ Alias Boxf:Box<Float>
 #end
 Struct Box<T>
 
+	Const FullBounds:=New Box( -1000000,-1000000,-1000000,1000000,1000000,1000000 )
+
+	Const EmptyBounds:=New Box( 1000000,1000000,1000000,-1000000,-1000000,-1000000 )
+	
 	Field min:Vec3<T>
 	Field max:Vec3<T>
 	
 	Method New()
 	End
 	
+	Method New( min:T,max:T )
+		Self.min=New Vec3<T>( min )
+		Self.max=New Vec3<T>( max )
+	End
+	
 	Method New( min:Vec3<T>,max:Vec3<T> )
 		Self.min=min
 		Self.max=max
@@ -25,10 +34,22 @@ Struct Box<T>
 		max.x=x1;max.y=y1;max.z=z1
 	End
 	
+	Operator To:String()
+		Return "Box("+min+","+max+")"
+	End
+	
+	Operator To<C>:Box<C>()
+		Return New Box<C>( min,max )
+	End
+	
 	Property Empty:Bool()
 		Return max.x<=min.x Or max.y<=min.y Or max.z<=min.z
 	End
 	
+	Property Center:Vec3<T>()
+		Return (min+max)/2
+	End
+	
 	Property Size:Vec3<T>()
 		Return max-min
 	End

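A tiny sketch (not part of this commit) of the new single-value constructor and the Center property; the bounds are arbitrary:

	Local b:=New Boxf( -1,1 )	'box from (-1,-1,-1) to (1,1,1)

	Print "center="+b.Center+", size="+b.Size	'center is the origin, size is 2 per axis
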
+ 121 - 41
modules/std/geom/mat3.monkey2

@@ -22,12 +22,6 @@ Struct Mat3<T>
 	End
 	
 	Method New( q:Quat<T> )
-		Local xx:=q.v.x*q.v.x , yy:=q.v.y*q.v.y , zz:=q.v.z*q.v.z
-		Local xy:=q.v.x*q.v.y , xz:=q.v.x*q.v.z , yz:=q.v.y*q.v.z
-		Local wx:=q.w*q.v.x   , wy:=q.w*q.v.y   , wz:=q.w*q.v.z
-		i.x=1-2*(yy+zz) ; i.y=  2*(xy-wz) ; i.z=  2*(xz+wy)
-		j.x=  2*(xy+wz) ; j.y=1-2*(xx+zz) ; j.z=  2*(yz-wx)
-		k.x=  2*(xz-wy) ; k.y=  2*(yz+wx) ; k.z=1-2*(xx+yy)
 	End
 	
 	Method New( ix:Float,jy:Float,kz:Float )
@@ -40,39 +34,16 @@ Struct Mat3<T>
 		k.x=kx; k.y=ky; k.z=kz
 	End
 	
-	Method To<C>:Mat3<C>()
+	Operator To<C>:Mat3<C>()
 		Return New Mat3<C>( i,j,k )
 	End
 	
-	Method To:String()
+	Operator To:String()
 		Return "Mat3("+i+","+j+","+k+")"
 	End
 	
-	Method To:Quat<T>()
-		Return New Quat<T>( Self )
-	End
-	
-	Property Determinant:Double()
-		return i.x*(j.y*k.z-j.z*k.y )-i.y*(j.x*k.z-j.z*k.x )+i.z*(j.x*k.y-j.y*k.x )
-	End
-	
-	Property Cofactor:Mat3()
-		Return New Mat3(
-			 (j.y*k.z-j.z*k.y),-(j.x*k.z-j.z*k.x), (j.x*k.y-j.y*k.x),
-			-(i.y*k.z-i.z*k.y), (i.x*k.z-i.z*k.x),-(i.x*k.y-i.y*k.x),
-			 (i.y*j.z-i.z*j.y),-(i.x*j.z-i.z*j.x), (i.x*j.y-i.y*j.x) )
-	End
-	
-	Property Pitch:Double()
-		Return k.Pitch
-	End
-	
-	Property Yaw:Double()
-		Return k.Yaw
-	End
-	
-	Property Roll:Double()
-		Return ATan2( i.y,j.y )
+	Property Determinant:T()
+		Return i.x*(j.y*k.z-j.z*k.y )-i.y*(j.x*k.z-j.z*k.x )+i.z*(j.x*k.y-j.y*k.x )
 	End
 	
 	Operator~:Mat3()
@@ -102,12 +73,83 @@ Struct Mat3<T>
 		Return New Vec3<T>( i.x*v.x+j.x*v.y+k.x*v.z,i.y*v.x+j.y*v.y+k.y*v.z,i.z*v.x+j.z*v.y+k.z*v.z )
 	End
 	
+	Method GetCofactor:Mat3()
+		Return New Mat3(
+			 (j.y*k.z-j.z*k.y),-(j.x*k.z-j.z*k.x), (j.x*k.y-j.y*k.x),
+			-(i.y*k.z-i.z*k.y), (i.x*k.z-i.z*k.x),-(i.x*k.y-i.y*k.x),
+			 (i.y*j.z-i.z*j.y),-(i.x*j.z-i.z*j.x), (i.x*j.y-i.y*j.x) )
+	End
+	
+	Method GetPitch:Double()
+		Return k.Pitch
+	End
+	
+	Method GetYaw:Double()
+		Return k.Yaw
+	End
+	
+	Method GetRoll:Double()
+		Return ATan2( i.y,j.y )
+	End
+	
+	Method GetRotation:Vec3<T>()
+		Return New Vec3<T>( GetPitch(),GetYaw(),GetRoll() )
+	End
+	
+	Method GetQuat:Quat<T>()
+		Local r:Quat<T>
+		Local m:=Orthogonalize()
+		Local t:=m.i.x+m.j.y+m.k.z
+		If t>EPSILON
+			t=Sqrt( t+1 )*2
+			r.v.x=(m.k.y-m.j.z)/t
+			r.v.y=(m.i.z-m.k.x)/t
+			r.v.z=(m.j.x-m.i.y)/t
+			r.w=t/4
+		Else If m.i.x>m.j.y And m.i.x>m.k.z
+			t=Sqrt( m.i.x-m.j.y-m.k.z+1 )*2
+			r.v.x=t/4
+			r.v.y=(m.j.x+m.i.y)/t
+			r.v.z=(m.i.z+m.k.x)/t
+			r.w=(m.k.y-m.j.z)/t
+		Else If m.j.y>m.k.z
+			t=Sqrt( m.j.y-m.k.z-m.i.x+1 )*2
+			r.v.x=(m.j.x+m.i.y)/t
+			r.v.y=t/4
+			r.v.z=(m.k.y+m.j.z)/t
+			r.w=(m.i.z-m.k.x)/t
+		Else
+			t=Sqrt( m.k.z-m.j.y-m.i.x+1 )*2
+			r.v.x=(m.i.z+m.k.x)/t
+			r.v.y=(m.k.y+m.j.z)/t
+			r.v.z=t/4
+			r.w=(m.j.x-m.i.y)/t
+		Endif
+		Return r
+	End
+	
+	Method GetScaling:Vec3<T>()
+		Return New Vec3<T>( i.Length,j.Length,k.Length )
+	End
+	
 	Method Rotate:Mat3( rv:Vec3<T> )
-		Return Self * RotationMatrix( rv )
+		Return Self * Rotation( rv )
+	End
+	
+	Method Rotate:Mat3( rx:Double,ry:Double,rz:Double )
+		Return Self * Rotation( rx,ry,rz )
 	End
 	
 	Method Scale:Mat3( rv:Vec3<T> )
-		Return Self * ScalingMatrix( rv )
+		Return Self * Scaling( rv )
+	End
+
+	Method Scale:Mat3( sx:T,sy:T,sz:T )
+		Return Self * Scaling( sx,sy,sz )
+	End
+	
+	Method Scale:Mat3( t:T )
+		Return Self * Scaling( t )
 	End
 
 	Method Orthogonalize:Mat3()
@@ -115,27 +157,65 @@ Struct Mat3<T>
 		Return New Mat3( j.Cross( k ).Normalize(),k.Cross( i ).Normalize(),k )
 	End
 	
-	Function YawMatrix:Mat3( an:Double )
+	#rem monkeydoc Creates a yaw (y axis) rotation matrix.
+	#end
+	Function Yaw:Mat3( an:Double )
 		Local sin:=Sin(an),cos:=Cos(an)
 		Return New Mat3( cos,0,sin, 0,1,0, -sin,0,cos )
 	End
 	
-	Function PitchMatrix:Mat3( an:Double )
+	#rem monkeydoc Creates a pitch (x axis) rotation matrix.
+	#end
+	Function Pitch:Mat3( an:Double )
 		Local sin:=Sin(an),cos:=Cos(an)
 		return New Mat3( 1,0,0, 0,cos,sin, 0,-sin,cos )
 	End
 	
-	Function RollMatrix:Mat3( an:Double )
+	#rem monkeydoc Creates a roll (z axis) rotation matrix.
+	#end
+	Function Roll:Mat3( an:Double )
 		Local sin:=Sin(an),cos:=Cos(an)
 		Return New Mat3( cos,sin,0, -sin,cos,0, 0,0,1 )
 	End
 	
-	Function RotationMatrix:Mat3( rv:Vec3<T> )
-		Return YawMatrix( rv.y ) * PitchMatrix( rv.x ) * RollMatrix( rv.z )
+	#rem monkeydoc Creates a rotation matrix from a quaternion.
+	#end
+	Function Rotation:Mat3( quat:Quat<T> )
+		Local xx:=quat.v.x*quat.v.x , yy:=quat.v.y*quat.v.y , zz:=quat.v.z*quat.v.z
+		Local xy:=quat.v.x*quat.v.y , xz:=quat.v.x*quat.v.z , yz:=quat.v.y*quat.v.z
+		Local wx:=quat.w*quat.v.x   , wy:=quat.w*quat.v.y   , wz:=quat.w*quat.v.z
+		Local r:Mat3
+		r.i.x=1-2*(yy+zz) ; r.i.y=  2*(xy-wz) ; r.i.z=  2*(xz+wy)
+		r.j.x=  2*(xy+wz) ; r.j.y=1-2*(xx+zz) ; r.j.z=  2*(yz-wx)
+		r.k.x=  2*(xz-wy) ; r.k.y=  2*(yz+wx) ; r.k.z=1-2*(xx+yy)
+		Return r
+	End
+	
+	#rem monkeydoc Creates a rotation matrix from euler angles.
+	
+	Order of rotation is Yaw * Pitch * Roll.
+	
+	#end
+	Function Rotation:Mat3( rv:Vec3<Double> )
+		Return Yaw( rv.y ) * Pitch( rv.x ) * Roll( rv.z )
 	End
 	
-	Function ScalingMatrix:Mat3( sv:Vec3<T> )
+	Function Rotation:Mat3( rx:Double,ry:Double,rz:Double )
+		Return Yaw( ry ) * Pitch( rx ) * Roll( rz )
+	End
+
+	#rem monkeydoc Creates a scaling matrix.
+	#end
+	Function Scaling:Mat3( sv:Vec3<T> )
 		Return New Mat3( sv.x,sv.y,sv.z )
 	End
 
+	Function Scaling:Mat3( sx:T,sy:T,sz:T )
+		Return New Mat3( sx,sy,sz )
+	End
+	
+	Function Scaling:Mat3( t:T )
+		Return New Mat3( t,t,t )
+	End
+	
 End

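A short round-trip sketch (not part of this commit) using the new Rotation factories together with GetQuat; angles are radians and the values are arbitrary:

	Local m:=Mat3<Float>.Rotation( 0,Pi/4,0 )	'yaw by Pi/4

	Local q:=m.GetQuat()				'extract the rotation as a quaternion

	Local m2:=Mat3<Float>.Rotation( q )		'rebuild an equivalent matrix from it
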
+ 119 - 14
modules/std/geom/mat4.monkey2

@@ -26,11 +26,16 @@ Struct Mat4<T>
 		Self.i=i;Self.j=j;Self.k=k;Self.t=t
 	End
 	
+	Method New( m:Mat3<T> )
+		i.XYZ=m.i ; j.XYZ=m.j ; k.XYZ=m.k ; t.w=1
+	End
+	
 	Method New( m:AffineMat3<T> )
-		i.x=m.i.x;i.y=m.i.y
-		j.x=m.j.x;j.y=m.j.y
-		k.z=1
-		t.x=m.t.x;t.y=m.t.y;t.w=1
+		i.XY=m.i ; j.XY=m.j ; k.z=1 ; t.XY=m.t ; t.w=1
+	End
+	
+	Method New( m:AffineMat4<T> )
+		i.XYZ=m.m.i ; j.XYZ=m.m.j ; k.XYZ=m.m.k ; t.XYZ=m.t ; t.w=1
 	End
 	
 	Operator*:Mat4( m:Mat4 )
@@ -59,25 +64,125 @@ Struct Mat4<T>
 		Return r
 	End
 	
-	Function Translation:Mat4( tx:T,ty:T,tz:T )
-		Local r:=New Mat4
-		r.t.x=tx;r.t.y=ty;r.t.z=tz
+	Operator*:Mat4( m:AffineMat4<T> )
+
+		Local r:Mat4
+		
+		r.i.x=i.x*m.m.i.x + j.x*m.m.i.y + k.x*m.m.i.z
+		r.i.y=i.y*m.m.i.x + j.y*m.m.i.y + k.y*m.m.i.z
+		r.i.z=i.z*m.m.i.x + j.z*m.m.i.y + k.z*m.m.i.z
+		r.i.w=i.w*m.m.i.x + j.w*m.m.i.y + k.w*m.m.i.z
+		
+		r.j.x=i.x*m.m.j.x + j.x*m.m.j.y + k.x*m.m.j.z
+		r.j.y=i.y*m.m.j.x + j.y*m.m.j.y + k.y*m.m.j.z
+		r.j.z=i.z*m.m.j.x + j.z*m.m.j.y + k.z*m.m.j.z
+		r.j.w=i.w*m.m.j.x + j.w*m.m.j.y + k.w*m.m.j.z
+		
+		r.k.x=i.x*m.m.k.x + j.x*m.m.k.y + k.x*m.m.k.z
+		r.k.y=i.y*m.m.k.x + j.y*m.m.k.y + k.y*m.m.k.z
+		r.k.z=i.z*m.m.k.x + j.z*m.m.k.y + k.z*m.m.k.z
+		r.k.w=i.w*m.m.k.x + j.w*m.m.k.y + k.w*m.m.k.z
+		
+		r.t.x=i.x*m.t.x   + j.x*m.t.y   + k.x*m.t.z + t.x
+		r.t.y=i.y*m.t.x   + j.y*m.t.y   + k.y*m.t.z + t.y
+		r.t.z=i.z*m.t.x   + j.z*m.t.y   + k.z*m.t.z + t.z
+		r.t.w=i.w*m.t.x   + j.w*m.t.y   + k.w*m.t.z + t.w
+		
 		Return r
 	End
 	
-	Function Scale:Mat4( sx:Float,sy:Float,sz:Float )
+	Operator*:Mat4( m:Mat3<T> )
+
 		Local r:Mat4
-		r.i.x=sx;r.j.y=sy;r.k.z=sz;r.t.w=1
+		
+		r.i.x=i.x*m.i.x + j.x*m.i.y + k.x*m.i.z
+		r.i.y=i.y*m.i.x + j.y*m.i.y + k.y*m.i.z
+		r.i.z=i.z*m.i.x + j.z*m.i.y + k.z*m.i.z
+		r.i.w=i.w*m.i.x + j.w*m.i.y + k.w*m.i.z
+		
+		r.j.x=i.x*m.j.x + j.x*m.j.y + k.x*m.j.z
+		r.j.y=i.y*m.j.x + j.y*m.j.y + k.y*m.j.z
+		r.j.z=i.z*m.j.x + j.z*m.j.y + k.z*m.j.z
+		r.j.w=i.w*m.j.x + j.w*m.j.y + k.w*m.j.z
+		
+		r.k.x=i.x*m.k.x + j.x*m.k.y + k.x*m.k.z
+		r.k.y=i.y*m.k.x + j.y*m.k.y + k.y*m.k.z
+		r.k.z=i.z*m.k.x + j.z*m.k.y + k.z*m.k.z
+		r.k.w=i.w*m.k.x + j.w*m.k.y + k.w*m.k.z
+		
+		r.t.x=t.x
+		r.t.y=t.y
+		r.t.z=t.z
+		r.t.w=t.w
+		
+		Return r
+	End
+	
+	#rem monkeydoc Creates a translation matrix.
+	#end
+	Function Translation:Mat4( tv:Vec3<T> )
+		Return Translation( tv.x,tv.y,tv.z )
+	End
+	
+	Function Translation:Mat4( tx:T,ty:T,tz:T )
+		Local r:=New Mat4
+		r.t.x=tx;r.t.y=ty;r.t.z=tz;r.t.w=1
 		Return r
 	End
+
+	#rem monkeydoc Creates a rotation matrix.
+	#end
+	Function Rotation:Mat4( rv:Vec3<Double> )
+		Return Rotation( rv.x,rv.y,rv.z )
+	End
+	
+	Function Rotation:Mat4( rx:Double,ry:Double,rz:Double )
+		Return New Mat4( Mat3<T>.Rotation( rx,ry,rz ) )
+	End
+	
+	#rem monkeydoc Creates a scaling matrix.
+	#end
+	Function Scaling:Mat4( sx:T,sy:T,sz:T )
+		Return New Mat4( sx,sy,sz,1 )
+	End
+	
+	Function Scaling:Mat4( sv:Vec3<T> )
+		Return Scaling( sv.x,sv.y,sv.z )
+	End
 	
-	Function Ortho:Mat4( left:Float,right:Float,bottom:Float,top:Float,near:Float,far:Float )
+	Function Scaling:Mat4( t:T )
+		Return Scaling( t,t,t )
+	End
 
-		Local w:=right-left,h:=top-bottom,d:=far-near
+	#rem monkeydoc Creates an orthographic projection matrix.
+	#End	
+	Function Ortho:Mat4( left:Double,right:Double,bottom:Double,top:Double,near:Double,far:Double )
 
-		Local r:Mat4
-		r.i.x=2/w ; r.j.y=2/h ; r.k.z=2/d
-		r.t=New Vec4<T>( -(right+left)/w,-(top+bottom)/h,-(far+near)/d,1 )
+		Local w:=right-left,h:=top-bottom,d:=far-near,r:Mat4
+
+		r.i.x=2/w
+		r.j.y=2/h
+		r.k.z=2/d
+		r.t.x=-(right+left)/w
+		r.t.y=-(top+bottom)/h
+		r.t.z=-(far+near)/d
+		r.t.w=1
+
+		Return r
+	End
+	
+	Function Frustum:Mat4( left:Double,right:Double,bottom:Double,top:Double,near:Double,far:Double )
+	
+		Local w:=right-left,h:=top-bottom,d:=far-near,near2:=near*2,r:Mat4
+
+		r.i.x=near2/w
+		r.j.y=near2/h
+		r.k.x=(right+left)/w
+		r.k.y=(top+bottom)/h
+		r.k.z=(far+near)/d
+		r.k.w=1
+		r.t.z=-(far*near2)/d
+		
 		Return r
 	End
 	

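The new Frustum function is the natural building block for a perspective projection. A hypothetical helper (not part of this commit; the name, signature and Float instantiation are assumed) sketching that use:

	Function Perspective:Mat4<Float>( fovy:Double,aspect:Double,near:Double,far:Double )

		'half extents of the near plane from the vertical field of view (radians)
		Local h:=Tan( fovy/2 )*near
		Local w:=h*aspect

		Return Mat4<Float>.Frustum( -w,w,-h,h,near,far )
	End
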
+ 32 - 52
modules/std/geom/quat.monkey2

@@ -28,36 +28,6 @@ Class Quat<T>
 		v.x=vx ; v.y=vy ; v.z=vz ; Self.w=w
 	End
 	
-	Method New( m:Mat3<T> )
-		m=m.Orthogonalize()
-		Local t:=m.i.x+m.j.y+m.k.z
-		If t>EPSILON
-			t=Sqrt( t+1 )*2
-			v.x=(m.k.y-m.j.z)/t
-			v.y=(m.i.z-m.k.x)/t
-			v.z=(m.j.x-m.i.y)/t
-			w=t/4
-		Else If m.i.x>m.j.y And m.i.x>m.k.z
-			t=Sqrt( m.i.x-m.j.y-m.k.z+1 )*2
-			v.x=t/4
-			v.y=(m.j.x+m.i.y)/t
-			v.z=(m.i.z+m.k.x)/t
-			w=(m.k.y-m.j.z)/t
-		Else If m.j.y>m.k.z
-			t=Sqrt( m.j.y-m.k.z-m.i.x+1 )*2
-			v.x=(m.j.x+m.i.y)/t
-			v.y=t/4
-			v.z=(m.k.y+m.j.z)/t
-			w=(m.i.z-m.k.x)/t
-		Else
-			t=Sqrt( m.k.z-m.j.y-m.i.x+1 )*2
-			v.x=(m.i.z+m.k.x)/t
-			v.y=(m.k.y+m.j.z)/t
-			v.z=t/4
-			w=(m.j.x-m.i.y)/t
-		Endif
-	End
-	
 	Operator To<C>:Quat<C>()
 		Return New Quat<C>( v,w )
 	End
@@ -66,10 +36,6 @@ Class Quat<T>
 		Return "Quat("+v+","+w+")"
 	End
 	
-	Operator To:Mat3<T>()
-		Return New Mat3<T>( Self )
-	End
-	
 	Property Length:Double()
 		Return Sqrt( v.Dot(v) + w*w )
 	End
@@ -85,26 +51,14 @@ Class Quat<T>
 		Local yz:=v.y*v.z , wx:=w*v.x
 		Local xy:=v.x*v.y , wz:=w*v.z
 		Local xx:=v.x*v.x , zz:=v.z*v.z
-		return New Vec3<T>( 2*(xy+wz),1-2*(xx+zz),2*(yz-wx) )
+		Return New Vec3<T>( 2*(xy+wz),1-2*(xx+zz),2*(yz-wx) )
 	End
 	
 	Property K:Vec3<T>()
 		Local xz:=v.x*v.z , wy:=w*v.y
 		Local yz:=v.y*v.z , wx:=w*v.x
 		Local xx:=v.x*v.x , yy:=v.y*v.y
-		return New Vec3<T>( 2*(xz-wy),2*(yz+wx),1-2*(xx+yy) )
-	End
-	
-	Property Yaw:Double()
-		Return K.Yaw
-	End
-	
-	Property Pitch:Double()
-		Return K.Pitch
-	End
-	
-	Property Roll:Double()
-		Return ATan2( I.y,J.y )
+		Return New Vec3<T>( 2*(xz-wy),2*(yz+wx),1-2*(xx+yy) )
 	End
 	
 	Operator-:Quat()
@@ -135,6 +89,18 @@ Class Quat<T>
 		Return New Quat( v/t,w/t )
 	End
 	
+	Method GetYaw:Double()
+		Return K.Yaw
+	End
+	
+	Method GetPitch:Double()
+		Return K.Pitch
+	End
+	
+	Method GetRoll:Double()
+		Return ATan2( I.y,J.y )
+	End
+	
 	Method Dot:Double( q:Quat )
 		Return v.x*q.v.x + v.y*q.v.y + v.z*q.v.z + w*q.w
 	End
@@ -161,10 +127,24 @@ Class Quat<T>
 		Return Self*b + t*a
 	End
 	
-'	Function RotationQuat:Quat( rv:Vec3<T> )
-'	End
+	Function Pitch:Quat( r:Double )
+		Return New Quat( Sin( r/2 ),0,0,Cos( r/2 ) )
+	End
+
+	Function Yaw:Quat( r:Double )
+		Return New Quat( 0,Sin( r/2 ),0,Cos( r/2 ) )
+	End
+
+	Function Roll:Quat( r:Double )
+		Return New Quat( 0,0,Sin( r/2 ),Cos( r/2 ) )
+	End
+
+	Function Rotation:Quat( rv:Vec3<Double> )
+		Return Yaw( rv.y ) * Pitch( rv.x ) * Roll( rv.z )
+	End
 	
-'	Function AxisAngleQuat:Quat( axis:Vec3<T>,angle:Double )
-'	End
+	Function Rotation:Quat( rx:Double,ry:Double,rz:Double )
+		Return Yaw( ry ) * Pitch( rx ) * Roll( rz )
+	End
 
 End

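A brief sketch (not part of this commit) of the new quaternion factories alongside the matching Mat3 ones; angles are radians and the values are arbitrary:

	Local q:=Quat<Float>.Rotation( 0,Pi/2,0 )	'yaw by Pi/2

	Local m:=Mat3<Float>.Rotation( q )		'the same rotation as a 3x3 matrix

	Print "yaw="+q.GetYaw()
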
+ 6 - 0
modules/std/geom/vec3.monkey2

@@ -54,6 +54,12 @@ Struct Vec3<T>
 		Self.z=z
 	End
 	
+	Property XY:Vec2<T>()
+		Return New Vec2<T>( x,y )
+	Setter( xy:Vec2<T> )
+		x=xy.x;y=xy.y
+	End
+	
 	Operator-:Vec3()
 		Return New Vec3( -x,-y,-z )
 	End

+ 20 - 0
modules/std/geom/vec4.monkey2

@@ -57,6 +57,26 @@ Struct Vec4<T>
 		Self.w=w
 	End
 	
+	Property XY:Vec2<T>()
+		Return New Vec2<T>( x,y )
+	Setter( xy:Vec2<T> )
+		x=xy.x;y=xy.y
+	End
+	
+	Property XYZ:Vec3<T>()
+		Return New Vec3<T>( x,y,z )
+	Setter( xyz:Vec3<T> )
+		x=xyz.x;y=xyz.y;z=xyz.z
+	End
+	
+	Operator To:String()
+		Return "Vec4("+x+","+y+","+z+","+w+")"
+	End
+	
+	Operator To<C>:Vec4<C>()
+		Return New Vec4<C>( x,y,z,w )
+	End
+	
 	Operator-:Vec4()
 		Return New Vec4( -x,-y,-z,-w )
 	End

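A quick sketch (not part of this commit) of the new swizzle properties; the four-component Vec4 constructor and Vec2<Float> are assumed to exist as elsewhere in std.geom:

	Local v:=New Vec4<Float>( 1,2,3,1 )

	Local p:=v.XYZ				'drop w

	v.XY=New Vec2<Float>( 5,6 )		'write through the swizzle setter
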
+ 36 - 1
modules/std/misc/zipfile.monkey2

@@ -3,21 +3,54 @@ Namespace std.zipfile
 
 Using miniz
 
-#rem monkeydoc @hidden
+#rem monkeydoc The ZipFile class.
 #end
 Class ZipFile
 
+	#rem monkeydoc Array of all files contained in the zip.
+	#end
 	Property Files:String[]()
 		Return _files
 	End
 	
+	#rem monkeydoc Closes the zip.
+	#end
 	Method Close()
 		If Not _data Return
 		libc.free( _zip )
+		_files=Null
+		_sizes=Null
 		_data.Discard()
 		_data=Null
 	End
 	
+	#rem monkeydoc Checks if a file is contained in the zip.
+	#end
+	Method Contains:Bool( file:String )
+	
+		Return FindFile( file )<>-1
+	End
+	
+	#rem monkeydoc Extracts a file from the zip into a DataBuffer.
+	#end
+	Method ExtractData:DataBuffer( file:String )
+	
+		Local i:=FindFile( file )
+		If i=-1 Return Null
+
+		Local size:=_sizes[i]
+
+		Local buf:=New DataBuffer( size )
+		
+		If mz_zip_reader_extract_to_mem( _zip,i,buf.Data,size,0 ) Return buf
+		
+		buf.Discard()
+		
+		Return Null
+	End
+
+	#rem monkeydoc @hidden
+	#end	
 	Method FindFile:Int( file:String )
 
 		For Local i:=0 Until _files.Length
@@ -27,6 +60,8 @@ Class ZipFile
 		Return -1
 	End
 	
+	#rem monkeydoc @hidden
+	#end	
 	Method Extract:Bool( dir:String,prefix:String="" )
 	
 		If Not dir.EndsWith( "/" ) dir+="/"

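A small sketch (not part of this commit) of the new Contains and ExtractData methods, using the same archive and entry as the zipfiletest banana above:

	Local zip:=ZipFile.Open( "asset::blah.zip" )

	If zip.Contains( "geom/affinemat3.monkey2" )

		Local data:=zip.ExtractData( "geom/affinemat3.monkey2" )

		Print "Extracted "+data.Length+" bytes"

		data.Discard()
	Endif

	zip.Close()
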
+ 154 - 0
modules/theoraplayer/makefile.monkey2

@@ -0,0 +1,154 @@
+
+Namespace theoraplayer
+
+'***** ogg *****
+'
+#Import "native/ogg/src/bitwise.c"
+#Import "native/ogg/src/framing.c"
+
+'***** vorbis *****
+'
+#Import "native/vorbis/lib/analysis.c"
+'#Import "native/vorbis/lib/barkmel.c"
+#Import "native/vorbis/lib/bitrate.c"
+#Import "native/vorbis/lib/block.c"
+#Import "native/vorbis/lib/codebook.c"
+#Import "native/vorbis/lib/envelope.c"
+#Import "native/vorbis/lib/floor0.c"
+#Import "native/vorbis/lib/floor1.c"
+#Import "native/vorbis/lib/info.c"
+#Import "native/vorbis/lib/lookup.c"
+#Import "native/vorbis/lib/lpc.c"
+#Import "native/vorbis/lib/lsp.c"
+#Import "native/vorbis/lib/mapping0.c"
+#Import "native/vorbis/lib/mdct.c"
+#Import "native/vorbis/lib/psy.c"
+'#Import "native/vorbis/lib/psytune.c"
+#Import "native/vorbis/lib/registry.c"
+#Import "native/vorbis/lib/res0.c"
+#Import "native/vorbis/lib/sharedbook.c"
+#Import "native/vorbis/lib/smallft.c"
+#Import "native/vorbis/lib/synthesis.c"
+'#Import "native/vorbis/lib/tone.c"
+#Import "native/vorbis/lib/vorbisenc.c"
+#Import "native/vorbis/lib/vorbisfile.c"
+#Import "native/vorbis/lib/window.c"
+
+'***** theora *****
+'
+'#Import "native/theora/lib/analyze.c"
+#Import "native/theora/lib/apiwrapper.c"
+#Import "native/theora/lib/bitpack.c"
+#Import "native/theora/lib/collect.c"
+#Import "native/theora/lib/decapiwrapper.c"
+#Import "native/theora/lib/decinfo.c"
+#Import "native/theora/lib/decode.c"
+#Import "native/theora/lib/dequant.c"
+'#Import "native/theora/lib/encapiwrapper.c"
+'#Import "native/theora/lib/encfrag.c"
+'#Import "native/theora/lib/encinfo.c"
+'#Import "native/theora/lib/encode.c"
+#Import "native/theora/lib/encoder_disabled.c"
+#Import "native/theora/lib/enquant.c"
+#Import "native/theora/lib/fdct.c"
+#Import "native/theora/lib/fragment.c"
+#Import "native/theora/lib/huffdec.c"
+#Import "native/theora/lib/huffenc.c"
+#Import "native/theora/lib/idct.c"
+#Import "native/theora/lib/info.c"
+#Import "native/theora/lib/internal.c"
+#Import "native/theora/lib/mathops.c"
+'#Import "native/theora/lib/mcenc.c"
+#Import "native/theora/lib/quant.c"
+#Import "native/theora/lib/rate.c"
+#Import "native/theora/lib/state.c"
+#Import "native/theora/lib/tokenize.c"
+
+'#Import "native/theora/lib/x86/mmxencfrag.c
+'#Import "native/theora/lib/x86/mmxfdct.c"
+'#Import "native/theora/lib/x86/mmxfrag.c"
+'#Import "native/theora/lib/x86/mmxidct.c"
+'#Import "native/theora/lib/x86/mmxstate.c"
+'#Import "native/theora/lib/x86/sse2encfrag.c"
+'#Import "native/theora/lib/x86/sse2fdct.c"
+'#Import "native/theora/lib/x86/sse2idct.c"
+'#Import "native/theora/lib/x86/x86cpu.c"
+'#Import "native/theora/lib/x86/x86enc.c"
+'#Import "native/theora/lib/x86/x86enquant.c"
+'#Import "native/theora/lib/x86/x86state.c"
+
+'***** theoraplayer *****
+'
+#Import "native/theoraplayer/src/AudioInterface.cpp"
+#Import "native/theoraplayer/src/AudioInterfaceFactory.cpp"
+#Import "native/theoraplayer/src/AudioPacketQueue.cpp"
+#Import "native/theoraplayer/src/DataSource.cpp"
+#Import "native/theoraplayer/src/Exception.cpp"
+#Import "native/theoraplayer/src/FileDataSource.cpp"
+#Import "native/theoraplayer/src/FrameQueue.cpp"
+#Import "native/theoraplayer/src/Manager.cpp"
+#Import "native/theoraplayer/src/MemoryDataSource.cpp"
+#Import "native/theoraplayer/src/Mutex.cpp"
+#Import "native/theoraplayer/src/theoraplayer.cpp"
+#Import "native/theoraplayer/src/Thread.cpp"
+#Import "native/theoraplayer/src/Timer.cpp"
+#Import "native/theoraplayer/src/Utility.cpp"
+#Import "native/theoraplayer/src/VideoClip.cpp"
+#Import "native/theoraplayer/src/VideoFrame.cpp"
+#Import "native/theoraplayer/src/WorkerThread.cpp"
+
+#Import "native/theoraplayer/src/YUV/yuv_util.c"
+
+#Import "native/theoraplayer/src/formats/Theora/VideoClip_Theora.cpp"
+
+#Import "native/theoraplayer/src/YUV/C/yuv420_grey_c.c"
+#Import "native/theoraplayer/src/YUV/C/yuv420_rgb_c.c"
+#Import "native/theoraplayer/src/YUV/C/yuv420_yuv_c.c"
+
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_common.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_gcc.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_posix.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_win.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_from.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_from_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_jpeg.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_to_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_to_i420.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/cpu_id.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/mjpeg_decoder.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/mjpeg_validate.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/planar_functions.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_mips.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_any.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_common.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_gcc.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_mips.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_posix.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_win.cc"
+'#Import "native/theoraplayer/src/YUV/libyuv/src/row_x86.asm"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_any.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_argb_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_common.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_gcc.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_mips.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_posix.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_win.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/video_common.cc"
+'#Import "native/theoraplayer/src/YUV/libyuv/src/x86inc.asm"
+
+#Import "native/theoraplayer/src/YUV/libyuv/yuv_libyuv.c"

+ 8 - 0
modules/theoraplayer/module.json

@@ -0,0 +1,8 @@
+{
+	"module":"theoraplayer",
+	"about":"Minimal theora player",
+	"author":"Kresimir Spes",
+	"version":"1.0.0",
+	"support":"http://monkey2.monkey-x.com",
+	"depends":["openal","libc"]
+}

+ 27 - 0
modules/theoraplayer/native/LICENSE

@@ -0,0 +1,27 @@
+Copyright (c) Kresimir Spes ([email protected])
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1.	Redistributions of source code must retain the above copyright notice,
+	this list of conditions and the following disclaimer.
+
+2.	Redistributions in binary form must reproduce the above copyright notice,
+	this list of conditions and the following disclaimer in the documentation
+	and/or other materials provided with the distribution.
+
+3.	Neither the name of the copyright holder nor the names of its contributors
+	may be used to endorse or promote products derived from this software
+	without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.

+ 235 - 0
modules/theoraplayer/native/OpenAL_AudioInterface.cpp

@@ -0,0 +1,235 @@
+/// @file
+/// @version 2.0
+/// 
+/// @section LICENSE
+/// 
+/// This program is free software; you can redistribute it and/or modify it under
+/// the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
+
+#include <stdio.h>
+#include <string.h>
+
+#include "OpenAL_AudioInterface.h"
+
+ALCdevice* gDevice = 0;
+ALCcontext* gContext = 0;
+
+short float2short(float f)
+{
+	if (f > 1.0f)
+	{
+		f = 1.0f;
+	}
+	else if (f < -1.0f)
+	{
+		f = -1.0f;
+	}
+	return (short)(f * 32767);
+}
+
+OpenAL_AudioInterface::OpenAL_AudioInterface(theoraplayer::VideoClip* clip, int channelsCount, int frequency) :
+	theoraplayer::AudioInterface(clip, channelsCount, frequency), theoraplayer::Timer()
+{
+	this->sourceNumChannels = this->channelsCount;
+	if (this->channelsCount > 2)
+	{
+		// ignore audio with more than 2 channels, use only the stereo channels
+		this->channelsCount = 2;
+	}
+	this->maxBuffSize = frequency * this->channelsCount * 2;
+	this->buffSize = 0;
+	this->numProcessedSamples = 0;
+	this->currentTimer = 0;
+	this->tempBuffer = new short[this->maxBuffSize];
+	alGenSources(1, &this->source);
+	clip->setTimer(this);
+	this->numPlayedSamples = 0;
+}
+
+OpenAL_AudioInterface::~OpenAL_AudioInterface()
+{
+	if (this->tempBuffer != NULL)
+	{
+		delete[] this->tempBuffer;
+	}
+	if (this->source != 0)
+	{
+		alSourcei(this->source, AL_BUFFER, 0);
+		alDeleteSources(1, &this->source);
+	}
+	while (this->bufferQueue.size() > 0)
+	{
+		alDeleteBuffers(1, &this->bufferQueue.front().id);
+		this->bufferQueue.pop();
+	}
+}
+
+float OpenAL_AudioInterface::getQueuedAudioSize()
+{
+	return ((float)(this->numProcessedSamples - this->numPlayedSamples)) / this->frequency;
+}
+
+void OpenAL_AudioInterface::insertData(float* data, int samplesCount)
+{
+	float* tempData = NULL;
+	if (this->sourceNumChannels > 2)
+	{
+		tempData = new float[samplesCount * 2 / this->sourceNumChannels + 16]; // 16 padding just in case
+		int i = 0;
+		int n = 0;
+		for (n = 0, i = 0; i < samplesCount; i += this->sourceNumChannels, n += 2)
+		{
+			tempData[n] = data[i];
+			tempData[n + 1] = data[i + 1];
+		}
+		data = tempData;
+		samplesCount = n;
+	}
+	//printf("got %d bytes, %d buffers queued\n",samplesCount,(int)this->bufferQueue.size());
+	int state = 0;
+	OpenAL_Buffer buff;
+	ALuint format;
+	for (int i = 0; i < samplesCount; ++i)
+	{
+		if (this->buffSize < this->maxBuffSize)
+		{
+			this->tempBuffer[this->buffSize] = float2short(data[i]);
+			++this->buffSize;
+		}
+		if (this->buffSize == this->frequency * this->channelsCount / 10)
+		{
+			memset(&buff, 0, sizeof(OpenAL_Buffer));
+			alGenBuffers(1, &buff.id);
+			format = (this->channelsCount == 1) ? AL_FORMAT_MONO16 : AL_FORMAT_STEREO16;
+			alBufferData(buff.id, format, this->tempBuffer, this->buffSize * 2, this->frequency);
+			alSourceQueueBuffers(this->source, 1, &buff.id);
+			buff.samplesCount = this->buffSize / this->channelsCount;
+			this->numProcessedSamples += this->buffSize / this->channelsCount;
+			this->bufferQueue.push(buff);
+			this->buffSize = 0;
+			state = 0;
+			alGetSourcei(this->source, AL_SOURCE_STATE, &state);
+			if (state != AL_PLAYING)
+			{
+				//alSourcef(this->source,AL_PITCH,0.5); // debug
+				//alSourcef(this->source,AL_SAMPLE_OFFSET,(float) this->numProcessedSamples-mFreq/4);
+				alSourcePlay(this->source);
+			}
+		}
+	}
+	if (tempData != NULL)
+	{
+		delete[] tempData;
+	}
+}
+
+void OpenAL_AudioInterface::update(float timeDelta)
+{
+	int i = 0;
+	int nProcessed = 0;
+	OpenAL_Buffer buff;
+	// process played buffers
+	alGetSourcei(this->source, AL_BUFFERS_PROCESSED, &nProcessed);
+	for (i = 0; i < nProcessed; ++i)
+	{
+		buff = this->bufferQueue.front();
+		this->bufferQueue.pop();
+		this->numPlayedSamples += buff.samplesCount;
+		alSourceUnqueueBuffers(this->source, 1, &buff.id);
+		alDeleteBuffers(1, &buff.id);
+	}
+	if (nProcessed != 0)
+	{
+		// update offset
+		alGetSourcef(this->source, AL_SEC_OFFSET, &this->currentTimer);
+	}
+	// control playback and return time position
+	//alGetSourcei(this->source,AL_SOURCE_STATE,&state);
+	//if (state == AL_PLAYING)
+	this->currentTimer += timeDelta;
+	this->time = this->currentTimer + (float) this->numPlayedSamples / this->frequency;
+	float duration = this->clip->getDuration();
+	if (this->time > duration)
+	{
+		this->time = duration;
+	}
+}
+
+void OpenAL_AudioInterface::pause()
+{
+	alSourcePause(this->source);
+	Timer::pause();
+}
+
+void OpenAL_AudioInterface::play()
+{
+	alSourcePlay(this->source);
+	Timer::play();
+}
+
+void OpenAL_AudioInterface::seek(float time)
+{
+	OpenAL_Buffer buff;
+	alSourceStop(this->source);
+	while (!this->bufferQueue.empty())
+	{
+		buff = this->bufferQueue.front();
+		this->bufferQueue.pop();
+		alSourceUnqueueBuffers(this->source, 1, &buff.id);
+		alDeleteBuffers(1, &buff.id);
+	}
+	//		int nProcessed;
+	//		alGetSourcei(this->source,AL_BUFFERS_PROCESSED,&nProcessed);
+	//		if (nProcessed != 0)
+	//			nProcessed=nProcessed;
+	this->buffSize = 0;
+	this->currentTimer = 0;
+	this->numPlayedSamples = this->numProcessedSamples = (int)(time * this->frequency);
+	this->time = time;
+}
+
+OpenAL_AudioInterfaceFactory::OpenAL_AudioInterfaceFactory()
+{
+	return;
+	
+	// openal init is here used only to simplify samples for this plugin
+	// if you want to use this interface in your own program, you'll
+	// probably want to remove the openal init/destroy lines
+	gDevice = alcOpenDevice(NULL);
+	if (alcGetError(gDevice) != ALC_NO_ERROR)
+	{
+		return;
+	}
+	gContext = alcCreateContext(gDevice, NULL);
+	if (alcGetError(gDevice) != ALC_NO_ERROR)
+	{
+		alcCloseDevice(gDevice);
+		gDevice = NULL;
+		return;
+	}
+	alcMakeContextCurrent(gContext);
+	if (alcGetError(gDevice) != ALC_NO_ERROR)
+	{
+		alcDestroyContext(gContext);
+		gContext = NULL;
+		alcCloseDevice(gDevice);
+		gDevice = NULL;
+	}
+}
+
+OpenAL_AudioInterfaceFactory::~OpenAL_AudioInterfaceFactory()
+{
+	return;
+	
+	if (gDevice != NULL)
+	{
+		alcMakeContextCurrent(NULL);
+		alcDestroyContext(gContext);
+		alcCloseDevice(gDevice);
+	}
+}
+
+OpenAL_AudioInterface* OpenAL_AudioInterfaceFactory::createInstance(theoraplayer::VideoClip* clip, int channelsCount, int frequency)
+{
+	return new OpenAL_AudioInterface(clip, channelsCount, frequency);
+}

+ 77 - 0
modules/theoraplayer/native/OpenAL_AudioInterface.h

@@ -0,0 +1,77 @@
+/// @file
+/// @version 2.0
+/// 
+/// @section LICENSE
+/// 
+/// This program is free software; you can redistribute it and/or modify it under
+/// the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
+/// 
+/// @section DESCRIPTION
+/// 
+/// Defines an audio interface for OpenAL.
+
+#ifndef THEORAPLAYER_DEMOS_OPENAL_AUDIO_INTERFACE_H
+#define THEORAPLAYER_DEMOS_OPENAL_AUDIO_INTERFACE_H
+
+#ifndef __APPLE__
+#include <AL/al.h>
+#include <AL/alc.h>
+#else
+#include <OpenAL/al.h>
+#include <OpenAL/alc.h>
+#endif
+#include <queue>
+
+#include <theoraplayer/AudioInterface.h>
+#include <theoraplayer/AudioInterfaceFactory.h>
+#include <theoraplayer/Timer.h>
+#include <theoraplayer/VideoClip.h>
+
+class OpenAL_AudioInterface : public theoraplayer::AudioInterface, theoraplayer::Timer
+{	
+public:
+	OpenAL_AudioInterface(theoraplayer::VideoClip* clip, int channelsCount, int frequency);
+	~OpenAL_AudioInterface();
+
+	//! queued audio buffers, expressed in seconds
+	float getQueuedAudioSize();
+
+	void insertData(float* data, int samplesCount);	
+
+	void update(float timeDelta);
+
+	void pause();
+	void play();
+	void seek(float time);
+
+private:
+	int sourceNumChannels;
+	int maxBuffSize;
+	int buffSize;
+	short* tempBuffer;
+	float currentTimer;
+
+	struct OpenAL_Buffer
+	{
+		ALuint id;
+		int samplesCount;
+	};
+	std::queue<OpenAL_Buffer> bufferQueue;
+
+	ALuint source;
+	int numProcessedSamples;
+	int numPlayedSamples;
+
+};
+
+class OpenAL_AudioInterfaceFactory : public theoraplayer::AudioInterfaceFactory
+{
+public:
+	OpenAL_AudioInterfaceFactory();
+	~OpenAL_AudioInterfaceFactory();
+
+	OpenAL_AudioInterface* createInstance(theoraplayer::VideoClip* clip, int channelsCount, int frequency);
+
+};
+
+#endif

+ 33 - 0
modules/theoraplayer/native/monkey2_glue.cpp

@@ -0,0 +1,33 @@
+
+#include "monkey2_glue.h"
+
+#include "theoraplayer.h"
+#include "Manager.h"
+#include "MemoryDataSource.h"
+
+theoraplayer::Manager *bb_theoraplayer_getManager(){
+
+	if( !theoraplayer::manager ) theoraplayer::init();
+	
+	return theoraplayer::manager;
+}
+
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const char *filename ){
+
+	return self->createVideoClip( filename );
+}
+
+//FIXME - leaks MemoryDataSource!
+//
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const void *data,int length ){
+
+	theoraplayer::MemoryDataSource *src=new theoraplayer::MemoryDataSource( (unsigned char*)data,length,"Theora" );
+	
+	return self->createVideoClip( src );
+}
+
+theoraplayer::MemoryDataSource *bb_theoraplayer_createMemoryDataSource( const void *data,int length,const char *formatName ){
+
+	return new theoraplayer::MemoryDataSource( (unsigned char*)data,length,formatName );
+}
+

+ 21 - 0
modules/theoraplayer/native/monkey2_glue.h

@@ -0,0 +1,21 @@
+
+#ifndef BB_THEORAPLAYER_GLUE_H
+#define BB_THEORAPLAYER_GLUE_H
+
+#include <bbmonkey.h>
+
+namespace theoraplayer{
+	class Manager;
+	class VideoClip;
+	class MemoryDataSource;
+}
+
+theoraplayer::Manager *bb_theoraplayer_getManager();
+
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const char *filename );
+
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const void *data,int length );
+
+theoraplayer::MemoryDataSource *bb_theoraplayer_createMemoryDataSource( const void *data,int length,const char *formatName );
+
+#endif

+ 28 - 0
modules/theoraplayer/native/ogg/COPYING

@@ -0,0 +1,28 @@
+Copyright (c) 2002, Xiph.org Foundation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 11 - 0
modules/theoraplayer/native/ogg/include/ogg/config_types.h

@@ -0,0 +1,11 @@
+#ifndef __CONFIG_TYPES_H__
+#define __CONFIG_TYPES_H__
+
+/* these are filled in by configure */
+typedef short ogg_int16_t;
+typedef unsigned short ogg_uint16_t;
+typedef int ogg_int32_t;
+typedef unsigned int ogg_uint32_t;
+typedef long long ogg_int64_t;
+
+#endif

+ 25 - 0
modules/theoraplayer/native/ogg/include/ogg/config_types.h.in

@@ -0,0 +1,25 @@
+#ifndef __CONFIG_TYPES_H__
+#define __CONFIG_TYPES_H__
+
+/* these are filled in by configure */
+#define INCLUDE_INTTYPES_H @INCLUDE_INTTYPES_H@
+#define INCLUDE_STDINT_H @INCLUDE_STDINT_H@
+#define INCLUDE_SYS_TYPES_H @INCLUDE_SYS_TYPES_H@
+
+#if INCLUDE_INTTYPES_H
+#  include <inttypes.h>
+#endif
+#if INCLUDE_STDINT_H
+#  include <stdint.h>
+#endif
+#if INCLUDE_SYS_TYPES_H
+#  include <sys/types.h>
+#endif
+
+typedef @SIZE16@ ogg_int16_t;
+typedef @USIZE16@ ogg_uint16_t;
+typedef @SIZE32@ ogg_int32_t;
+typedef @USIZE32@ ogg_uint32_t;
+typedef @SIZE64@ ogg_int64_t;
+
+#endif

+ 210 - 0
modules/theoraplayer/native/ogg/include/ogg/ogg.h

@@ -0,0 +1,210 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ function: toplevel libogg include
+ last mod: $Id: ogg.h 18044 2011-08-01 17:55:20Z gmaxwell $
+
+ ********************************************************************/
+#ifndef _OGG_H
+#define _OGG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+#include <ogg/os_types.h>
+
+typedef struct {
+  void *iov_base;
+  size_t iov_len;
+} ogg_iovec_t;
+
+typedef struct {
+  long endbyte;
+  int  endbit;
+
+  unsigned char *buffer;
+  unsigned char *ptr;
+  long storage;
+} oggpack_buffer;
+
+/* ogg_page is used to encapsulate the data in one Ogg bitstream page *****/
+
+typedef struct {
+  unsigned char *header;
+  long header_len;
+  unsigned char *body;
+  long body_len;
+} ogg_page;
+
+/* ogg_stream_state contains the current encode/decode state of a logical
+   Ogg bitstream **********************************************************/
+
+typedef struct {
+  unsigned char   *body_data;    /* bytes from packet bodies */
+  long    body_storage;          /* storage elements allocated */
+  long    body_fill;             /* elements stored; fill mark */
+  long    body_returned;         /* elements of fill returned */
+
+
+  int     *lacing_vals;      /* The values that will go to the segment table */
+  ogg_int64_t *granule_vals; /* granulepos values for headers. Not compact
+                                this way, but it is simple coupled to the
+                                lacing fifo */
+  long    lacing_storage;
+  long    lacing_fill;
+  long    lacing_packet;
+  long    lacing_returned;
+
+  unsigned char    header[282];      /* working space for header encode */
+  int              header_fill;
+
+  int     e_o_s;          /* set when we have buffered the last packet in the
+                             logical bitstream */
+  int     b_o_s;          /* set after we've written the initial page
+                             of a logical bitstream */
+  long    serialno;
+  long    pageno;
+  ogg_int64_t  packetno;  /* sequence number for decode; the framing
+                             knows where there's a hole in the data,
+                             but we need coupling so that the codec
+                             (which is in a separate abstraction
+                             layer) also knows about the gap */
+  ogg_int64_t   granulepos;
+
+} ogg_stream_state;
+
+/* ogg_packet is used to encapsulate the data and metadata belonging
+   to a single raw Ogg/Vorbis packet *************************************/
+
+typedef struct {
+  unsigned char *packet;
+  long  bytes;
+  long  b_o_s;
+  long  e_o_s;
+
+  ogg_int64_t  granulepos;
+
+  ogg_int64_t  packetno;     /* sequence number for decode; the framing
+                                knows where there's a hole in the data,
+                                but we need coupling so that the codec
+                                (which is in a separate abstraction
+                                layer) also knows about the gap */
+} ogg_packet;
+
+typedef struct {
+  unsigned char *data;
+  int storage;
+  int fill;
+  int returned;
+
+  int unsynced;
+  int headerbytes;
+  int bodybytes;
+} ogg_sync_state;
+
+/* Ogg BITSTREAM PRIMITIVES: bitstream ************************/
+
+extern void  oggpack_writeinit(oggpack_buffer *b);
+extern int   oggpack_writecheck(oggpack_buffer *b);
+extern void  oggpack_writetrunc(oggpack_buffer *b,long bits);
+extern void  oggpack_writealign(oggpack_buffer *b);
+extern void  oggpack_writecopy(oggpack_buffer *b,void *source,long bits);
+extern void  oggpack_reset(oggpack_buffer *b);
+extern void  oggpack_writeclear(oggpack_buffer *b);
+extern void  oggpack_readinit(oggpack_buffer *b,unsigned char *buf,int bytes);
+extern void  oggpack_write(oggpack_buffer *b,unsigned long value,int bits);
+extern long  oggpack_look(oggpack_buffer *b,int bits);
+extern long  oggpack_look1(oggpack_buffer *b);
+extern void  oggpack_adv(oggpack_buffer *b,int bits);
+extern void  oggpack_adv1(oggpack_buffer *b);
+extern long  oggpack_read(oggpack_buffer *b,int bits);
+extern long  oggpack_read1(oggpack_buffer *b);
+extern long  oggpack_bytes(oggpack_buffer *b);
+extern long  oggpack_bits(oggpack_buffer *b);
+extern unsigned char *oggpack_get_buffer(oggpack_buffer *b);
+
+extern void  oggpackB_writeinit(oggpack_buffer *b);
+extern int   oggpackB_writecheck(oggpack_buffer *b);
+extern void  oggpackB_writetrunc(oggpack_buffer *b,long bits);
+extern void  oggpackB_writealign(oggpack_buffer *b);
+extern void  oggpackB_writecopy(oggpack_buffer *b,void *source,long bits);
+extern void  oggpackB_reset(oggpack_buffer *b);
+extern void  oggpackB_writeclear(oggpack_buffer *b);
+extern void  oggpackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes);
+extern void  oggpackB_write(oggpack_buffer *b,unsigned long value,int bits);
+extern long  oggpackB_look(oggpack_buffer *b,int bits);
+extern long  oggpackB_look1(oggpack_buffer *b);
+extern void  oggpackB_adv(oggpack_buffer *b,int bits);
+extern void  oggpackB_adv1(oggpack_buffer *b);
+extern long  oggpackB_read(oggpack_buffer *b,int bits);
+extern long  oggpackB_read1(oggpack_buffer *b);
+extern long  oggpackB_bytes(oggpack_buffer *b);
+extern long  oggpackB_bits(oggpack_buffer *b);
+extern unsigned char *oggpackB_get_buffer(oggpack_buffer *b);
+
+/* Ogg BITSTREAM PRIMITIVES: encoding **************************/
+
+extern int      ogg_stream_packetin(ogg_stream_state *os, ogg_packet *op);
+extern int      ogg_stream_iovecin(ogg_stream_state *os, ogg_iovec_t *iov,
+                                   int count, long e_o_s, ogg_int64_t granulepos);
+extern int      ogg_stream_pageout(ogg_stream_state *os, ogg_page *og);
+extern int      ogg_stream_pageout_fill(ogg_stream_state *os, ogg_page *og, int nfill);
+extern int      ogg_stream_flush(ogg_stream_state *os, ogg_page *og);
+extern int      ogg_stream_flush_fill(ogg_stream_state *os, ogg_page *og, int nfill);
+
+/* Ogg BITSTREAM PRIMITIVES: decoding **************************/
+
+extern int      ogg_sync_init(ogg_sync_state *oy);
+extern int      ogg_sync_clear(ogg_sync_state *oy);
+extern int      ogg_sync_reset(ogg_sync_state *oy);
+extern int      ogg_sync_destroy(ogg_sync_state *oy);
+extern int      ogg_sync_check(ogg_sync_state *oy);
+
+extern char    *ogg_sync_buffer(ogg_sync_state *oy, long size);
+extern int      ogg_sync_wrote(ogg_sync_state *oy, long bytes);
+extern long     ogg_sync_pageseek(ogg_sync_state *oy,ogg_page *og);
+extern int      ogg_sync_pageout(ogg_sync_state *oy, ogg_page *og);
+extern int      ogg_stream_pagein(ogg_stream_state *os, ogg_page *og);
+extern int      ogg_stream_packetout(ogg_stream_state *os,ogg_packet *op);
+extern int      ogg_stream_packetpeek(ogg_stream_state *os,ogg_packet *op);
+
+/* Ogg BITSTREAM PRIMITIVES: general ***************************/
+
+extern int      ogg_stream_init(ogg_stream_state *os,int serialno);
+extern int      ogg_stream_clear(ogg_stream_state *os);
+extern int      ogg_stream_reset(ogg_stream_state *os);
+extern int      ogg_stream_reset_serialno(ogg_stream_state *os,int serialno);
+extern int      ogg_stream_destroy(ogg_stream_state *os);
+extern int      ogg_stream_check(ogg_stream_state *os);
+extern int      ogg_stream_eos(ogg_stream_state *os);
+
+extern void     ogg_page_checksum_set(ogg_page *og);
+
+extern int      ogg_page_version(const ogg_page *og);
+extern int      ogg_page_continued(const ogg_page *og);
+extern int      ogg_page_bos(const ogg_page *og);
+extern int      ogg_page_eos(const ogg_page *og);
+extern ogg_int64_t  ogg_page_granulepos(const ogg_page *og);
+extern int      ogg_page_serialno(const ogg_page *og);
+extern long     ogg_page_pageno(const ogg_page *og);
+extern int      ogg_page_packets(const ogg_page *og);
+
+extern void     ogg_packet_clear(ogg_packet *op);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* _OGG_H */
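
For reference, the decode-side API declared above (ogg_sync_* to assemble pages from raw
bytes, ogg_stream_* to turn pages back into packets) is typically driven as in the
following minimal sketch. This is illustrative only and not part of the imported libogg
sources; it reads from stdin purely for demonstration and latches onto the first logical
stream it sees, which a real player would generalise.

/* Minimal decode loop sketch: bytes in via ogg_sync_*, pages out,
   pages routed to a logical stream, raw packets out of that.
   Illustrative only -- not part of the imported sources. */
#include <stdio.h>
#include <ogg/ogg.h>

int main(void){
  ogg_sync_state   oy;
  ogg_stream_state os;
  ogg_page         og;
  ogg_packet       op;
  int              have_stream=0;

  ogg_sync_init(&oy);
  for(;;){
    char  *buf=ogg_sync_buffer(&oy,4096);        /* expose 4096 bytes of buffer space */
    size_t n  =fread(buf,1,4096,stdin);          /* the application supplies the bytes */
    if(n==0) break;
    ogg_sync_wrote(&oy,(long)n);                 /* commit what was written            */
    while(ogg_sync_pageout(&oy,&og)==1){         /* 1 == a whole page is ready         */
      if(!have_stream){                          /* latch onto the first serialno      */
        ogg_stream_init(&os,ogg_page_serialno(&og));
        have_stream=1;
      }
      ogg_stream_pagein(&os,&og);                /* hand the page to its stream        */
      while(ogg_stream_packetout(&os,&op)==1){
        /* op.packet / op.bytes now hold one raw packet for the codec layer */
      }
    }
  }
  if(have_stream) ogg_stream_clear(&os);
  ogg_sync_clear(&oy);
  return 0;
}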

+ 147 - 0
modules/theoraplayer/native/ogg/include/ogg/os_types.h

@@ -0,0 +1,147 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2002             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ function: #ifdef jail to whip a few platforms into the UNIX ideal.
+ last mod: $Id: os_types.h 17712 2010-12-03 17:10:02Z xiphmont $
+
+ ********************************************************************/
+#ifndef _OS_TYPES_H
+#define _OS_TYPES_H
+
+/* make it easy on the folks that want to compile the libs with a
+   different malloc than stdlib */
+#define _ogg_malloc  malloc
+#define _ogg_calloc  calloc
+#define _ogg_realloc realloc
+#define _ogg_free    free
+
+#if defined(_WIN32) 
+
+#  if defined(__CYGWIN__)
+#    include <stdint.h>
+     typedef int16_t ogg_int16_t;
+     typedef uint16_t ogg_uint16_t;
+     typedef int32_t ogg_int32_t;
+     typedef uint32_t ogg_uint32_t;
+     typedef int64_t ogg_int64_t;
+     typedef uint64_t ogg_uint64_t;
+#  elif defined(__MINGW32__)
+#    include <sys/types.h>
+     typedef short ogg_int16_t;
+     typedef unsigned short ogg_uint16_t;
+     typedef int ogg_int32_t;
+     typedef unsigned int ogg_uint32_t;
+     typedef long long ogg_int64_t;
+     typedef unsigned long long ogg_uint64_t;
+#  elif defined(__MWERKS__)
+     typedef long long ogg_int64_t;
+     typedef int ogg_int32_t;
+     typedef unsigned int ogg_uint32_t;
+     typedef short ogg_int16_t;
+     typedef unsigned short ogg_uint16_t;
+#  else
+     /* MSVC/Borland */
+     typedef __int64 ogg_int64_t;
+     typedef __int32 ogg_int32_t;
+     typedef unsigned __int32 ogg_uint32_t;
+     typedef __int16 ogg_int16_t;
+     typedef unsigned __int16 ogg_uint16_t;
+#  endif
+
+#elif defined(__MACOS__)
+
+#  include <sys/types.h>
+   typedef SInt16 ogg_int16_t;
+   typedef UInt16 ogg_uint16_t;
+   typedef SInt32 ogg_int32_t;
+   typedef UInt32 ogg_uint32_t;
+   typedef SInt64 ogg_int64_t;
+
+#elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */
+
+#  include <inttypes.h>
+   typedef int16_t ogg_int16_t;
+   typedef uint16_t ogg_uint16_t;
+   typedef int32_t ogg_int32_t;
+   typedef uint32_t ogg_uint32_t;
+   typedef int64_t ogg_int64_t;
+
+#elif defined(__HAIKU__)
+
+  /* Haiku */
+#  include <sys/types.h>
+   typedef short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long ogg_int64_t;
+
+#elif defined(__BEOS__)
+
+   /* Be */
+#  include <inttypes.h>
+   typedef int16_t ogg_int16_t;
+   typedef uint16_t ogg_uint16_t;
+   typedef int32_t ogg_int32_t;
+   typedef uint32_t ogg_uint32_t;
+   typedef int64_t ogg_int64_t;
+
+#elif defined (__EMX__)
+
+   /* OS/2 GCC */
+   typedef short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long ogg_int64_t;
+
+#elif defined (DJGPP)
+
+   /* DJGPP */
+   typedef short ogg_int16_t;
+   typedef int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long ogg_int64_t;
+
+#elif defined(R5900)
+
+   /* PS2 EE */
+   typedef long ogg_int64_t;
+   typedef int ogg_int32_t;
+   typedef unsigned ogg_uint32_t;
+   typedef short ogg_int16_t;
+
+#elif defined(__SYMBIAN32__)
+
+   /* Symbian GCC */
+   typedef signed short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef signed int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long int ogg_int64_t;
+
+#elif defined(__TMS320C6X__)
+
+   /* TI C64x compiler */
+   typedef signed short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef signed int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long int ogg_int64_t;
+
+#else
+
+#  include <ogg/config_types.h>
+
+#endif
+
+#endif  /* _OS_TYPES_H */
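
The whole point of the #ifdef jail above is that the ogg_int*_t typedefs end up with
fixed widths on every platform. A quick compile-time sanity check of that intent could
look like the sketch below; it is illustrative only (not part of the imported sources)
and uses the classic negative-array-size trick so it stays valid C89 like the rest of
the code.

/* Illustrative only: fails to compile if the platform branch above
   picked types of unexpected widths. */
#include <ogg/os_types.h>

typedef char ogg_int16_is_2_bytes[sizeof(ogg_int16_t)==2 ? 1 : -1];
typedef char ogg_int32_is_4_bytes[sizeof(ogg_int32_t)==4 ? 1 : -1];
typedef char ogg_int64_is_8_bytes[sizeof(ogg_int64_t)==8 ? 1 : -1];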

+ 15 - 0
modules/theoraplayer/native/ogg/libtheoraplayer-readme.txt

@@ -0,0 +1,15 @@
+libogg's source code is provided here in a minimalist distribution format,
+with all source files not needed for compiling libtheoraplayer removed.
+
+- The project files were modified to fit libtheoraplayer's binary output
+  folder structure.
+- Some project files missing from the original source distribution were added
+  to support compiling libtheoraplayer on those platforms.
+- Some code may have been changed to address certain compiler/platform-specific
+  problems; such changes are indicated in the source code.
+
+libogg is owned and maintained by the Xiph.Org Foundation; this distribution is
+included here only for convenience and easier compilation of libtheoraplayer.
+
+If you want to use libogg outside of libtheoraplayer, you are encouraged to use
+the original source distribution from Xiph: http://xiph.org/

+ 857 - 0
modules/theoraplayer/native/ogg/src/bitwise.c

@@ -0,0 +1,857 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE Ogg CONTAINER SOURCE CODE.              *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitwise.c 18051 2011-08-04 17:56:39Z giles $
+
+ ********************************************************************/
+
+/* We're 'LSb' endian; if we write a word but read individual bits,
+   then we'll read the lsb first */
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <ogg/ogg.h>
+
+#define BUFFER_INCREMENT 256
+
+static const unsigned long mask[]=
+{0x00000000,0x00000001,0x00000003,0x00000007,0x0000000f,
+ 0x0000001f,0x0000003f,0x0000007f,0x000000ff,0x000001ff,
+ 0x000003ff,0x000007ff,0x00000fff,0x00001fff,0x00003fff,
+ 0x00007fff,0x0000ffff,0x0001ffff,0x0003ffff,0x0007ffff,
+ 0x000fffff,0x001fffff,0x003fffff,0x007fffff,0x00ffffff,
+ 0x01ffffff,0x03ffffff,0x07ffffff,0x0fffffff,0x1fffffff,
+ 0x3fffffff,0x7fffffff,0xffffffff };
+
+static const unsigned int mask8B[]=
+{0x00,0x80,0xc0,0xe0,0xf0,0xf8,0xfc,0xfe,0xff};
+
+void oggpack_writeinit(oggpack_buffer *b){
+  memset(b,0,sizeof(*b));
+  b->ptr=b->buffer=_ogg_malloc(BUFFER_INCREMENT);
+  b->buffer[0]='\0';
+  b->storage=BUFFER_INCREMENT;
+}
+
+void oggpackB_writeinit(oggpack_buffer *b){
+  oggpack_writeinit(b);
+}
+
+int oggpack_writecheck(oggpack_buffer *b){
+  if(!b->ptr || !b->storage)return -1;
+  return 0;
+}
+
+int oggpackB_writecheck(oggpack_buffer *b){
+  return oggpack_writecheck(b);
+}
+
+void oggpack_writetrunc(oggpack_buffer *b,long bits){
+  long bytes=bits>>3;
+  if(b->ptr){
+    bits-=bytes*8;
+    b->ptr=b->buffer+bytes;
+    b->endbit=bits;
+    b->endbyte=bytes;
+    *b->ptr&=mask[bits];
+  }
+}
+
+void oggpackB_writetrunc(oggpack_buffer *b,long bits){
+  long bytes=bits>>3;
+  if(b->ptr){
+    bits-=bytes*8;
+    b->ptr=b->buffer+bytes;
+    b->endbit=bits;
+    b->endbyte=bytes;
+    *b->ptr&=mask8B[bits];
+  }
+}
+
+/* Takes only up to 32 bits. */
+void oggpack_write(oggpack_buffer *b,unsigned long value,int bits){
+  if(bits<0 || bits>32) goto err;
+  if(b->endbyte>=b->storage-4){
+    void *ret;
+    if(!b->ptr)return;
+    if(b->storage>LONG_MAX-BUFFER_INCREMENT) goto err;
+    ret=_ogg_realloc(b->buffer,b->storage+BUFFER_INCREMENT);
+    if(!ret) goto err;
+    b->buffer=ret;
+    b->storage+=BUFFER_INCREMENT;
+    b->ptr=b->buffer+b->endbyte;
+  }
+
+  value&=mask[bits];
+  bits+=b->endbit;
+
+  b->ptr[0]|=value<<b->endbit;
+
+  if(bits>=8){
+    b->ptr[1]=(unsigned char)(value>>(8-b->endbit));
+    if(bits>=16){
+      b->ptr[2]=(unsigned char)(value>>(16-b->endbit));
+      if(bits>=24){
+        b->ptr[3]=(unsigned char)(value>>(24-b->endbit));
+        if(bits>=32){
+          if(b->endbit)
+            b->ptr[4]=(unsigned char)(value>>(32-b->endbit));
+          else
+            b->ptr[4]=0;
+        }
+      }
+    }
+  }
+
+  b->endbyte+=bits/8;
+  b->ptr+=bits/8;
+  b->endbit=bits&7;
+  return;
+ err:
+  oggpack_writeclear(b);
+}
+
+/* Takes only up to 32 bits. */
+void oggpackB_write(oggpack_buffer *b,unsigned long value,int bits){
+  if(bits<0 || bits>32) goto err;
+  if(b->endbyte>=b->storage-4){
+    void *ret;
+    if(!b->ptr)return;
+    if(b->storage>LONG_MAX-BUFFER_INCREMENT) goto err;
+    ret=_ogg_realloc(b->buffer,b->storage+BUFFER_INCREMENT);
+    if(!ret) goto err;
+    b->buffer=ret;
+    b->storage+=BUFFER_INCREMENT;
+    b->ptr=b->buffer+b->endbyte;
+  }
+
+  value=(value&mask[bits])<<(32-bits);
+  bits+=b->endbit;
+
+  b->ptr[0]|=value>>(24+b->endbit);
+
+  if(bits>=8){
+    b->ptr[1]=(unsigned char)(value>>(16+b->endbit));
+    if(bits>=16){
+      b->ptr[2]=(unsigned char)(value>>(8+b->endbit));
+      if(bits>=24){
+        b->ptr[3]=(unsigned char)(value>>(b->endbit));
+        if(bits>=32){
+          if(b->endbit)
+            b->ptr[4]=(unsigned char)(value<<(8-b->endbit));
+          else
+            b->ptr[4]=0;
+        }
+      }
+    }
+  }
+
+  b->endbyte+=bits/8;
+  b->ptr+=bits/8;
+  b->endbit=bits&7;
+  return;
+ err:
+  oggpack_writeclear(b);
+}
+
+void oggpack_writealign(oggpack_buffer *b){
+  int bits=8-b->endbit;
+  if(bits<8)
+    oggpack_write(b,0,bits);
+}
+
+void oggpackB_writealign(oggpack_buffer *b){
+  int bits=8-b->endbit;
+  if(bits<8)
+    oggpackB_write(b,0,bits);
+}
+
+static void oggpack_writecopy_helper(oggpack_buffer *b,
+                                     void *source,
+                                     long bits,
+                                     void (*w)(oggpack_buffer *,
+                                               unsigned long,
+                                               int),
+                                     int msb){
+  unsigned char *ptr=(unsigned char *)source;
+
+  long bytes=bits/8;
+  bits-=bytes*8;
+
+  if(b->endbit){
+    int i;
+    /* unaligned copy.  Do it the hard way. */
+    for(i=0;i<bytes;i++)
+      w(b,(unsigned long)(ptr[i]),8);
+  }else{
+    /* aligned block copy */
+    if(b->endbyte+bytes+1>=b->storage){
+      void *ret;
+      if(!b->ptr) goto err;
+      if(b->endbyte+bytes+BUFFER_INCREMENT>b->storage) goto err;
+      b->storage=b->endbyte+bytes+BUFFER_INCREMENT;
+      ret=_ogg_realloc(b->buffer,b->storage);
+      if(!ret) goto err;
+      b->buffer=ret;
+      b->ptr=b->buffer+b->endbyte;
+    }
+
+    memmove(b->ptr,source,bytes);
+    b->ptr+=bytes;
+    b->endbyte+=bytes;
+    *b->ptr=0;
+
+  }
+  if(bits){
+    if(msb)
+      w(b,(unsigned long)(ptr[bytes]>>(8-bits)),bits);
+    else
+      w(b,(unsigned long)(ptr[bytes]),bits);
+  }
+  return;
+ err:
+  oggpack_writeclear(b);
+}
+
+void oggpack_writecopy(oggpack_buffer *b,void *source,long bits){
+  oggpack_writecopy_helper(b,source,bits,oggpack_write,0);
+}
+
+void oggpackB_writecopy(oggpack_buffer *b,void *source,long bits){
+  oggpack_writecopy_helper(b,source,bits,oggpackB_write,1);
+}
+
+void oggpack_reset(oggpack_buffer *b){
+  if(!b->ptr)return;
+  b->ptr=b->buffer;
+  b->buffer[0]=0;
+  b->endbit=b->endbyte=0;
+}
+
+void oggpackB_reset(oggpack_buffer *b){
+  oggpack_reset(b);
+}
+
+void oggpack_writeclear(oggpack_buffer *b){
+  if(b->buffer)_ogg_free(b->buffer);
+  memset(b,0,sizeof(*b));
+}
+
+void oggpackB_writeclear(oggpack_buffer *b){
+  oggpack_writeclear(b);
+}
+
+void oggpack_readinit(oggpack_buffer *b,unsigned char *buf,int bytes){
+  memset(b,0,sizeof(*b));
+  b->buffer=b->ptr=buf;
+  b->storage=bytes;
+}
+
+void oggpackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes){
+  oggpack_readinit(b,buf,bytes);
+}
+
+/* Read in bits without advancing the bitptr; bits <= 32 */
+long oggpack_look(oggpack_buffer *b,int bits){
+  unsigned long ret;
+  unsigned long m;
+
+  if(bits<0 || bits>32) return -1;
+  m=mask[bits];
+  bits+=b->endbit;
+
+  if(b->endbyte >= b->storage-4){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) return -1;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]>>b->endbit;
+  if(bits>8){
+    ret|=b->ptr[1]<<(8-b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(16-b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(24-b->endbit);
+        if(bits>32 && b->endbit)
+          ret|=b->ptr[4]<<(32-b->endbit);
+      }
+    }
+  }
+  return(m&ret);
+}
+
+/* Read in bits without advancing the bitptr; bits <= 32 */
+long oggpackB_look(oggpack_buffer *b,int bits){
+  unsigned long ret;
+  int m=32-bits;
+
+  if(m<0 || m>32) return -1;
+  bits+=b->endbit;
+
+  if(b->endbyte >= b->storage-4){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) return -1;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]<<(24+b->endbit);
+  if(bits>8){
+    ret|=b->ptr[1]<<(16+b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(8+b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(b->endbit);
+        if(bits>32 && b->endbit)
+          ret|=b->ptr[4]>>(8-b->endbit);
+      }
+    }
+  }
+  return ((ret&0xffffffff)>>(m>>1))>>((m+1)>>1);
+}
+
+long oggpack_look1(oggpack_buffer *b){
+  if(b->endbyte>=b->storage)return(-1);
+  return((b->ptr[0]>>b->endbit)&1);
+}
+
+long oggpackB_look1(oggpack_buffer *b){
+  if(b->endbyte>=b->storage)return(-1);
+  return((b->ptr[0]>>(7-b->endbit))&1);
+}
+
+void oggpack_adv(oggpack_buffer *b,int bits){
+  bits+=b->endbit;
+
+  if(b->endbyte > b->storage-((bits+7)>>3)) goto overflow;
+
+  b->ptr+=bits/8;
+  b->endbyte+=bits/8;
+  b->endbit=bits&7;
+  return;
+
+ overflow:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+}
+
+void oggpackB_adv(oggpack_buffer *b,int bits){
+  oggpack_adv(b,bits);
+}
+
+void oggpack_adv1(oggpack_buffer *b){
+  if(++(b->endbit)>7){
+    b->endbit=0;
+    b->ptr++;
+    b->endbyte++;
+  }
+}
+
+void oggpackB_adv1(oggpack_buffer *b){
+  oggpack_adv1(b);
+}
+
+/* bits <= 32 */
+long oggpack_read(oggpack_buffer *b,int bits){
+  long ret;
+  unsigned long m;
+
+  if(bits<0 || bits>32) goto err;
+  m=mask[bits];
+  bits+=b->endbit;
+
+  if(b->endbyte >= b->storage-4){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) goto overflow;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]>>b->endbit;
+  if(bits>8){
+    ret|=b->ptr[1]<<(8-b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(16-b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(24-b->endbit);
+        if(bits>32 && b->endbit){
+          ret|=b->ptr[4]<<(32-b->endbit);
+        }
+      }
+    }
+  }
+  ret&=m;
+  b->ptr+=bits/8;
+  b->endbyte+=bits/8;
+  b->endbit=bits&7;
+  return ret;
+
+ overflow:
+ err:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+/* bits <= 32 */
+long oggpackB_read(oggpack_buffer *b,int bits){
+  long ret;
+  long m=32-bits;
+
+  if(m<0 || m>32) goto err;
+  bits+=b->endbit;
+
+  if(b->endbyte+4>=b->storage){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) goto overflow;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]<<(24+b->endbit);
+  if(bits>8){
+    ret|=b->ptr[1]<<(16+b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(8+b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(b->endbit);
+        if(bits>32 && b->endbit)
+          ret|=b->ptr[4]>>(8-b->endbit);
+      }
+    }
+  }
+  ret=((ret&0xffffffffUL)>>(m>>1))>>((m+1)>>1);
+
+  b->ptr+=bits/8;
+  b->endbyte+=bits/8;
+  b->endbit=bits&7;
+  return ret;
+
+ overflow:
+ err:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+long oggpack_read1(oggpack_buffer *b){
+  long ret;
+
+  if(b->endbyte >= b->storage) goto overflow;
+  ret=(b->ptr[0]>>b->endbit)&1;
+
+  b->endbit++;
+  if(b->endbit>7){
+    b->endbit=0;
+    b->ptr++;
+    b->endbyte++;
+  }
+  return ret;
+
+ overflow:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+long oggpackB_read1(oggpack_buffer *b){
+  long ret;
+
+  if(b->endbyte >= b->storage) goto overflow;
+  ret=(b->ptr[0]>>(7-b->endbit))&1;
+
+  b->endbit++;
+  if(b->endbit>7){
+    b->endbit=0;
+    b->ptr++;
+    b->endbyte++;
+  }
+  return ret;
+
+ overflow:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+long oggpack_bytes(oggpack_buffer *b){
+  return(b->endbyte+(b->endbit+7)/8);
+}
+
+long oggpack_bits(oggpack_buffer *b){
+  return(b->endbyte*8+b->endbit);
+}
+
+long oggpackB_bytes(oggpack_buffer *b){
+  return oggpack_bytes(b);
+}
+
+long oggpackB_bits(oggpack_buffer *b){
+  return oggpack_bits(b);
+}
+
+unsigned char *oggpack_get_buffer(oggpack_buffer *b){
+  return(b->buffer);
+}
+
+unsigned char *oggpackB_get_buffer(oggpack_buffer *b){
+  return oggpack_get_buffer(b);
+}
+
+/* Self test of the bitwise routines; everything else is based on
+   them, so they damned well better be solid. */
+
+#ifdef _V_SELFTEST
+#include <stdio.h>
+
+static int ilog(unsigned int v){
+  int ret=0;
+  while(v){
+    ret++;
+    v>>=1;
+  }
+  return(ret);
+}
+
+oggpack_buffer o;
+oggpack_buffer r;
+
+void report(char *in){
+  fprintf(stderr,"%s",in);
+  exit(1);
+}
+
+void cliptest(unsigned long *b,int vals,int bits,int *comp,int compsize){
+  long bytes,i;
+  unsigned char *buffer;
+
+  oggpack_reset(&o);
+  for(i=0;i<vals;i++)
+    oggpack_write(&o,b[i],bits?bits:ilog(b[i]));
+  buffer=oggpack_get_buffer(&o);
+  bytes=oggpack_bytes(&o);
+  if(bytes!=compsize)report("wrong number of bytes!\n");
+  for(i=0;i<bytes;i++)if(buffer[i]!=comp[i]){
+    for(i=0;i<bytes;i++)fprintf(stderr,"%x %x\n",(int)buffer[i],(int)comp[i]);
+    report("wrote incorrect value!\n");
+  }
+  oggpack_readinit(&r,buffer,bytes);
+  for(i=0;i<vals;i++){
+    int tbit=bits?bits:ilog(b[i]);
+    if(oggpack_look(&r,tbit)==-1)
+      report("out of data!\n");
+    if(oggpack_look(&r,tbit)!=(b[i]&mask[tbit]))
+      report("looked at incorrect value!\n");
+    if(tbit==1)
+      if(oggpack_look1(&r)!=(b[i]&mask[tbit]))
+        report("looked at single bit incorrect value!\n");
+    if(tbit==1){
+      if(oggpack_read1(&r)!=(b[i]&mask[tbit]))
+        report("read incorrect single bit value!\n");
+    }else{
+    if(oggpack_read(&r,tbit)!=(b[i]&mask[tbit]))
+      report("read incorrect value!\n");
+    }
+  }
+  if(oggpack_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+}
+
+void cliptestB(unsigned long *b,int vals,int bits,int *comp,int compsize){
+  long bytes,i;
+  unsigned char *buffer;
+
+  oggpackB_reset(&o);
+  for(i=0;i<vals;i++)
+    oggpackB_write(&o,b[i],bits?bits:ilog(b[i]));
+  buffer=oggpackB_get_buffer(&o);
+  bytes=oggpackB_bytes(&o);
+  if(bytes!=compsize)report("wrong number of bytes!\n");
+  for(i=0;i<bytes;i++)if(buffer[i]!=comp[i]){
+    for(i=0;i<bytes;i++)fprintf(stderr,"%x %x\n",(int)buffer[i],(int)comp[i]);
+    report("wrote incorrect value!\n");
+  }
+  oggpackB_readinit(&r,buffer,bytes);
+  for(i=0;i<vals;i++){
+    int tbit=bits?bits:ilog(b[i]);
+    if(oggpackB_look(&r,tbit)==-1)
+      report("out of data!\n");
+    if(oggpackB_look(&r,tbit)!=(b[i]&mask[tbit]))
+      report("looked at incorrect value!\n");
+    if(tbit==1)
+      if(oggpackB_look1(&r)!=(b[i]&mask[tbit]))
+        report("looked at single bit incorrect value!\n");
+    if(tbit==1){
+      if(oggpackB_read1(&r)!=(b[i]&mask[tbit]))
+        report("read incorrect single bit value!\n");
+    }else{
+    if(oggpackB_read(&r,tbit)!=(b[i]&mask[tbit]))
+      report("read incorrect value!\n");
+    }
+  }
+  if(oggpackB_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+}
+
+int main(void){
+  unsigned char *buffer;
+  long bytes,i;
+  static unsigned long testbuffer1[]=
+    {18,12,103948,4325,543,76,432,52,3,65,4,56,32,42,34,21,1,23,32,546,456,7,
+       567,56,8,8,55,3,52,342,341,4,265,7,67,86,2199,21,7,1,5,1,4};
+  int test1size=43;
+
+  static unsigned long testbuffer2[]=
+    {216531625L,1237861823,56732452,131,3212421,12325343,34547562,12313212,
+       1233432,534,5,346435231,14436467,7869299,76326614,167548585,
+       85525151,0,12321,1,349528352};
+  int test2size=21;
+
+  static unsigned long testbuffer3[]=
+    {1,0,14,0,1,0,12,0,1,0,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,0,0,1,
+       0,1,30,1,1,1,0,0,1,0,0,0,12,0,11,0,1,0,0,1};
+  int test3size=56;
+
+  static unsigned long large[]=
+    {2136531625L,2137861823,56732452,131,3212421,12325343,34547562,12313212,
+       1233432,534,5,2146435231,14436467,7869299,76326614,167548585,
+       85525151,0,12321,1,2146528352};
+
+  int onesize=33;
+  static int one[33]={146,25,44,151,195,15,153,176,233,131,196,65,85,172,47,40,
+                    34,242,223,136,35,222,211,86,171,50,225,135,214,75,172,
+                    223,4};
+  static int oneB[33]={150,101,131,33,203,15,204,216,105,193,156,65,84,85,222,
+                       8,139,145,227,126,34,55,244,171,85,100,39,195,173,18,
+                       245,251,128};
+
+  int twosize=6;
+  static int two[6]={61,255,255,251,231,29};
+  static int twoB[6]={247,63,255,253,249,120};
+
+  int threesize=54;
+  static int three[54]={169,2,232,252,91,132,156,36,89,13,123,176,144,32,254,
+                      142,224,85,59,121,144,79,124,23,67,90,90,216,79,23,83,
+                      58,135,196,61,55,129,183,54,101,100,170,37,127,126,10,
+                      100,52,4,14,18,86,77,1};
+  static int threeB[54]={206,128,42,153,57,8,183,251,13,89,36,30,32,144,183,
+                         130,59,240,121,59,85,223,19,228,180,134,33,107,74,98,
+                         233,253,196,135,63,2,110,114,50,155,90,127,37,170,104,
+                         200,20,254,4,58,106,176,144,0};
+
+  int foursize=38;
+  static int four[38]={18,6,163,252,97,194,104,131,32,1,7,82,137,42,129,11,72,
+                     132,60,220,112,8,196,109,64,179,86,9,137,195,208,122,169,
+                     28,2,133,0,1};
+  static int fourB[38]={36,48,102,83,243,24,52,7,4,35,132,10,145,21,2,93,2,41,
+                        1,219,184,16,33,184,54,149,170,132,18,30,29,98,229,67,
+                        129,10,4,32};
+
+  int fivesize=45;
+  static int five[45]={169,2,126,139,144,172,30,4,80,72,240,59,130,218,73,62,
+                     241,24,210,44,4,20,0,248,116,49,135,100,110,130,181,169,
+                     84,75,159,2,1,0,132,192,8,0,0,18,22};
+  static int fiveB[45]={1,84,145,111,245,100,128,8,56,36,40,71,126,78,213,226,
+                        124,105,12,0,133,128,0,162,233,242,67,152,77,205,77,
+                        172,150,169,129,79,128,0,6,4,32,0,27,9,0};
+
+  int sixsize=7;
+  static int six[7]={17,177,170,242,169,19,148};
+  static int sixB[7]={136,141,85,79,149,200,41};
+
+  /* Test read/write together */
+  /* Later we test against pregenerated bitstreams */
+  oggpack_writeinit(&o);
+
+  fprintf(stderr,"\nSmall preclipped packing (LSb): ");
+  cliptest(testbuffer1,test1size,0,one,onesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nNull bit call (LSb): ");
+  cliptest(testbuffer3,test3size,0,two,twosize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge preclipped packing (LSb): ");
+  cliptest(testbuffer2,test2size,0,three,threesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\n32 bit preclipped packing (LSb): ");
+  oggpack_reset(&o);
+  for(i=0;i<test2size;i++)
+    oggpack_write(&o,large[i],32);
+  buffer=oggpack_get_buffer(&o);
+  bytes=oggpack_bytes(&o);
+  oggpack_readinit(&r,buffer,bytes);
+  for(i=0;i<test2size;i++){
+    if(oggpack_look(&r,32)==-1)report("out of data. failed!");
+    if(oggpack_look(&r,32)!=large[i]){
+      fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpack_look(&r,32),large[i],
+              oggpack_look(&r,32),large[i]);
+      report("read incorrect value!\n");
+    }
+    oggpack_adv(&r,32);
+  }
+  if(oggpack_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSmall unclipped packing (LSb): ");
+  cliptest(testbuffer1,test1size,7,four,foursize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge unclipped packing (LSb): ");
+  cliptest(testbuffer2,test2size,17,five,fivesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSingle bit unclipped packing (LSb): ");
+  cliptest(testbuffer3,test3size,1,six,sixsize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nTesting read past end (LSb): ");
+  oggpack_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  for(i=0;i<64;i++){
+    if(oggpack_read(&r,1)!=0){
+      fprintf(stderr,"failed; got -1 prematurely.\n");
+      exit(1);
+    }
+  }
+  if(oggpack_look(&r,1)!=-1 ||
+     oggpack_read(&r,1)!=-1){
+      fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpack_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  if(oggpack_read(&r,30)!=0 || oggpack_read(&r,16)!=0){
+      fprintf(stderr,"failed 2; got -1 prematurely.\n");
+      exit(1);
+  }
+
+  if(oggpack_look(&r,18)!=0 ||
+     oggpack_look(&r,18)!=0){
+    fprintf(stderr,"failed 3; got -1 prematurely.\n");
+      exit(1);
+  }
+  if(oggpack_look(&r,19)!=-1 ||
+     oggpack_look(&r,19)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  if(oggpack_look(&r,32)!=-1 ||
+     oggpack_look(&r,32)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpack_writeclear(&o);
+  fprintf(stderr,"ok.\n");
+
+  /********** lazy, cut-n-paste retest with MSb packing ***********/
+
+  /* Test read/write together */
+  /* Later we test against pregenerated bitstreams */
+  oggpackB_writeinit(&o);
+
+  fprintf(stderr,"\nSmall preclipped packing (MSb): ");
+  cliptestB(testbuffer1,test1size,0,oneB,onesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nNull bit call (MSb): ");
+  cliptestB(testbuffer3,test3size,0,twoB,twosize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge preclipped packing (MSb): ");
+  cliptestB(testbuffer2,test2size,0,threeB,threesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\n32 bit preclipped packing (MSb): ");
+  oggpackB_reset(&o);
+  for(i=0;i<test2size;i++)
+    oggpackB_write(&o,large[i],32);
+  buffer=oggpackB_get_buffer(&o);
+  bytes=oggpackB_bytes(&o);
+  oggpackB_readinit(&r,buffer,bytes);
+  for(i=0;i<test2size;i++){
+    if(oggpackB_look(&r,32)==-1)report("out of data. failed!");
+    if(oggpackB_look(&r,32)!=large[i]){
+      fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpackB_look(&r,32),large[i],
+              oggpackB_look(&r,32),large[i]);
+      report("read incorrect value!\n");
+    }
+    oggpackB_adv(&r,32);
+  }
+  if(oggpackB_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSmall unclipped packing (MSb): ");
+  cliptestB(testbuffer1,test1size,7,fourB,foursize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge unclipped packing (MSb): ");
+  cliptestB(testbuffer2,test2size,17,fiveB,fivesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSingle bit unclipped packing (MSb): ");
+  cliptestB(testbuffer3,test3size,1,sixB,sixsize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nTesting read past end (MSb): ");
+  oggpackB_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  for(i=0;i<64;i++){
+    if(oggpackB_read(&r,1)!=0){
+      fprintf(stderr,"failed; got -1 prematurely.\n");
+      exit(1);
+    }
+  }
+  if(oggpackB_look(&r,1)!=-1 ||
+     oggpackB_read(&r,1)!=-1){
+      fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpackB_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  if(oggpackB_read(&r,30)!=0 || oggpackB_read(&r,16)!=0){
+      fprintf(stderr,"failed 2; got -1 prematurely.\n");
+      exit(1);
+  }
+
+  if(oggpackB_look(&r,18)!=0 ||
+     oggpackB_look(&r,18)!=0){
+    fprintf(stderr,"failed 3; got -1 prematurely.\n");
+      exit(1);
+  }
+  if(oggpackB_look(&r,19)!=-1 ||
+     oggpackB_look(&r,19)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  if(oggpackB_look(&r,32)!=-1 ||
+     oggpackB_look(&r,32)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpackB_writeclear(&o);
+  fprintf(stderr,"ok.\n\n");
+
+
+  return(0);
+}
+#endif  /* _V_SELFTEST */
+
+#undef BUFFER_INCREMENT
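
To make the LSb-first vs MSb-first distinction described at the top of bitwise.c
concrete, the sketch below (illustrative only, not part of the imported sources) packs
the same two 4-bit values with both writers: the oggpack writer fills a byte from the
least significant bit up, the oggpackB writer from the most significant bit down, so the
resulting bytes are 0x35 and 0x53 respectively.

/* Illustrative only: same writes, different bit order.
   Expected output: LSb: 35  MSb: 53 */
#include <stdio.h>
#include <ogg/ogg.h>

int main(void){
  oggpack_buffer lsb,msb;

  oggpack_writeinit(&lsb);
  oggpack_write(&lsb,5,4);    /* 0101 lands in bits 0..3 */
  oggpack_write(&lsb,3,4);    /* 0011 lands in bits 4..7 */

  oggpackB_writeinit(&msb);
  oggpackB_write(&msb,5,4);   /* 0101 lands in bits 7..4 */
  oggpackB_write(&msb,3,4);   /* 0011 lands in bits 3..0 */

  printf("LSb: %02x  MSb: %02x\n",
         oggpack_get_buffer(&lsb)[0],
         oggpackB_get_buffer(&msb)[0]);

  oggpack_writeclear(&lsb);
  oggpackB_writeclear(&msb);
  return 0;
}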

+ 2111 - 0
modules/theoraplayer/native/ogg/src/framing.c

@@ -0,0 +1,2111 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE Ogg CONTAINER SOURCE CODE.              *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ function: code raw packets into framed OggSquish stream and
+           decode Ogg streams back into raw packets
+ last mod: $Id: framing.c 18758 2013-01-08 16:29:56Z tterribe $
+
+ note: The CRC code is directly derived from public domain code by
+ Ross Williams ([email protected]).  See docs/framing.html
+ for details.
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <ogg/ogg.h>
+
+/* A complete description of Ogg framing exists in docs/framing.html */
+
+int ogg_page_version(const ogg_page *og){
+  return((int)(og->header[4]));
+}
+
+int ogg_page_continued(const ogg_page *og){
+  return((int)(og->header[5]&0x01));
+}
+
+int ogg_page_bos(const ogg_page *og){
+  return((int)(og->header[5]&0x02));
+}
+
+int ogg_page_eos(const ogg_page *og){
+  return((int)(og->header[5]&0x04));
+}
+
+ogg_int64_t ogg_page_granulepos(const ogg_page *og){
+  unsigned char *page=og->header;
+  ogg_int64_t granulepos=page[13]&(0xff);
+  granulepos= (granulepos<<8)|(page[12]&0xff);
+  granulepos= (granulepos<<8)|(page[11]&0xff);
+  granulepos= (granulepos<<8)|(page[10]&0xff);
+  granulepos= (granulepos<<8)|(page[9]&0xff);
+  granulepos= (granulepos<<8)|(page[8]&0xff);
+  granulepos= (granulepos<<8)|(page[7]&0xff);
+  granulepos= (granulepos<<8)|(page[6]&0xff);
+  return(granulepos);
+}
+
+int ogg_page_serialno(const ogg_page *og){
+  return(og->header[14] |
+         (og->header[15]<<8) |
+         (og->header[16]<<16) |
+         (og->header[17]<<24));
+}
+
+long ogg_page_pageno(const ogg_page *og){
+  return(og->header[18] |
+         (og->header[19]<<8) |
+         (og->header[20]<<16) |
+         (og->header[21]<<24));
+}
+
+
+
+/* returns the number of packets that are completed on this page (if
+   the leading packet is begun on a previous page, but ends on this
+   page, it's counted) */
+
+/* NOTE:
+   If a page consists of a packet begun on a previous page, and a new
+   packet begun (but not completed) on this page, the return will be:
+     ogg_page_packets(page)   ==1,
+     ogg_page_continued(page) !=0
+
+   If a page happens to be a single packet that was begun on a
+   previous page, and spans to the next page (in the case of a three or
+   more page packet), the return will be:
+     ogg_page_packets(page)   ==0,
+     ogg_page_continued(page) !=0
+*/
+
+int ogg_page_packets(const ogg_page *og){
+  int i,n=og->header[26],count=0;
+  for(i=0;i<n;i++)
+    if(og->header[27+i]<255)count++;
+  return(count);
+}
+
+
+#if 0
+/* helper to initialize lookup for direct-table CRC (illustrative; we
+   use the static init below) */
+
+static ogg_uint32_t _ogg_crc_entry(unsigned long index){
+  int           i;
+  unsigned long r;
+
+  r = index << 24;
+  for (i=0; i<8; i++)
+    if (r & 0x80000000UL)
+      r = (r << 1) ^ 0x04c11db7; /* The same as the ethernet generator
+                                    polynomial, although we use an
+                                    unreflected alg and an init/final
+                                    of 0, not 0xffffffff */
+    else
+       r<<=1;
+ return (r & 0xffffffffUL);
+}
+#endif
+
+static const ogg_uint32_t crc_lookup[256]={
+  0x00000000,0x04c11db7,0x09823b6e,0x0d4326d9,
+  0x130476dc,0x17c56b6b,0x1a864db2,0x1e475005,
+  0x2608edb8,0x22c9f00f,0x2f8ad6d6,0x2b4bcb61,
+  0x350c9b64,0x31cd86d3,0x3c8ea00a,0x384fbdbd,
+  0x4c11db70,0x48d0c6c7,0x4593e01e,0x4152fda9,
+  0x5f15adac,0x5bd4b01b,0x569796c2,0x52568b75,
+  0x6a1936c8,0x6ed82b7f,0x639b0da6,0x675a1011,
+  0x791d4014,0x7ddc5da3,0x709f7b7a,0x745e66cd,
+  0x9823b6e0,0x9ce2ab57,0x91a18d8e,0x95609039,
+  0x8b27c03c,0x8fe6dd8b,0x82a5fb52,0x8664e6e5,
+  0xbe2b5b58,0xbaea46ef,0xb7a96036,0xb3687d81,
+  0xad2f2d84,0xa9ee3033,0xa4ad16ea,0xa06c0b5d,
+  0xd4326d90,0xd0f37027,0xddb056fe,0xd9714b49,
+  0xc7361b4c,0xc3f706fb,0xceb42022,0xca753d95,
+  0xf23a8028,0xf6fb9d9f,0xfbb8bb46,0xff79a6f1,
+  0xe13ef6f4,0xe5ffeb43,0xe8bccd9a,0xec7dd02d,
+  0x34867077,0x30476dc0,0x3d044b19,0x39c556ae,
+  0x278206ab,0x23431b1c,0x2e003dc5,0x2ac12072,
+  0x128e9dcf,0x164f8078,0x1b0ca6a1,0x1fcdbb16,
+  0x018aeb13,0x054bf6a4,0x0808d07d,0x0cc9cdca,
+  0x7897ab07,0x7c56b6b0,0x71159069,0x75d48dde,
+  0x6b93dddb,0x6f52c06c,0x6211e6b5,0x66d0fb02,
+  0x5e9f46bf,0x5a5e5b08,0x571d7dd1,0x53dc6066,
+  0x4d9b3063,0x495a2dd4,0x44190b0d,0x40d816ba,
+  0xaca5c697,0xa864db20,0xa527fdf9,0xa1e6e04e,
+  0xbfa1b04b,0xbb60adfc,0xb6238b25,0xb2e29692,
+  0x8aad2b2f,0x8e6c3698,0x832f1041,0x87ee0df6,
+  0x99a95df3,0x9d684044,0x902b669d,0x94ea7b2a,
+  0xe0b41de7,0xe4750050,0xe9362689,0xedf73b3e,
+  0xf3b06b3b,0xf771768c,0xfa325055,0xfef34de2,
+  0xc6bcf05f,0xc27dede8,0xcf3ecb31,0xcbffd686,
+  0xd5b88683,0xd1799b34,0xdc3abded,0xd8fba05a,
+  0x690ce0ee,0x6dcdfd59,0x608edb80,0x644fc637,
+  0x7a089632,0x7ec98b85,0x738aad5c,0x774bb0eb,
+  0x4f040d56,0x4bc510e1,0x46863638,0x42472b8f,
+  0x5c007b8a,0x58c1663d,0x558240e4,0x51435d53,
+  0x251d3b9e,0x21dc2629,0x2c9f00f0,0x285e1d47,
+  0x36194d42,0x32d850f5,0x3f9b762c,0x3b5a6b9b,
+  0x0315d626,0x07d4cb91,0x0a97ed48,0x0e56f0ff,
+  0x1011a0fa,0x14d0bd4d,0x19939b94,0x1d528623,
+  0xf12f560e,0xf5ee4bb9,0xf8ad6d60,0xfc6c70d7,
+  0xe22b20d2,0xe6ea3d65,0xeba91bbc,0xef68060b,
+  0xd727bbb6,0xd3e6a601,0xdea580d8,0xda649d6f,
+  0xc423cd6a,0xc0e2d0dd,0xcda1f604,0xc960ebb3,
+  0xbd3e8d7e,0xb9ff90c9,0xb4bcb610,0xb07daba7,
+  0xae3afba2,0xaafbe615,0xa7b8c0cc,0xa379dd7b,
+  0x9b3660c6,0x9ff77d71,0x92b45ba8,0x9675461f,
+  0x8832161a,0x8cf30bad,0x81b02d74,0x857130c3,
+  0x5d8a9099,0x594b8d2e,0x5408abf7,0x50c9b640,
+  0x4e8ee645,0x4a4ffbf2,0x470cdd2b,0x43cdc09c,
+  0x7b827d21,0x7f436096,0x7200464f,0x76c15bf8,
+  0x68860bfd,0x6c47164a,0x61043093,0x65c52d24,
+  0x119b4be9,0x155a565e,0x18197087,0x1cd86d30,
+  0x029f3d35,0x065e2082,0x0b1d065b,0x0fdc1bec,
+  0x3793a651,0x3352bbe6,0x3e119d3f,0x3ad08088,
+  0x2497d08d,0x2056cd3a,0x2d15ebe3,0x29d4f654,
+  0xc5a92679,0xc1683bce,0xcc2b1d17,0xc8ea00a0,
+  0xd6ad50a5,0xd26c4d12,0xdf2f6bcb,0xdbee767c,
+  0xe3a1cbc1,0xe760d676,0xea23f0af,0xeee2ed18,
+  0xf0a5bd1d,0xf464a0aa,0xf9278673,0xfde69bc4,
+  0x89b8fd09,0x8d79e0be,0x803ac667,0x84fbdbd0,
+  0x9abc8bd5,0x9e7d9662,0x933eb0bb,0x97ffad0c,
+  0xafb010b1,0xab710d06,0xa6322bdf,0xa2f33668,
+  0xbcb4666d,0xb8757bda,0xb5365d03,0xb1f740b4};
+
+/* init the encode/decode logical stream state */
+
+int ogg_stream_init(ogg_stream_state *os,int serialno){
+  if(os){
+    memset(os,0,sizeof(*os));
+    os->body_storage=16*1024;
+    os->lacing_storage=1024;
+
+    os->body_data=_ogg_malloc(os->body_storage*sizeof(*os->body_data));
+    os->lacing_vals=_ogg_malloc(os->lacing_storage*sizeof(*os->lacing_vals));
+    os->granule_vals=_ogg_malloc(os->lacing_storage*sizeof(*os->granule_vals));
+
+    if(!os->body_data || !os->lacing_vals || !os->granule_vals){
+      ogg_stream_clear(os);
+      return -1;
+    }
+
+    os->serialno=serialno;
+
+    return(0);
+  }
+  return(-1);
+}
+
+/* async/delayed error detection for the ogg_stream_state */
+int ogg_stream_check(ogg_stream_state *os){
+  if(!os || !os->body_data) return -1;
+  return 0;
+}
+
+/* _clear does not free os, only the non-flat storage within */
+int ogg_stream_clear(ogg_stream_state *os){
+  if(os){
+    if(os->body_data)_ogg_free(os->body_data);
+    if(os->lacing_vals)_ogg_free(os->lacing_vals);
+    if(os->granule_vals)_ogg_free(os->granule_vals);
+
+    memset(os,0,sizeof(*os));
+  }
+  return(0);
+}
+
+int ogg_stream_destroy(ogg_stream_state *os){
+  if(os){
+    ogg_stream_clear(os);
+    _ogg_free(os);
+  }
+  return(0);
+}
+
+/* Helpers for ogg_stream_encode; this keeps the structure and
+   what's happening fairly clear */
+
+static int _os_body_expand(ogg_stream_state *os,long needed){
+  if(os->body_storage-needed<=os->body_fill){
+    long body_storage;
+    void *ret;
+    if(os->body_storage>LONG_MAX-needed){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    body_storage=os->body_storage+needed;
+    if(body_storage<LONG_MAX-1024)body_storage+=1024;
+    ret=_ogg_realloc(os->body_data,body_storage*sizeof(*os->body_data));
+    if(!ret){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    os->body_storage=body_storage;
+    os->body_data=ret;
+  }
+  return 0;
+}
+
+static int _os_lacing_expand(ogg_stream_state *os,long needed){
+  if(os->lacing_storage-needed<=os->lacing_fill){
+    long lacing_storage;
+    void *ret;
+    if(os->lacing_storage>LONG_MAX-needed){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    lacing_storage=os->lacing_storage+needed;
+    if(lacing_storage<LONG_MAX-32)lacing_storage+=32;
+    ret=_ogg_realloc(os->lacing_vals,lacing_storage*sizeof(*os->lacing_vals));
+    if(!ret){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    os->lacing_vals=ret;
+    ret=_ogg_realloc(os->granule_vals,lacing_storage*
+                     sizeof(*os->granule_vals));
+    if(!ret){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    os->granule_vals=ret;
+    os->lacing_storage=lacing_storage;
+  }
+  return 0;
+}
+
+/* checksum the page */
+/* Direct table CRC; note that this will be faster in the future if we
+   perform the checksum simultaneously with other copies */
+
+void ogg_page_checksum_set(ogg_page *og){
+  if(og){
+    ogg_uint32_t crc_reg=0;
+    int i;
+
+    /* safety; needed for API behavior, but not framing code */
+    og->header[22]=0;
+    og->header[23]=0;
+    og->header[24]=0;
+    og->header[25]=0;
+
+    for(i=0;i<og->header_len;i++)
+      crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->header[i]];
+    for(i=0;i<og->body_len;i++)
+      crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->body[i]];
+
+    og->header[22]=(unsigned char)(crc_reg&0xff);
+    og->header[23]=(unsigned char)((crc_reg>>8)&0xff);
+    og->header[24]=(unsigned char)((crc_reg>>16)&0xff);
+    og->header[25]=(unsigned char)((crc_reg>>24)&0xff);
+  }
+}
+
+/* submit data to the internal buffer of the framing engine */
+int ogg_stream_iovecin(ogg_stream_state *os, ogg_iovec_t *iov, int count,
+                       long e_o_s, ogg_int64_t granulepos){
+
+  long bytes = 0, lacing_vals;
+  int i;
+
+  if(ogg_stream_check(os)) return -1;
+  if(!iov) return 0;
+
+  for (i = 0; i < count; ++i){
+    if(iov[i].iov_len>LONG_MAX) return -1;
+    if(bytes>LONG_MAX-(long)iov[i].iov_len) return -1;
+    bytes += (long)iov[i].iov_len;
+  }
+  lacing_vals=bytes/255+1;
+
+  if(os->body_returned){
+    /* advance packet data according to the body_returned pointer. We
+       had to keep it around to return a pointer into the buffer last
+       call */
+
+    os->body_fill-=os->body_returned;
+    if(os->body_fill)
+      memmove(os->body_data,os->body_data+os->body_returned,
+              os->body_fill);
+    os->body_returned=0;
+  }
+
+  /* make sure we have the buffer storage */
+  if(_os_body_expand(os,bytes) || _os_lacing_expand(os,lacing_vals))
+    return -1;
+
+  /* Copy in the submitted packet.  Yes, the copy is a waste; this is
+     the liability of overly clean abstraction for the time being.  It
+     will actually be fairly easy to eliminate the extra copy in the
+     future */
+
+  for (i = 0; i < count; ++i) {
+    memcpy(os->body_data+os->body_fill, iov[i].iov_base, iov[i].iov_len);
+    os->body_fill += (int)iov[i].iov_len;
+  }
+
+  /* Store lacing vals for this packet */
+  for(i=0;i<lacing_vals-1;i++){
+    os->lacing_vals[os->lacing_fill+i]=255;
+    os->granule_vals[os->lacing_fill+i]=os->granulepos;
+  }
+  os->lacing_vals[os->lacing_fill+i]=bytes%255;
+  os->granulepos=os->granule_vals[os->lacing_fill+i]=granulepos;
+
+  /* flag the first segment as the beginning of the packet */
+  os->lacing_vals[os->lacing_fill]|= 0x100;
+
+  os->lacing_fill+=lacing_vals;
+
+  /* for the sake of completeness */
+  os->packetno++;
+
+  if(e_o_s)os->e_o_s=1;
+
+  return(0);
+}
+
+int ogg_stream_packetin(ogg_stream_state *os,ogg_packet *op){
+  ogg_iovec_t iov;
+  iov.iov_base = op->packet;
+  iov.iov_len = op->bytes;
+  return ogg_stream_iovecin(os, &iov, 1, op->e_o_s, op->granulepos);
+}
+
+/* Conditionally flush a page; force==0 will only flush nominal-size
+   pages, force==1 forces us to flush a page regardless of page size
+   so long as there's any data available at all. */
+static int ogg_stream_flush_i(ogg_stream_state *os,ogg_page *og, int force, int nfill){
+  int i;
+  int vals=0;
+  int maxvals=(os->lacing_fill>255?255:os->lacing_fill);
+  int bytes=0;
+  long acc=0;
+  ogg_int64_t granule_pos=-1;
+
+  if(ogg_stream_check(os)) return(0);
+  if(maxvals==0) return(0);
+
+  /* construct a page */
+  /* decide how many segments to include */
+
+  /* If this is the initial header case, the first page must only include
+     the initial header packet */
+  if(os->b_o_s==0){  /* 'initial header page' case */
+    granule_pos=0;
+    for(vals=0;vals<maxvals;vals++){
+      if((os->lacing_vals[vals]&0x0ff)<255){
+        vals++;
+        break;
+      }
+    }
+  }else{
+
+    /* The extra packets_done, packet_just_done logic here attempts to do two things:
+       1) Don't unnecessarily span pages.
+       2) Unless necessary, don't flush pages if there are fewer than four packets on
+          them; this expands page size to reduce unnecessary overhead if incoming packets
+          are large.
+       These are not necessary behaviors, just 'always better than naive flushing'
+       without requiring an application to explicitly request a specific optimized
+       behavior. We'll want an explicit behavior setup pathway eventually as well. */
+
+    int packets_done=0;
+    int packet_just_done=0;
+    for(vals=0;vals<maxvals;vals++){
+      if(acc>nfill && packet_just_done>=4){
+        force=1;
+        break;
+      }
+      acc+=os->lacing_vals[vals]&0x0ff;
+      if((os->lacing_vals[vals]&0xff)<255){
+        granule_pos=os->granule_vals[vals];
+        packet_just_done=++packets_done;
+      }else
+        packet_just_done=0;
+    }
+    if(vals==255)force=1;
+  }
+
+  if(!force) return(0);
+
+  /* construct the header in temp storage */
+  memcpy(os->header,"OggS",4);
+
+  /* stream structure version */
+  os->header[4]=0x00;
+
+  /* continued packet flag? */
+  os->header[5]=0x00;
+  if((os->lacing_vals[0]&0x100)==0)os->header[5]|=0x01;
+  /* first page flag? */
+  if(os->b_o_s==0)os->header[5]|=0x02;
+  /* last page flag? */
+  if(os->e_o_s && os->lacing_fill==vals)os->header[5]|=0x04;
+  os->b_o_s=1;
+
+  /* 64 bits of PCM position */
+  for(i=6;i<14;i++){
+    os->header[i]=(unsigned char)(granule_pos&0xff);
+    granule_pos>>=8;
+  }
+
+  /* 32 bits of stream serial number */
+  {
+    long serialno=os->serialno;
+    for(i=14;i<18;i++){
+      os->header[i]=(unsigned char)(serialno&0xff);
+      serialno>>=8;
+    }
+  }
+
+  /* 32 bits of page counter (we have both counter and page header
+     because this val can roll over) */
+  if(os->pageno==-1)os->pageno=0; /* because someone called
+                                     stream_reset; this would be a
+                                     strange thing to do in an
+                                     encode stream, but it has
+                                     plausible uses */
+  {
+    long pageno=os->pageno++;
+    for(i=18;i<22;i++){
+      os->header[i]=(unsigned char)(pageno&0xff);
+      pageno>>=8;
+    }
+  }
+
+  /* zero for computation; filled in later */
+  os->header[22]=0;
+  os->header[23]=0;
+  os->header[24]=0;
+  os->header[25]=0;
+
+  /* segment table */
+  os->header[26]=(unsigned char)(vals&0xff);
+  for(i=0;i<vals;i++)
+    bytes+=os->header[i+27]=(unsigned char)(os->lacing_vals[i]&0xff);
+
+  /* set pointers in the ogg_page struct */
+  og->header=os->header;
+  og->header_len=os->header_fill=vals+27;
+  og->body=os->body_data+os->body_returned;
+  og->body_len=bytes;
+
+  /* advance the lacing data and set the body_returned pointer */
+
+  os->lacing_fill-=vals;
+  memmove(os->lacing_vals,os->lacing_vals+vals,os->lacing_fill*sizeof(*os->lacing_vals));
+  memmove(os->granule_vals,os->granule_vals+vals,os->lacing_fill*sizeof(*os->granule_vals));
+  os->body_returned+=bytes;
+
+  /* calculate the checksum */
+
+  ogg_page_checksum_set(og);
+
+  /* done */
+  return(1);
+}
+
+/* This will flush remaining packets into a page (returning nonzero),
+   even if there is not enough data to trigger a flush normally
+   (undersized page). If there are no packets or partial packets to
+   flush, ogg_stream_flush returns 0.  Note that ogg_stream_flush will
+   try to flush a normal sized page like ogg_stream_pageout; a call to
+   ogg_stream_flush does not guarantee that all packets have flushed.
+   Only a return value of 0 from ogg_stream_flush indicates all packet
+   data is flushed into pages.
+
+   since ogg_stream_flush will flush the last page in a stream even if
+   it's undersized, you almost certainly want to use ogg_stream_pageout
+   (and *not* ogg_stream_flush) unless you specifically need to flush
+   a page regardless of size in the middle of a stream. */
+
+int ogg_stream_flush(ogg_stream_state *os,ogg_page *og){
+  return ogg_stream_flush_i(os,og,1,4096);
+}
+
+/* Like the above, but an argument is provided to adjust the nominal
+   page size for applications which are smart enough to provide their
+   own delay based flushing */
+
+int ogg_stream_flush_fill(ogg_stream_state *os,ogg_page *og, int nfill){
+  return ogg_stream_flush_i(os,og,1,nfill);
+}
+
+/* This constructs pages from buffered packet segments.  The pointers
+returned are to static buffers; do not free. The returned buffers are
+good only until the next call (using the same ogg_stream_state) */
+
+int ogg_stream_pageout(ogg_stream_state *os, ogg_page *og){
+  int force=0;
+  if(ogg_stream_check(os)) return 0;
+
+  if((os->e_o_s&&os->lacing_fill) ||         /* 'we're done, now flush' case */
+     (os->lacing_fill&&!os->b_o_s))           /* 'initial header page' case */
+    force=1;
+
+  return(ogg_stream_flush_i(os,og,force,4096));
+}
+
+/* Like the above, but an argument is provided to adjust the nominal
+page size for applications which are smart enough to provide their
+own delay based flushing */
+
+int ogg_stream_pageout_fill(ogg_stream_state *os, ogg_page *og, int nfill){
+  int force=0;
+  if(ogg_stream_check(os)) return 0;
+
+  if((os->e_o_s&&os->lacing_fill) ||         /* 'we're done, now flush' case */
+     (os->lacing_fill&&!os->b_o_s))           /* 'initial header page' case */
+    force=1;
+
+  return(ogg_stream_flush_i(os,og,force,nfill));
+}
+
+int ogg_stream_eos(ogg_stream_state *os){
+  if(ogg_stream_check(os)) return 1;
+  return os->e_o_s;
+}
+
+/* DECODING PRIMITIVES: packet streaming layer **********************/
+
+/* This has two layers to place more of the multi-serialno and paging
+   control in the application's hands.  First, we expose a data buffer
+   using ogg_sync_buffer().  The app either copies into the
+   buffer, or passes it directly to read(), etc.  We then call
+   ogg_sync_wrote() to tell how many bytes we just added.
+
+   Pages are returned (pointers into the buffer in ogg_sync_state)
+   by ogg_sync_pageout().  The page is then submitted to
+   ogg_stream_pagein() along with the appropriate
+   ogg_stream_state* (ie, matching serialno).  We then get raw
+   packets out calling ogg_stream_packetout() with a
+   ogg_stream_state. */
+
+/* initialize the struct to a known state */
+int ogg_sync_init(ogg_sync_state *oy){
+  if(oy){
+    oy->storage = -1; /* used as a readiness flag */
+    memset(oy,0,sizeof(*oy));
+  }
+  return(0);
+}
+
+/* clear non-flat storage within */
+int ogg_sync_clear(ogg_sync_state *oy){
+  if(oy){
+    if(oy->data)_ogg_free(oy->data);
+    memset(oy,0,sizeof(*oy));
+  }
+  return(0);
+}
+
+int ogg_sync_destroy(ogg_sync_state *oy){
+  if(oy){
+    ogg_sync_clear(oy);
+    _ogg_free(oy);
+  }
+  return(0);
+}
+
+int ogg_sync_check(ogg_sync_state *oy){
+  if(oy->storage<0) return -1;
+  return 0;
+}
+
+char *ogg_sync_buffer(ogg_sync_state *oy, long size){
+  if(ogg_sync_check(oy)) return NULL;
+
+  /* first, clear out any space that has been previously returned */
+  if(oy->returned){
+    oy->fill-=oy->returned;
+    if(oy->fill>0)
+      memmove(oy->data,oy->data+oy->returned,oy->fill);
+    oy->returned=0;
+  }
+
+  if(size>oy->storage-oy->fill){
+    /* We need to extend the internal buffer */
+    long newsize=size+oy->fill+4096; /* an extra page to be nice */
+    void *ret;
+
+    if(oy->data)
+      ret=_ogg_realloc(oy->data,newsize);
+    else
+      ret=_ogg_malloc(newsize);
+    if(!ret){
+      ogg_sync_clear(oy);
+      return NULL;
+    }
+    oy->data=ret;
+    oy->storage=newsize;
+  }
+
+  /* expose a segment at least as large as requested at the fill mark */
+  return((char *)oy->data+oy->fill);
+}
+
+int ogg_sync_wrote(ogg_sync_state *oy, long bytes){
+  if(ogg_sync_check(oy))return -1;
+  if(oy->fill+bytes>oy->storage)return -1;
+  oy->fill+=bytes;
+  return(0);
+}
+
+/* sync the stream.  This is meant to be useful for finding page
+   boundaries.
+
+   return values for this:
+  -n) skipped n bytes
+   0) page not ready; more data (no bytes skipped)
+   n) page synced at current location; page length n bytes
+
+*/
+
+long ogg_sync_pageseek(ogg_sync_state *oy,ogg_page *og){
+  unsigned char *page=oy->data+oy->returned;
+  unsigned char *next;
+  long bytes=oy->fill-oy->returned;
+
+  if(ogg_sync_check(oy))return 0;
+
+  if(oy->headerbytes==0){
+    int headerbytes,i;
+    if(bytes<27)return(0); /* not enough for a header */
+
+    /* verify capture pattern */
+    if(memcmp(page,"OggS",4))goto sync_fail;
+
+    headerbytes=page[26]+27;
+    if(bytes<headerbytes)return(0); /* not enough for header + seg table */
+
+    /* count up body length in the segment table */
+
+    for(i=0;i<page[26];i++)
+      oy->bodybytes+=page[27+i];
+    oy->headerbytes=headerbytes;
+  }
+
+  if(oy->bodybytes+oy->headerbytes>bytes)return(0);
+
+  /* The whole test page is buffered.  Verify the checksum */
+  {
+    /* Grab the checksum bytes, set the header field to zero */
+    char chksum[4];
+    ogg_page log;
+
+    memcpy(chksum,page+22,4);
+    memset(page+22,0,4);
+
+    /* set up a temp page struct and recompute the checksum */
+    log.header=page;
+    log.header_len=oy->headerbytes;
+    log.body=page+oy->headerbytes;
+    log.body_len=oy->bodybytes;
+    ogg_page_checksum_set(&log);
+
+    /* Compare */
+    if(memcmp(chksum,page+22,4)){
+      /* D'oh.  Mismatch! Corrupt page (or miscapture and not a page
+         at all) */
+      /* replace the computed checksum with the one actually read in */
+      memcpy(page+22,chksum,4);
+
+      /* Bad checksum. Lose sync */
+      goto sync_fail;
+    }
+  }
+
+  /* yes, have a whole page all ready to go */
+  {
+    unsigned char *page=oy->data+oy->returned;
+    long bytes;
+
+    if(og){
+      og->header=page;
+      og->header_len=oy->headerbytes;
+      og->body=page+oy->headerbytes;
+      og->body_len=oy->bodybytes;
+    }
+
+    oy->unsynced=0;
+    oy->returned+=(bytes=oy->headerbytes+oy->bodybytes);
+    oy->headerbytes=0;
+    oy->bodybytes=0;
+    return(bytes);
+  }
+
+ sync_fail:
+
+  oy->headerbytes=0;
+  oy->bodybytes=0;
+
+  /* search for possible capture */
+  next=memchr(page+1,'O',bytes-1);
+  if(!next)
+    next=oy->data+oy->fill;
+
+  oy->returned=(int)(next-oy->data);
+  return((long)-(next-page));
+}
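+
+/* Editorial usage sketch (not part of libogg): applying the return value
+   convention documented above to locate the byte offset of the next page
+   while seeking.  The stdio FILE* source and the never-defined guard macro
+   are assumptions of this sketch, not part of the API. */
+#ifdef OGG_FRAMING_USAGE_EXAMPLE
+#include <stdio.h>
+static ogg_int64_t example_find_next_page(ogg_sync_state *oy,FILE *in,
+                                          ogg_int64_t offset,ogg_page *og){
+  for(;;){
+    long ret=ogg_sync_pageseek(oy,og);
+    if(ret>0)return offset;            /* page begins at 'offset', ret bytes */
+    if(ret<0){offset-=ret;continue;}   /* skipped -ret bytes before capture  */
+    /* ret==0: need more data */
+    {
+      char *buf=ogg_sync_buffer(oy,4096);
+      long  n=buf?(long)fread(buf,1,4096,in):0;
+      if(n<=0)return -1;               /* EOF or error: no further pages     */
+      ogg_sync_wrote(oy,n);
+    }
+  }
+}
+#endif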
+
+/* sync the stream and get a page.  Keep trying until we find a page.
+   Suppress 'sync errors' after reporting the first.
+
+   return values:
+   -1) recapture (hole in data)
+    0) need more data
+    1) page returned
+
+   Returns pointers into buffered data; invalidated by next call to
+   _stream, _clear, _init, or _buffer */
+
+int ogg_sync_pageout(ogg_sync_state *oy, ogg_page *og){
+
+  if(ogg_sync_check(oy))return 0;
+
+  /* all we need to do is verify a page at the head of the stream
+     buffer.  If it doesn't verify, we look for the next potential
+     frame */
+
+  for(;;){
+    long ret=ogg_sync_pageseek(oy,og);
+    if(ret>0){
+      /* have a page */
+      return(1);
+    }
+    if(ret==0){
+      /* need more data */
+      return(0);
+    }
+
+    /* head did not start a synced page... skipped some bytes */
+    if(!oy->unsynced){
+      oy->unsynced=1;
+      return(-1);
+    }
+
+    /* loop. keep looking */
+
+  }
+}
+
+/* add the incoming page to the stream state; we decompose the page
+   into packet segments here as well. */
+
+int ogg_stream_pagein(ogg_stream_state *os, ogg_page *og){
+  unsigned char *header=og->header;
+  unsigned char *body=og->body;
+  long           bodysize=og->body_len;
+  int            segptr=0;
+
+  int version=ogg_page_version(og);
+  int continued=ogg_page_continued(og);
+  int bos=ogg_page_bos(og);
+  int eos=ogg_page_eos(og);
+  ogg_int64_t granulepos=ogg_page_granulepos(og);
+  int serialno=ogg_page_serialno(og);
+  long pageno=ogg_page_pageno(og);
+  int segments=header[26];
+
+  if(ogg_stream_check(os)) return -1;
+
+  /* clean up 'returned data' */
+  {
+    long lr=os->lacing_returned;
+    long br=os->body_returned;
+
+    /* body data */
+    if(br){
+      os->body_fill-=br;
+      if(os->body_fill)
+        memmove(os->body_data,os->body_data+br,os->body_fill);
+      os->body_returned=0;
+    }
+
+    if(lr){
+      /* segment table */
+      if(os->lacing_fill-lr){
+        memmove(os->lacing_vals,os->lacing_vals+lr,
+                (os->lacing_fill-lr)*sizeof(*os->lacing_vals));
+        memmove(os->granule_vals,os->granule_vals+lr,
+                (os->lacing_fill-lr)*sizeof(*os->granule_vals));
+      }
+      os->lacing_fill-=lr;
+      os->lacing_packet-=lr;
+      os->lacing_returned=0;
+    }
+  }
+
+  /* check the serial number */
+  if(serialno!=os->serialno)return(-1);
+  if(version>0)return(-1);
+
+  if(_os_lacing_expand(os,segments+1)) return -1;
+
+  /* are we in sequence? */
+  if(pageno!=os->pageno){
+    int i;
+
+    /* unroll previous partial packet (if any) */
+    for(i=os->lacing_packet;i<os->lacing_fill;i++)
+      os->body_fill-=os->lacing_vals[i]&0xff;
+    os->lacing_fill=os->lacing_packet;
+
+    /* make a note of dropped data in segment table */
+    if(os->pageno!=-1){
+      os->lacing_vals[os->lacing_fill++]=0x400;
+      os->lacing_packet++;
+    }
+  }
+
+  /* are we a 'continued packet' page?  If so, we may need to skip
+     some segments */
+  if(continued){
+    if(os->lacing_fill<1 ||
+       os->lacing_vals[os->lacing_fill-1]==0x400){
+      bos=0;
+      for(;segptr<segments;segptr++){
+        int val=header[27+segptr];
+        body+=val;
+        bodysize-=val;
+        if(val<255){
+          segptr++;
+          break;
+        }
+      }
+    }
+  }
+
+  if(bodysize){
+    if(_os_body_expand(os,bodysize)) return -1;
+    memcpy(os->body_data+os->body_fill,body,bodysize);
+    os->body_fill+=bodysize;
+  }
+
+  {
+    int saved=-1;
+    while(segptr<segments){
+      int val=header[27+segptr];
+      os->lacing_vals[os->lacing_fill]=val;
+      os->granule_vals[os->lacing_fill]=-1;
+
+      if(bos){
+        os->lacing_vals[os->lacing_fill]|=0x100;
+        bos=0;
+      }
+
+      if(val<255)saved=os->lacing_fill;
+
+      os->lacing_fill++;
+      segptr++;
+
+      if(val<255)os->lacing_packet=os->lacing_fill;
+    }
+
+    /* set the granulepos on the last granuleval of the last full packet */
+    if(saved!=-1){
+      os->granule_vals[saved]=granulepos;
+    }
+
+  }
+
+  if(eos){
+    os->e_o_s=1;
+    if(os->lacing_fill>0)
+      os->lacing_vals[os->lacing_fill-1]|=0x200;
+  }
+
+  os->pageno=pageno+1;
+
+  return(0);
+}
+
+/* clear things to an initial state.  Good to call, eg, before seeking */
+int ogg_sync_reset(ogg_sync_state *oy){
+  if(ogg_sync_check(oy))return -1;
+
+  oy->fill=0;
+  oy->returned=0;
+  oy->unsynced=0;
+  oy->headerbytes=0;
+  oy->bodybytes=0;
+  return(0);
+}
+
+int ogg_stream_reset(ogg_stream_state *os){
+  if(ogg_stream_check(os)) return -1;
+
+  os->body_fill=0;
+  os->body_returned=0;
+
+  os->lacing_fill=0;
+  os->lacing_packet=0;
+  os->lacing_returned=0;
+
+  os->header_fill=0;
+
+  os->e_o_s=0;
+  os->b_o_s=0;
+  os->pageno=-1;
+  os->packetno=0;
+  os->granulepos=0;
+
+  return(0);
+}
+
+int ogg_stream_reset_serialno(ogg_stream_state *os,int serialno){
+  if(ogg_stream_check(os)) return -1;
+  ogg_stream_reset(os);
+  os->serialno=serialno;
+  return(0);
+}
+
+static int _packetout(ogg_stream_state *os,ogg_packet *op,int adv){
+
+  /* The last part of decode. We have the stream broken into packet
+     segments.  Now we need to group them into packets (or return the
+     out of sync markers) */
+
+  int ptr=os->lacing_returned;
+
+  if(os->lacing_packet<=ptr)return(0);
+
+  if(os->lacing_vals[ptr]&0x400){
+    /* we need to tell the codec there's a gap; it might need to
+       handle previous packet dependencies. */
+    os->lacing_returned++;
+    os->packetno++;
+    return(-1);
+  }
+
+  if(!op && !adv)return(1); /* just using peek as an inexpensive way
+                               to ask if there's a whole packet
+                               waiting */
+
+  /* Gather the whole packet. We'll have no holes or a partial packet */
+  {
+    int size=os->lacing_vals[ptr]&0xff;
+    long bytes=size;
+    int eos=os->lacing_vals[ptr]&0x200; /* last packet of the stream? */
+    int bos=os->lacing_vals[ptr]&0x100; /* first packet of the stream? */
+
+    while(size==255){
+      int val=os->lacing_vals[++ptr];
+      size=val&0xff;
+      if(val&0x200)eos=0x200;
+      bytes+=size;
+    }
+
+    if(op){
+      op->e_o_s=eos;
+      op->b_o_s=bos;
+      op->packet=os->body_data+os->body_returned;
+      op->packetno=os->packetno;
+      op->granulepos=os->granule_vals[ptr];
+      op->bytes=bytes;
+    }
+
+    if(adv){
+      os->body_returned+=bytes;
+      os->lacing_returned=ptr+1;
+      os->packetno++;
+    }
+  }
+  return(1);
+}
+
+int ogg_stream_packetout(ogg_stream_state *os,ogg_packet *op){
+  if(ogg_stream_check(os)) return 0;
+  return _packetout(os,op,1);
+}
+
+int ogg_stream_packetpeek(ogg_stream_state *os,ogg_packet *op){
+  if(ogg_stream_check(os)) return 0;
+  return _packetout(os,op,0);
+}
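+
+/* Editorial usage sketch (not part of libogg): draining packets from a
+   stream after ogg_stream_pagein(), distinguishing the three return values
+   of ogg_stream_packetout().  decode_packet() is a hypothetical codec hook
+   and the guard macro is never defined. */
+#ifdef OGG_FRAMING_USAGE_EXAMPLE
+extern void decode_packet(ogg_packet *op); /* hypothetical consumer */
+static void example_drain_packets(ogg_stream_state *os){
+  ogg_packet op;
+  for(;;){
+    int ret=ogg_stream_packetout(os,&op);
+    if(ret==0)break;     /* no complete packet buffered; feed more pages in  */
+    if(ret<0)continue;   /* hole in the data (lost page); packet was skipped */
+    decode_packet(&op);  /* ret==1: 'op' points into 'os' until next call    */
+  }
+}
+#endif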
+
+void ogg_packet_clear(ogg_packet *op) {
+  _ogg_free(op->packet);
+  memset(op, 0, sizeof(*op));
+}
+
+#ifdef _V_SELFTEST
+#include <stdio.h>
+
+ogg_stream_state os_en, os_de;
+ogg_sync_state oy;
+
+void checkpacket(ogg_packet *op,long len, int no, long pos){
+  long j;
+  static int sequence=0;
+  static int lastno=0;
+
+  if(op->bytes!=len){
+    fprintf(stderr,"incorrect packet length (%ld != %ld)!\n",op->bytes,len);
+    exit(1);
+  }
+  if(op->granulepos!=pos){
+    fprintf(stderr,"incorrect packet granpos (%ld != %ld)!\n",(long)op->granulepos,pos);
+    exit(1);
+  }
+
+  /* packet number just follows sequence/gap; adjust the input number
+     for that */
+  if(no==0){
+    sequence=0;
+  }else{
+    sequence++;
+    if(no>lastno+1)
+      sequence++;
+  }
+  lastno=no;
+  if(op->packetno!=sequence){
+    fprintf(stderr,"incorrect packet sequence %ld != %d\n",
+            (long)(op->packetno),sequence);
+    exit(1);
+  }
+
+  /* Test data */
+  for(j=0;j<op->bytes;j++)
+    if(op->packet[j]!=((j+no)&0xff)){
+      fprintf(stderr,"body data mismatch (1) at pos %ld: %x!=%lx!\n\n",
+              j,op->packet[j],(j+no)&0xff);
+      exit(1);
+    }
+}
+
+void check_page(unsigned char *data,const int *header,ogg_page *og){
+  long j;
+  /* Test data */
+  for(j=0;j<og->body_len;j++)
+    if(og->body[j]!=data[j]){
+      fprintf(stderr,"body data mismatch (2) at pos %ld: %x!=%x!\n\n",
+              j,data[j],og->body[j]);
+      exit(1);
+    }
+
+  /* Test header */
+  for(j=0;j<og->header_len;j++){
+    if(og->header[j]!=header[j]){
+      fprintf(stderr,"header content mismatch at pos %ld:\n",j);
+      for(j=0;j<header[26]+27;j++)
+        fprintf(stderr," (%ld)%02x:%02x",j,header[j],og->header[j]);
+      fprintf(stderr,"\n");
+      exit(1);
+    }
+  }
+  if(og->header_len!=header[26]+27){
+    fprintf(stderr,"header length incorrect! (%ld!=%d)\n",
+            og->header_len,header[26]+27);
+    exit(1);
+  }
+}
+
+void print_header(ogg_page *og){
+  int j;
+  fprintf(stderr,"\nHEADER:\n");
+  fprintf(stderr,"  capture: %c %c %c %c  version: %d  flags: %x\n",
+          og->header[0],og->header[1],og->header[2],og->header[3],
+          (int)og->header[4],(int)og->header[5]);
+
+  fprintf(stderr,"  granulepos: %d  serialno: %d  pageno: %ld\n",
+          (og->header[9]<<24)|(og->header[8]<<16)|
+          (og->header[7]<<8)|og->header[6],
+          (og->header[17]<<24)|(og->header[16]<<16)|
+          (og->header[15]<<8)|og->header[14],
+          ((long)(og->header[21])<<24)|(og->header[20]<<16)|
+          (og->header[19]<<8)|og->header[18]);
+
+  fprintf(stderr,"  checksum: %02x:%02x:%02x:%02x\n  segments: %d (",
+          (int)og->header[22],(int)og->header[23],
+          (int)og->header[24],(int)og->header[25],
+          (int)og->header[26]);
+
+  for(j=27;j<og->header_len;j++)
+    fprintf(stderr,"%d ",(int)og->header[j]);
+  fprintf(stderr,")\n\n");
+}
+
+void copy_page(ogg_page *og){
+  unsigned char *temp=_ogg_malloc(og->header_len);
+  memcpy(temp,og->header,og->header_len);
+  og->header=temp;
+
+  temp=_ogg_malloc(og->body_len);
+  memcpy(temp,og->body,og->body_len);
+  og->body=temp;
+}
+
+void free_page(ogg_page *og){
+  _ogg_free (og->header);
+  _ogg_free (og->body);
+}
+
+void error(void){
+  fprintf(stderr,"error!\n");
+  exit(1);
+}
+
+/* 17 only */
+const int head1_0[] = {0x4f,0x67,0x67,0x53,0,0x06,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0x15,0xed,0xec,0x91,
+                       1,
+                       17};
+
+/* 17, 254, 255, 256, 500, 510, 600 byte, pad */
+const int head1_1[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0x59,0x10,0x6c,0x2c,
+                       1,
+                       17};
+const int head2_1[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x18,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x89,0x33,0x85,0xce,
+                       13,
+                       254,255,0,255,1,255,245,255,255,0,
+                       255,255,90};
+
+/* nil packets; beginning,middle,end */
+const int head1_2[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+const int head2_2[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x28,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x5c,0x3f,0x66,0xcb,
+                       17,
+                       17,254,255,0,0,255,1,0,255,245,255,255,0,
+                       255,255,90,0};
+
+/* large initial packet */
+const int head1_3[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0x01,0x27,0x31,0xaa,
+                       18,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,255,10};
+
+const int head2_3[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x08,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x7f,0x4e,0x8a,0xd2,
+                       4,
+                       255,4,255,0};
+
+
+/* continuing packet test */
+const int head1_4[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_4[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0xf8,0x3c,0x19,0x79,
+                       255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255};
+
+const int head3_4[] = {0x4f,0x67,0x67,0x53,0,0x05,
+                       0x07,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0x38,0xe6,0xb6,0x28,
+                       6,
+                       255,220,255,4,255,0};
+
+
+/* spill expansion test */
+const int head1_4b[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                        0x01,0x02,0x03,0x04,0,0,0,0,
+                        0xff,0x7b,0x23,0x17,
+                        1,
+                        0};
+
+const int head2_4b[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                        0x07,0x10,0x00,0x00,0x00,0x00,0x00,0x00,
+                        0x01,0x02,0x03,0x04,1,0,0,0,
+                        0xce,0x8f,0x17,0x1a,
+                        23,
+                        255,255,255,255,255,255,255,255,
+                        255,255,255,255,255,255,255,255,255,10,255,4,255,0,0};
+
+
+const int head3_4b[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                        0x07,0x14,0x00,0x00,0x00,0x00,0x00,0x00,
+                        0x01,0x02,0x03,0x04,2,0,0,0,
+                        0x9b,0xb2,0x50,0xa1,
+                        1,
+                        0};
+
+/* page with the 255 segment limit */
+const int head1_5[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_5[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0x07,0xfc,0x03,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0xed,0x2a,0x2e,0xa7,
+                       255,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10};
+
+const int head3_5[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x00,0x04,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0x6c,0x3b,0x82,0x3d,
+                       1,
+                       50};
+
+
+/* packet that overspans over an entire page */
+const int head1_6[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_6[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0x07,0x04,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x68,0x22,0x7c,0x3d,
+                       255,
+                       100,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255};
+
+const int head3_6[] = {0x4f,0x67,0x67,0x53,0,0x01,
+                       0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0xf4,0x87,0xba,0xf3,
+                       255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255};
+
+const int head4_6[] = {0x4f,0x67,0x67,0x53,0,0x05,
+                       0x07,0x10,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,3,0,0,0,
+                       0xf7,0x2f,0x6c,0x60,
+                       5,
+                       254,255,4,255,0};
+
+/* packet that overspans over an entire page */
+const int head1_7[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_7[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0x07,0x04,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x68,0x22,0x7c,0x3d,
+                       255,
+                       100,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255};
+
+const int head3_7[] = {0x4f,0x67,0x67,0x53,0,0x05,
+                       0x07,0x08,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0xd4,0xe0,0x60,0xe5,
+                       1,
+                       0};
+
+void test_pack(const int *pl, const int **headers, int byteskip,
+               int pageskip, int packetskip){
+  unsigned char *data=_ogg_malloc(1024*1024); /* for scripted test cases only */
+  long inptr=0;
+  long outptr=0;
+  long deptr=0;
+  long depacket=0;
+  long granule_pos=7,pageno=0;
+  int i,j,packets,pageout=pageskip;
+  int eosflag=0;
+  int bosflag=0;
+
+  int byteskipcount=0;
+
+  ogg_stream_reset(&os_en);
+  ogg_stream_reset(&os_de);
+  ogg_sync_reset(&oy);
+
+  for(packets=0;packets<packetskip;packets++)
+    depacket+=pl[packets];
+
+  for(packets=0;;packets++)if(pl[packets]==-1)break;
+
+  for(i=0;i<packets;i++){
+    /* construct a test packet */
+    ogg_packet op;
+    int len=pl[i];
+
+    op.packet=data+inptr;
+    op.bytes=len;
+    op.e_o_s=(pl[i+1]<0?1:0);
+    op.granulepos=granule_pos;
+
+    granule_pos+=1024;
+
+    for(j=0;j<len;j++)data[inptr++]=i+j;
+
+    /* submit the test packet */
+    ogg_stream_packetin(&os_en,&op);
+
+    /* retrieve any finished pages */
+    {
+      ogg_page og;
+
+      while(ogg_stream_pageout(&os_en,&og)){
+        /* We have a page.  Check it carefully */
+
+        fprintf(stderr,"%ld, ",pageno);
+
+        if(headers[pageno]==NULL){
+          fprintf(stderr,"coded too many pages!\n");
+          exit(1);
+        }
+
+        check_page(data+outptr,headers[pageno],&og);
+
+        outptr+=og.body_len;
+        pageno++;
+        if(pageskip){
+          bosflag=1;
+          pageskip--;
+          deptr+=og.body_len;
+        }
+
+        /* have a complete page; submit it to sync/decode */
+
+        {
+          ogg_page og_de;
+          ogg_packet op_de,op_de2;
+          char *buf=ogg_sync_buffer(&oy,og.header_len+og.body_len);
+          char *next=buf;
+          byteskipcount+=og.header_len;
+          if(byteskipcount>byteskip){
+            memcpy(next,og.header,byteskipcount-byteskip);
+            next+=byteskipcount-byteskip;
+            byteskipcount=byteskip;
+          }
+
+          byteskipcount+=og.body_len;
+          if(byteskipcount>byteskip){
+            memcpy(next,og.body,byteskipcount-byteskip);
+            next+=byteskipcount-byteskip;
+            byteskipcount=byteskip;
+          }
+
+          ogg_sync_wrote(&oy,next-buf);
+
+          while(1){
+            int ret=ogg_sync_pageout(&oy,&og_de);
+            if(ret==0)break;
+            if(ret<0)continue;
+            /* got a page.  Happy happy.  Verify that it's good. */
+
+            fprintf(stderr,"(%d), ",pageout);
+
+            check_page(data+deptr,headers[pageout],&og_de);
+            deptr+=og_de.body_len;
+            pageout++;
+
+            /* submit it to deconstitution */
+            ogg_stream_pagein(&os_de,&og_de);
+
+            /* packets out? */
+            while(ogg_stream_packetpeek(&os_de,&op_de2)>0){
+              ogg_stream_packetpeek(&os_de,NULL);
+              ogg_stream_packetout(&os_de,&op_de); /* just catching them all */
+
+              /* verify peek and out match */
+              if(memcmp(&op_de,&op_de2,sizeof(op_de))){
+                fprintf(stderr,"packetout != packetpeek! pos=%ld\n",
+                        depacket);
+                exit(1);
+              }
+
+              /* verify the packet! */
+              /* check data */
+              if(memcmp(data+depacket,op_de.packet,op_de.bytes)){
+                fprintf(stderr,"packet data mismatch in decode! pos=%ld\n",
+                        depacket);
+                exit(1);
+              }
+              /* check bos flag */
+              if(bosflag==0 && op_de.b_o_s==0){
+                fprintf(stderr,"b_o_s flag not set on packet!\n");
+                exit(1);
+              }
+              if(bosflag && op_de.b_o_s){
+                fprintf(stderr,"b_o_s flag incorrectly set on packet!\n");
+                exit(1);
+              }
+              bosflag=1;
+              depacket+=op_de.bytes;
+
+              /* check eos flag */
+              if(eosflag){
+                fprintf(stderr,"Multiple decoded packets with eos flag!\n");
+                exit(1);
+              }
+
+              if(op_de.e_o_s)eosflag=1;
+
+              /* check granulepos flag */
+              if(op_de.granulepos!=-1){
+                fprintf(stderr," granule:%ld ",(long)op_de.granulepos);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  _ogg_free(data);
+  if(headers[pageno]!=NULL){
+    fprintf(stderr,"did not write last page!\n");
+    exit(1);
+  }
+  if(headers[pageout]!=NULL){
+    fprintf(stderr,"did not decode last page!\n");
+    exit(1);
+  }
+  if(inptr!=outptr){
+    fprintf(stderr,"encoded page data incomplete!\n");
+    exit(1);
+  }
+  if(inptr!=deptr){
+    fprintf(stderr,"decoded page data incomplete!\n");
+    exit(1);
+  }
+  if(inptr!=depacket){
+    fprintf(stderr,"decoded packet data incomplete!\n");
+    exit(1);
+  }
+  if(!eosflag){
+    fprintf(stderr,"Never got a packet with EOS set!\n");
+    exit(1);
+  }
+  fprintf(stderr,"ok.\n");
+}
+
+int main(void){
+
+  ogg_stream_init(&os_en,0x04030201);
+  ogg_stream_init(&os_de,0x04030201);
+  ogg_sync_init(&oy);
+
+  /* Exercise each code path in the framing code.  Also verify that
+     the checksums are working.  */
+
+  {
+    /* 17 only */
+    const int packets[]={17, -1};
+    const int *headret[]={head1_0,NULL};
+
+    fprintf(stderr,"testing single page encoding... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* 17, 254, 255, 256, 500, 510, 600 byte, pad */
+    const int packets[]={17, 254, 255, 256, 500, 510, 600, -1};
+    const int *headret[]={head1_1,head2_1,NULL};
+
+    fprintf(stderr,"testing basic page encoding... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* nil packets; beginning,middle,end */
+    const int packets[]={0,17, 254, 255, 0, 256, 0, 500, 510, 600, 0, -1};
+    const int *headret[]={head1_2,head2_2,NULL};
+
+    fprintf(stderr,"testing basic nil packets... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* large initial packet */
+    const int packets[]={4345,259,255,-1};
+    const int *headret[]={head1_3,head2_3,NULL};
+
+    fprintf(stderr,"testing initial-packet lacing > 4k... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* continuing packet test; with page spill expansion, we have to
+       overflow the lacing table. */
+    const int packets[]={0,65500,259,255,-1};
+    const int *headret[]={head1_4,head2_4,head3_4,NULL};
+
+    fprintf(stderr,"testing single packet page span... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* spill expand packet test */
+    const int packets[]={0,4345,259,255,0,0,-1};
+    const int *headret[]={head1_4b,head2_4b,head3_4b,NULL};
+
+    fprintf(stderr,"testing page spill expansion... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  /* page with the 255 segment limit */
+  {
+
+    const int packets[]={0,10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,50,-1};
+    const int *headret[]={head1_5,head2_5,head3_5,NULL};
+
+    fprintf(stderr,"testing max packet segments... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* packet that overspans over an entire page */
+    const int packets[]={0,100,130049,259,255,-1};
+    const int *headret[]={head1_6,head2_6,head3_6,head4_6,NULL};
+
+    fprintf(stderr,"testing very large packets... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* test for the libogg 1.1.1 resync in large continuation bug
+       found by Josh Coalson */
+    const int packets[]={0,100,130049,259,255,-1};
+    const int *headret[]={head1_6,head2_6,head3_6,head4_6,NULL};
+
+    fprintf(stderr,"testing continuation resync in very large packets... ");
+    test_pack(packets,headret,100,2,3);
+  }
+
+  {
+    /* term only page.  why not? */
+    const int packets[]={0,100,64770,-1};
+    const int *headret[]={head1_7,head2_7,head3_7,NULL};
+
+    fprintf(stderr,"testing zero data page (1 nil packet)... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+
+
+  {
+    /* build a bunch of pages for testing */
+    unsigned char *data=_ogg_malloc(1024*1024);
+    int pl[]={0, 1,1,98,4079, 1,1,2954,2057, 76,34,912,0,234,1000,1000, 1000,300,-1};
+    int inptr=0,i,j;
+    ogg_page og[5];
+
+    ogg_stream_reset(&os_en);
+
+    for(i=0;pl[i]!=-1;i++){
+      ogg_packet op;
+      int len=pl[i];
+
+      op.packet=data+inptr;
+      op.bytes=len;
+      op.e_o_s=(pl[i+1]<0?1:0);
+      op.granulepos=(i+1)*1000;
+
+      for(j=0;j<len;j++)data[inptr++]=i+j;
+      ogg_stream_packetin(&os_en,&op);
+    }
+
+    _ogg_free(data);
+
+    /* retrieve finished pages */
+    for(i=0;i<5;i++){
+      if(ogg_stream_pageout(&os_en,&og[i])==0){
+        fprintf(stderr,"Too few pages output building sync tests!\n");
+        exit(1);
+      }
+      copy_page(&og[i]);
+    }
+
+    /* Test lost pages on pagein/packetout: no rollback */
+    {
+      ogg_page temp;
+      ogg_packet test;
+
+      fprintf(stderr,"Testing loss of pages... ");
+
+      ogg_sync_reset(&oy);
+      ogg_stream_reset(&os_de);
+      for(i=0;i<5;i++){
+        memcpy(ogg_sync_buffer(&oy,og[i].header_len),og[i].header,
+               og[i].header_len);
+        ogg_sync_wrote(&oy,og[i].header_len);
+        memcpy(ogg_sync_buffer(&oy,og[i].body_len),og[i].body,og[i].body_len);
+        ogg_sync_wrote(&oy,og[i].body_len);
+      }
+
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      /* skip */
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+
+      /* do we get the expected results/packets? */
+
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,0,0,0);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,1,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,2,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,98,3,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,4079,4,5000);
+      if(ogg_stream_packetout(&os_de,&test)!=-1){
+        fprintf(stderr,"Error: loss of page did not return error\n");
+        exit(1);
+      }
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,76,9,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,34,10,-1);
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test lost pages on pagein/packetout: rollback with continuation */
+    {
+      ogg_page temp;
+      ogg_packet test;
+
+      fprintf(stderr,"Testing loss of pages (rollback required)... ");
+
+      ogg_sync_reset(&oy);
+      ogg_stream_reset(&os_de);
+      for(i=0;i<5;i++){
+        memcpy(ogg_sync_buffer(&oy,og[i].header_len),og[i].header,
+               og[i].header_len);
+        ogg_sync_wrote(&oy,og[i].header_len);
+        memcpy(ogg_sync_buffer(&oy,og[i].body_len),og[i].body,og[i].body_len);
+        ogg_sync_wrote(&oy,og[i].body_len);
+      }
+
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      /* skip */
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+
+      /* do we get the expected results/packets? */
+
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,0,0,0);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,1,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,2,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,98,3,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,4079,4,5000);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,5,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,6,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,2954,7,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,2057,8,9000);
+      if(ogg_stream_packetout(&os_de,&test)!=-1){
+        fprintf(stderr,"Error: loss of page did not return error\n");
+        exit(1);
+      }
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,300,17,18000);
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* the rest only test sync */
+    {
+      ogg_page og_de;
+      /* Test fractional page inputs: incomplete capture */
+      fprintf(stderr,"Testing sync on partial inputs... ");
+      ogg_sync_reset(&oy);
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             3);
+      ogg_sync_wrote(&oy,3);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      /* Test fractional page inputs: incomplete fixed header */
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+3,
+             20);
+      ogg_sync_wrote(&oy,20);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      /* Test fractional page inputs: incomplete header */
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+23,
+             5);
+      ogg_sync_wrote(&oy,5);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      /* Test fractional page inputs: incomplete body */
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+28,
+             og[1].header_len-28);
+      ogg_sync_wrote(&oy,og[1].header_len-28);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,1000);
+      ogg_sync_wrote(&oy,1000);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body+1000,
+             og[1].body_len-1000);
+      ogg_sync_wrote(&oy,og[1].body_len-1000);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test fractional page inputs: page + incomplete capture */
+    {
+      ogg_page og_de;
+      fprintf(stderr,"Testing sync on 1+partial inputs... ");
+      ogg_sync_reset(&oy);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             og[1].header_len);
+      ogg_sync_wrote(&oy,og[1].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             20);
+      ogg_sync_wrote(&oy,20);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+20,
+             og[1].header_len-20);
+      ogg_sync_wrote(&oy,og[1].header_len-20);
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test recapture: garbage + page */
+    {
+      ogg_page og_de;
+      fprintf(stderr,"Testing search for capture... ");
+      ogg_sync_reset(&oy);
+
+      /* 'garbage' */
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             og[1].header_len);
+      ogg_sync_wrote(&oy,og[1].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header,
+             20);
+      ogg_sync_wrote(&oy,20);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header+20,
+             og[2].header_len-20);
+      ogg_sync_wrote(&oy,og[2].header_len-20);
+      memcpy(ogg_sync_buffer(&oy,og[2].body_len),og[2].body,
+             og[2].body_len);
+      ogg_sync_wrote(&oy,og[2].body_len);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test recapture: page + garbage + page */
+    {
+      ogg_page og_de;
+      fprintf(stderr,"Testing recapture... ");
+      ogg_sync_reset(&oy);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             og[1].header_len);
+      ogg_sync_wrote(&oy,og[1].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header,
+             og[2].header_len);
+      ogg_sync_wrote(&oy,og[2].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header,
+             og[2].header_len);
+      ogg_sync_wrote(&oy,og[2].header_len);
+
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[2].body_len),og[2].body,
+             og[2].body_len-5);
+      ogg_sync_wrote(&oy,og[2].body_len-5);
+
+      memcpy(ogg_sync_buffer(&oy,og[3].header_len),og[3].header,
+             og[3].header_len);
+      ogg_sync_wrote(&oy,og[3].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[3].body_len),og[3].body,
+             og[3].body_len);
+      ogg_sync_wrote(&oy,og[3].body_len);
+
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Free page data that was previously copied */
+    {
+      for(i=0;i<5;i++){
+        free_page(&og[i]);
+      }
+    }
+  }
+
+  return(0);
+}
+
+#endif

+ 28 - 0
modules/theoraplayer/native/theora/COPYING

@@ -0,0 +1,28 @@
+Copyright (C) 2002-2009 Xiph.org Foundation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 606 - 0
modules/theoraplayer/native/theora/include/theora/codec.h

@@ -0,0 +1,606 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\mainpage
+ *
+ * \section intro Introduction
+ *
+ * This is the documentation for the <tt>libtheora</tt> C API.
+ *
+ * The \c libtheora package is the current reference
+ * implementation for <a href="http://www.theora.org/">Theora</a>, a free,
+ * patent-unencumbered video codec.
+ * Theora is derived from On2's VP3 codec with additional features and
+ *  integration with Ogg multimedia formats by
+ *  <a href="http://www.xiph.org/">the Xiph.Org Foundation</a>.
+ * Complete documentation of the format itself is available in
+ * <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>.
+ *
+ * \section Organization
+ *
+ * The functions documented here are divided between two
+ * separate libraries:
+ * - \c libtheoraenc contains the encoder interface,
+ *   described in \ref encfuncs.
+ * - \c libtheoradec contains the decoder interface,
+ *   described in \ref decfuncs, \n
+ *   and additional \ref basefuncs.
+ *
+ * New code should link to \c libtheoradec. If using encoder
+ * features, it must also link to \c libtheoraenc.
+ *
+ * During initial development, prior to the 1.0 release,
+ * \c libtheora exported a different \ref oldfuncs which
+ * combined both encode and decode functions.
+ * In general, legacy API symbols can be identified
+ * by their \c theora_ or \c OC_ namespace prefixes.
+ * The current API uses \c th_ or \c TH_ instead.
+ *
+ * While deprecated, \c libtheoraenc and \c libtheoradec
+ * together export the legacy API as well as the one documented above.
+ * Likewise, the legacy \c libtheora included with this package
+ * exports the new 1.x API. Older code and build scripts can therefore
+ * be updated independently to the current scheme.
+ */
+
+/**\file
+ * The shared <tt>libtheoradec</tt> and <tt>libtheoraenc</tt> C API.
+ * You don't need to include this directly.*/
+
+#if !defined(_O_THEORA_CODEC_H_)
+# define _O_THEORA_CODEC_H_ (1)
+# include <ogg/ogg.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name Return codes*/
+/*@{*/
+/**An invalid pointer was provided.*/
+#define TH_EFAULT     (-1)
+/**An invalid argument was provided.*/
+#define TH_EINVAL     (-10)
+/**The contents of the header were incomplete, invalid, or unexpected.*/
+#define TH_EBADHEADER (-20)
+/**The header does not belong to a Theora stream.*/
+#define TH_ENOTFORMAT (-21)
+/**The bitstream version is too high.*/
+#define TH_EVERSION   (-22)
+/**The specified function is not implemented.*/
+#define TH_EIMPL      (-23)
+/**There were errors in the video data packet.*/
+#define TH_EBADPACKET (-24)
+/**The decoded packet represented a dropped frame.
+   The player can continue to display the current frame, as the contents of the
+    decoded frame buffer have not changed.*/
+#define TH_DUPFRAME   (1)
+/*@}*/
+
+/**The currently defined color space tags.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Chapter 4, for exact details on the meaning
+ *  of each of these color spaces.*/
+typedef enum{
+  /**The color space was not specified at the encoder.
+      It may be conveyed by an external means.*/
+  TH_CS_UNSPECIFIED,
+  /**A color space designed for NTSC content.*/
+  TH_CS_ITU_REC_470M,
+  /**A color space designed for PAL/SECAM content.*/
+  TH_CS_ITU_REC_470BG,
+  /**The total number of currently defined color spaces.*/
+  TH_CS_NSPACES
+}th_colorspace;
+
+/**The currently defined pixel format tags.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Section 4.4, for details on the precise sample
+ *  locations.*/
+typedef enum{
+  /**Chroma decimation by 2 in both the X and Y directions (4:2:0).
+     The Cb and Cr chroma planes are half the width and half the
+      height of the luma plane.*/
+  TH_PF_420,
+  /**Currently reserved.*/
+  TH_PF_RSVD,
+  /**Chroma decimation by 2 in the X direction (4:2:2).
+     The Cb and Cr chroma planes are half the width of the luma plane, but full
+      height.*/
+  TH_PF_422,
+  /**No chroma decimation (4:4:4).
+     The Cb and Cr chroma planes are full width and full height.*/
+  TH_PF_444,
+  /**The total number of currently defined pixel formats.*/
+  TH_PF_NFORMATS
+}th_pixel_fmt;
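+
+/* Editorial sketch (not part of libtheora): how chroma plane dimensions
+   follow from the pixel format tags above for a frame whose dimensions are
+   already multiples of 16.  TH_PF_RSVD is ignored, and the guard macro is
+   a never-defined assumption of this sketch. */
+#ifdef TH_CODEC_USAGE_EXAMPLE
+static void th_example_chroma_dims(ogg_uint32_t frame_width,
+                                   ogg_uint32_t frame_height,
+                                   th_pixel_fmt fmt,
+                                   ogg_uint32_t *cw,ogg_uint32_t *ch){
+  *cw=(fmt==TH_PF_444)?frame_width  :frame_width/2;  /* 4:2:0, 4:2:2 halve X */
+  *ch=(fmt==TH_PF_420)?frame_height/2:frame_height;  /* only 4:2:0 halves Y  */
+}
+#endif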
+
+
+
+/**A buffer for a single color plane in an uncompressed image.
+ * This contains the image data in a left-to-right, top-down format.
+ * Each row of pixels is stored contiguously in memory, but successive
+ *  rows need not be.
+ * Use \a stride to compute the offset of the next row.
+ * The encoder accepts both positive \a stride values (top-down in memory)
+ *  and negative (bottom-up in memory).
+ * The decoder currently always generates images with positive strides.*/
+typedef struct{
+  /**The width of this plane.*/
+  int            width;
+  /**The height of this plane.*/
+  int            height;
+  /**The offset in bytes between successive rows.*/
+  int            stride;
+  /**A pointer to the beginning of the first row.*/
+  unsigned char *data;
+}th_img_plane;
+
+/**A complete image buffer for an uncompressed frame.
+ * The chroma planes may be decimated by a factor of two in either
+ *  direction, as indicated by th_info#pixel_fmt.
+ * The width and height of the Y' plane must be multiples of 16.
+ * They may need to be cropped for display, using the rectangle
+ *  specified by th_info#pic_x, th_info#pic_y, th_info#pic_width,
+ *  and th_info#pic_height.
+ * All samples are 8 bits.
+ * \note The term YUV often used to describe a colorspace is ambiguous.
+ * The exact parameters of the RGB to YUV conversion process aside, in
+ *  many contexts the U and V channels actually have opposite meanings.
+ * To avoid this confusion, we are explicit: the names of the color
+ *  channels are Y'CbCr, and they appear in that order, always.
+ * The prime symbol denotes that the Y channel is non-linear.
+ * Cb and Cr stand for "Chroma blue" and "Chroma red", respectively.*/
+typedef th_img_plane th_ycbcr_buffer[3];
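+
+/* Editorial sketch (not part of libtheora): walking a decoded
+   th_ycbcr_buffer row by row.  Each plane carries its own width, height
+   and stride, so rows must be stepped by 'stride' rather than 'width'.
+   process_row() is a hypothetical consumer and the guard macro is never
+   defined. */
+#ifdef TH_CODEC_USAGE_EXAMPLE
+extern void process_row(const unsigned char *row,int width); /* hypothetical */
+static void th_example_walk_buffer(th_ycbcr_buffer buf){
+  int pli;
+  for(pli=0;pli<3;pli++){                     /* plane 0=Y', 1=Cb, 2=Cr      */
+    int row;
+    for(row=0;row<buf[pli].height;row++){
+      unsigned char *px=buf[pli].data+row*buf[pli].stride;
+      process_row(px,buf[pli].width);         /* 'width' valid samples/row   */
+    }
+  }
+}
+#endif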
+
+/**Theora bitstream information.
+ * This contains the basic playback parameters for a stream, and corresponds to
+ *  the initial 'info' header packet.
+ * To initialize an encoder, the application fills in this structure and
+ *  passes it to th_encode_alloc().
+ * A default encoding mode is chosen based on the values of the #quality and
+ *  #target_bitrate fields.
+ * On decode, it is filled in by th_decode_headerin(), and then passed to
+ *  th_decode_alloc().
+ *
+ * Encoded Theora frames must be a multiple of 16 in size;
+ *  this is what the #frame_width and #frame_height members represent.
+ * To handle arbitrary picture sizes, a crop rectangle is specified in the
+ *  #pic_x, #pic_y, #pic_width and #pic_height members.
+ *
+ * All frame buffers contain pointers to the full, padded frame.
+ * However, the current encoder <em>will not</em> reference pixels outside of
+ *  the cropped picture region, and the application does not need to fill them
+ *  in.
+ * The decoder <em>will</em> allocate storage for a full frame, but the
+ *  application <em>should not</em> rely on the padding containing sensible
+ *  data.
+ *
+ * It is also generally recommended that the offsets and sizes should still be
+ *  multiples of 2 to avoid chroma sampling shifts when chroma is sub-sampled.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Section 4.4, for more details.
+ *
+ * Frame rate, in frames per second, is stored as a rational fraction, as is
+ *  the pixel aspect ratio.
+ * Note that this refers to the aspect ratio of the individual pixels, not of
+ *  the overall frame itself.
+ * The frame aspect ratio can be computed from pixel aspect ratio using the
+ *  image dimensions.*/
+typedef struct{
+  /**\name Theora version
+   * Bitstream version information.*/
+  /*@{*/
+  unsigned char version_major;
+  unsigned char version_minor;
+  unsigned char version_subminor;
+  /*@}*/
+  /**The encoded frame width.
+   * This must be a multiple of 16, and less than 1048576.*/
+  ogg_uint32_t  frame_width;
+  /**The encoded frame height.
+   * This must be a multiple of 16, and less than 1048576.*/
+  ogg_uint32_t  frame_height;
+  /**The displayed picture width.
+   * This must be no larger than #frame_width.*/
+  ogg_uint32_t  pic_width;
+  /**The displayed picture height.
+   * This must be no larger than #frame_height.*/
+  ogg_uint32_t  pic_height;
+  /**The X offset of the displayed picture.
+   * This must be no larger than #frame_width-#pic_width or 255, whichever is
+   *  smaller.*/
+  ogg_uint32_t  pic_x;
+  /**The Y offset of the displayed picture.
+   * This must be no larger than #frame_height-#pic_height, and
+   *  #frame_height-#pic_height-#pic_y must be no larger than 255.
+   * This slightly funny restriction is due to the fact that the offset is
+   *  specified from the top of the image for consistency with the standard
+   *  graphics left-handed coordinate system used throughout this API, while
+   *  it is stored in the encoded stream as an offset from the bottom.*/
+  ogg_uint32_t  pic_y;
+  /**\name Frame rate
+   * The frame rate, as a fraction.
+   * If either is 0, the frame rate is undefined.*/
+  /*@{*/
+  ogg_uint32_t  fps_numerator;
+  ogg_uint32_t  fps_denominator;
+  /*@}*/
+  /**\name Aspect ratio
+   * The aspect ratio of the pixels.
+   * If either value is zero, the aspect ratio is undefined.
+   * If not specified by any external means, 1:1 should be assumed.
+   * The aspect ratio of the full picture can be computed as
+   * \code
+   *  aspect_numerator*pic_width/(aspect_denominator*pic_height).
+   * \endcode */
+  /*@{*/
+  ogg_uint32_t  aspect_numerator;
+  ogg_uint32_t  aspect_denominator;
+  /*@}*/
+  /**The color space.*/
+  th_colorspace colorspace;
+  /**The pixel format.*/
+  th_pixel_fmt  pixel_fmt;
+  /**The target bit-rate in bits per second.
+     If initializing an encoder with this struct, set this field to a non-zero
+      value to activate CBR encoding by default.*/
+  int           target_bitrate;
+  /**The target quality level.
+     Valid values range from 0 to 63, inclusive, with higher values giving
+      higher quality.
+     If initializing an encoder with this struct, and #target_bitrate is set
+      to zero, VBR encoding at this quality will be activated by default.*/
+  /*Currently this is set so that a qi of 0 corresponds to distortions of 24
+     times the JND, and each increase by 16 halves that value.
+    This gives us fine discrimination at low qualities, yet effective rate
+     control at high qualities.
+    The qi value 63 is special, however.
+    For this, the highest quality, we use one half of a JND for our threshold.
+    Due to the lower bounds placed on allowable quantizers in Theora, we will
+     not actually be able to achieve quality this good, but this should
+     provide as close to visually lossless quality as Theora is capable of.
+    We could lift the quantizer restrictions without breaking VP3.1
+     compatibility, but this would result in quantized coefficients that are
+     too large for the current bitstream to be able to store.
+    We'd have to redesign the token syntax to store these large coefficients,
+     which would make transcoding complex.*/
+  int           quality;
+  /**The amount to shift to extract the last keyframe number from the granule
+   *  position.
+   * This can be at most 31.
+   * th_info_init() will set this to a default value (currently <tt>6</tt>,
+   *  which is good for streaming applications), but you can set it to 0 to
+   *  make every frame a keyframe.
+   * The maximum distance between key frames is
+   *  <tt>1<<#keyframe_granule_shift</tt>.
+   * The keyframe frequency can be more finely controlled with
+   *  #TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE, which can also be adjusted
+   *  during encoding (for example, to force the next frame to be a keyframe),
+   *  but it cannot be set larger than the amount permitted by this field after
+   *  the headers have been output.*/
+  int           keyframe_granule_shift;
+}th_info;
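
As a hedged sketch of how an application might fill this structure before creating an encoder with th_encode_alloc() (declared in theoraenc.h); the 640x480 geometry, frame rate and quality below are example values only.

#include <theora/codec.h>
#include <theora/theoraenc.h>

/* Pad the picture size up to multiples of 16 for the encoded frame and use
   the crop rectangle to describe the real picture. */
static void example_info(th_info *info){
  th_info_init(info);
  info->pic_width =640;
  info->pic_height=480;
  info->frame_width =(info->pic_width +15)&~0xF;
  info->frame_height=(info->pic_height+15)&~0xF;
  info->pic_x=0;
  info->pic_y=0;
  info->fps_numerator=30;
  info->fps_denominator=1;
  info->colorspace=TH_CS_UNSPECIFIED;
  info->pixel_fmt=TH_PF_420;
  info->target_bitrate=0; /* 0 selects VBR encoding...            */
  info->quality=48;       /* ...at this quality level (0 to 63).  */
}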
+
+/**The comment information.
+ *
+ * This structure holds the in-stream metadata corresponding to
+ *  the 'comment' header packet.
+ * The comment header is meant to be used much like someone jotting a quick
+ *  note on the label of a video.
+ * It should be a short, to the point text note that can be more than a couple
+ *  words, but not more than a short paragraph.
+ *
+ * The metadata is stored as a series of (tag, value) pairs, in
+ *  length-encoded string vectors.
+ * The first occurrence of the '=' character delimits the tag and value.
+ * A particular tag may occur more than once, and order is significant.
+ * The character set encoding for the strings is always UTF-8, but the tag
+ *  names are limited to ASCII, and treated as case-insensitive.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Section 6.3.3 for details.
+ *
+ * In filling in this structure, th_decode_headerin() will null-terminate
+ *  the user_comment strings for safety.
+ * However, the bitstream format itself treats them as 8-bit clean vectors,
+ *  possibly containing null characters, and so the length array should be
+ *  treated as their authoritative length.
+ */
+typedef struct th_comment{
+  /**The array of comment string vectors.*/
+  char **user_comments;
+  /**An array of the corresponding length of each vector, in bytes.*/
+  int   *comment_lengths;
+  /**The total number of comment strings.*/
+  int    comments;
+  /**The null-terminated vendor string.
+     This identifies the software used to encode the stream.*/
+  char  *vendor;
+}th_comment;
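
For illustration, a small sketch of walking the comment arrays; it uses comment_lengths rather than strlen(), since the length array is authoritative as noted above. The function name is hypothetical.

#include <stdio.h>
#include <theora/codec.h>

/* Print the vendor string and every comment vector, byte-exact. */
static void dump_comments(const th_comment *tc){
  int ci;
  printf("vendor: %s\n",tc->vendor);
  for(ci=0;ci<tc->comments;ci++){
    fwrite(tc->user_comments[ci],1,(size_t)tc->comment_lengths[ci],stdout);
    fputc('\n',stdout);
  }
}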
+
+
+
+/**A single base matrix.*/
+typedef unsigned char th_quant_base[64];
+
+/**A set of \a qi ranges.*/
+typedef struct{
+  /**The number of ranges in the set.*/
+  int                  nranges;
+  /**The size of each of the #nranges ranges.
+     These must sum to 63.*/
+  const int           *sizes;
+  /**#nranges <tt>+1</tt> base matrices.
+     Matrices \a i and <tt>i+1</tt> form the endpoints of range \a i.*/
+  const th_quant_base *base_matrices;
+}th_quant_ranges;
+
+/**A complete set of quantization parameters.
+   The quantizer for each coefficient is calculated as:
+   \code
+    Q=MAX(MIN(qmin[qti][ci!=0],scale[ci!=0][qi]*base[qti][pli][qi][ci]/100),
+     1024).
+   \endcode
+
+   \a qti is the quantization type index: 0 for intra, 1 for inter.
+   <tt>ci!=0</tt> is 0 for the DC coefficient and 1 for AC coefficients.
+   \a qi is the quality index, ranging between 0 (low quality) and 63 (high
+    quality).
+   \a pli is the color plane index: 0 for Y', 1 for Cb, 2 for Cr.
+   \a ci is the DCT coefficient index.
+   Coefficient indices correspond to the normal 2D DCT block
+    ordering--row-major with low frequencies first--\em not zig-zag order.
+
+   Minimum quantizers are constant, and are given by:
+   \code
+   qmin[2][2]={{4,2},{8,4}}.
+   \endcode
+
+   Parameters that can be stored in the bitstream are as follows:
+    - The two scale matrices ac_scale and dc_scale.
+      \code
+      scale[2][64]={dc_scale,ac_scale}.
+      \endcode
+    - The base matrices for each \a qi, \a qti and \a pli (up to 384 in all).
+      In order to avoid storing a full 384 base matrices, only a sparse set of
+       matrices are stored, and the rest are linearly interpolated.
+      This is done as follows.
+      For each \a qti and \a pli, a series of \a n \a qi ranges is defined.
+      The size of each \a qi range can vary arbitrarily, but they must sum to
+       63.
+      Then, <tt>n+1</tt> matrices are specified, one for each endpoint of the
+       ranges.
+      For interpolation purposes, each range's endpoints are the first \a qi
+       value it contains and one past the last \a qi value it contains.
+      Fractional values are rounded to the nearest integer, with ties rounded
+       away from zero.
+
+      Base matrices are stored by reference, so if the same matrices are used
+       multiple times, they will only appear once in the bitstream.
+      The bitstream is also capable of omitting an entire set of ranges and
+       its associated matrices if they are the same as either the previous
+       set (indexed in row-major order) or if the inter set is the same as the
+       intra set.
+
+    - Loop filter limit values.
+      The same limits are used for the loop filter in all color planes, despite
+       potentially differing levels of quantization in each.
+
+   For the current encoder, <tt>scale[ci!=0][qi]</tt> must be no greater
+    than <tt>scale[ci!=0][qi-1]</tt> and <tt>base[qti][pli][qi][ci]</tt> must
+    be no greater than <tt>base[qti][pli][qi-1][ci]</tt>.
+   These two conditions ensure that the actual quantizer for a given \a qti,
+    \a pli, and \a ci does not increase as \a qi increases.
+   This is not required by the decoder.*/
+typedef struct{
+  /**The DC scaling factors.*/
+  ogg_uint16_t    dc_scale[64];
+  /**The AC scaling factors.*/
+  ogg_uint16_t    ac_scale[64];
+  /**The loop filter limit values.*/
+  unsigned char   loop_filter_limits[64];
+  /**The \a qi ranges for each \a ci and \a pli.*/
+  th_quant_ranges qi_ranges[2][3];
+}th_quant_info;
+
+
+
+/**The number of Huffman tables used by Theora.*/
+#define TH_NHUFFMAN_TABLES (80)
+/**The number of DCT token values in each table.*/
+#define TH_NDCT_TOKENS     (32)
+
+/**A Huffman code for a Theora DCT token.
+ * Each set of Huffman codes in a given table must form a complete, prefix-free
+ *  code.
+ * There is no requirement that all the tokens in a table have a valid code,
+ *  but the current encoder is not optimized to take advantage of this.
+ * If each of the five groups of 16 tables does not contain at least one table
+ *  with a code for every token, then the encoder may fail to encode certain
+ *  frames.
+ * The complete table in the first group of 16 does not have to be in the same
+ *  place as the complete table in the other groups, but the complete tables in
+ *  the remaining four groups must all be in the same place.*/
+typedef struct{
+  /**The bit pattern for the code, with the LSbit of the pattern aligned in
+   *   the LSbit of the word.*/
+  ogg_uint32_t pattern;
+  /**The number of bits in the code.
+   * This must be between 0 and 32, inclusive.*/
+  int          nbits;
+}th_huff_code;
+
+
+
+/**\defgroup basefuncs Functions Shared by Encode and Decode*/
+/*@{*/
+/**\name Basic shared functions
+ * These functions return information about the library itself,
+ * or provide high-level information about codec state
+ * and packet type.
+ *
+ * You must link to \c libtheoradec if you use any of the
+ * functions in this section.*/
+/*@{*/
+/**Retrieves a human-readable string to identify the library vendor and
+ *  version.
+ * \return the version string.*/
+extern const char *th_version_string(void);
+/**Retrieves the library version number.
+ * This is the highest bitstream version that the encoder library will produce,
+ *  or that the decoder library can decode.
+ * This number is composed of a 16-bit major version, an 8-bit minor version
+ * and an 8-bit sub-version, packed as follows:
+ * \code
+ * (VERSION_MAJOR<<16)+(VERSION_MINOR<<8)+(VERSION_SUBMINOR)
+ * \endcode
+ * \return the version number.*/
+extern ogg_uint32_t th_version_number(void);
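
A brief usage sketch: unpacking the value returned by th_version_number() into its three components, following the layout above.

#include <stdio.h>
#include <theora/codec.h>

static void print_version(void){
  ogg_uint32_t v=th_version_number();
  printf("%s (bitstream %u.%u.%u)\n",th_version_string(),
   (unsigned)(v>>16&0xFFFF),(unsigned)(v>>8&0xFF),(unsigned)(v&0xFF));
}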
+/**Converts a granule position to an absolute frame index, starting at
+ *  <tt>0</tt>.
+ * The granule position is interpreted in the context of a given
+ *  #th_enc_ctx or #th_dec_ctx handle (either will suffice).
+ * \param _encdec  A previously allocated #th_enc_ctx or #th_dec_ctx
+ *                  handle.
+ * \param _granpos The granule position to convert.
+ * \returns The absolute frame index corresponding to \a _granpos.
+ * \retval -1 The given granule position was invalid (i.e. negative).*/
+extern ogg_int64_t th_granule_frame(void *_encdec,ogg_int64_t _granpos);
+/**Converts a granule position to an absolute time in seconds.
+ * The granule position is interpreted in the context of a given
+ *  #th_enc_ctx or #th_dec_ctx handle (either will suffice).
+ * \param _encdec  A previously allocated #th_enc_ctx or #th_dec_ctx
+ *                  handle.
+ * \param _granpos The granule position to convert.
+ * \return The absolute time in seconds corresponding to \a _granpos.
+ *         This is the "end time" for the frame, or the latest time it should
+ *          be displayed.
+ *         It is not the presentation time.
+ * \retval -1 The given granule position was invalid (i.e. negative).*/
+extern double th_granule_time(void *_encdec,ogg_int64_t _granpos);
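
A small usage sketch of the two granule conversion functions, assuming the caller already holds an encoder or decoder handle and a granule position returned by it; the function name is hypothetical.

#include <stdio.h>
#include <theora/codec.h>

/* encdec may be either a th_enc_ctx or a th_dec_ctx. */
static void report_position(void *encdec,ogg_int64_t granpos){
  ogg_int64_t frame=th_granule_frame(encdec,granpos);
  if(frame<0)printf("invalid granule position\n");
  else printf("frame %ld ends at %.3f s\n",(long)frame,
   th_granule_time(encdec,granpos));
}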
+/**Determines whether a Theora packet is a header or not.
+ * This function does no verification beyond checking the packet type bit, so
+ *  it should not be used for bitstream identification; use
+ *  th_decode_headerin() for that.
+ * As per the Theora specification, an empty (0-byte) packet is treated as a
+ *  data packet (a delta frame with no coded blocks).
+ * \param _op An <tt>ogg_packet</tt> containing encoded Theora data.
+ * \retval 1 The packet is a header packet
+ * \retval 0 The packet is a video data packet.*/
+extern int th_packet_isheader(ogg_packet *_op);
+/**Determines whether a theora packet is a key frame or not.
+ * This function does no verification beyond checking the packet type and
+ *  key frame bits, so it should not be used for bitstream identification; use
+ *  th_decode_headerin() for that.
+ * As per the Theora specification, an empty (0-byte) packet is treated as a
+ *  delta frame (with no coded blocks).
+ * \param _op An <tt>ogg_packet</tt> containing encoded Theora data.
+ * \retval 1  The packet contains a key frame.
+ * \retval 0  The packet contains a delta frame.
+ * \retval -1 The packet is not a video data packet.*/
+extern int th_packet_iskeyframe(ogg_packet *_op);
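
For illustration, a minimal classifier built on the two packet predicates above; remember that neither performs real bitstream validation.

#include <theora/codec.h>

/* Returns 2 for a header packet, 1 for a key frame, 0 for a delta frame. */
static int classify_packet(ogg_packet *op){
  if(th_packet_isheader(op))return 2;
  return th_packet_iskeyframe(op)==1?1:0;
}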
+/*@}*/
+
+
+/**\name Functions for manipulating header data
+ * These functions manipulate the #th_info and #th_comment structures
+ * which describe video parameters and key-value metadata, respectively.
+ *
+ * You must link to \c libtheoradec if you use any of the
+ * functions in this section.*/
+/*@{*/
+/**Initializes a th_info structure.
+ * This should be called on a freshly allocated #th_info structure before
+ *  attempting to use it.
+ * \param _info The #th_info struct to initialize.*/
+extern void th_info_init(th_info *_info);
+/**Clears a #th_info structure.
+ * This should be called on a #th_info structure after it is no longer
+ *  needed.
+ * \param _info The #th_info struct to clear.*/
+extern void th_info_clear(th_info *_info);
+
+/**Initialize a #th_comment structure.
+ * This should be called on a freshly allocated #th_comment structure
+ *  before attempting to use it.
+ * \param _tc The #th_comment struct to initialize.*/
+extern void th_comment_init(th_comment *_tc);
+/**Add a comment to an initialized #th_comment structure.
+ * \note Neither th_comment_add() nor th_comment_add_tag() support
+ *  comments containing null values, although the bitstream format does
+ *  support them.
+ * To add such comments you will need to manipulate the #th_comment
+ *  structure directly.
+ * \param _tc      The #th_comment struct to add the comment to.
+ * \param _comment Must be a null-terminated UTF-8 string containing the
+ *                  comment in "TAG=the value" form.*/
+extern void th_comment_add(th_comment *_tc,const char *_comment);
+/**Add a comment to an initialized #th_comment structure.
+ * \note Neither th_comment_add() nor th_comment_add_tag() support
+ *  comments containing null values, although the bitstream format does
+ *  support them.
+ * To add such comments you will need to manipulate the #th_comment
+ *  structure directly.
+ * \param _tc  The #th_comment struct to add the comment to.
+ * \param _tag A null-terminated string containing the tag  associated with
+ *              the comment.
+ * \param _val The corresponding value as a null-terminated string.*/
+extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
+ const char *_val);
+/**Look up a comment value by its tag.
+ * \param _tc    An initialized #th_comment structure.
+ * \param _tag   The tag to look up.
+ * \param _count The instance of the tag.
+ *               The same tag can appear multiple times, each with a distinct
+ *                value, so an index is required to retrieve them all.
+ *               The order in which these values appear is significant and
+ *                should be preserved.
+ *               Use th_comment_query_count() to get the legal range for
+ *                the \a _count parameter.
+ * \return A pointer to the queried tag's value.
+ *         This points directly to data in the #th_comment structure.
+ *         It should not be modified or freed by the application, and
+ *          modifications to the structure may invalidate the pointer.
+ * \retval NULL If no matching tag is found.*/
+extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count);
+/**Look up the number of instances of a tag.
+ * Call this first when querying for a specific tag and then iterate over the
+ *  number of instances with separate calls to th_comment_query() to
+ *  retrieve all the values for that tag in order.
+ * \param _tc    An initialized #th_comment structure.
+ * \param _tag   The tag to look up.
+ * \return The number of instances of this particular tag.*/
+extern int th_comment_query_count(th_comment *_tc,const char *_tag);
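
A short sketch of the intended query pattern, counting first and then iterating; the decoder null-terminates the values, so printing them with %s is safe in that case. The function name is hypothetical.

#include <stdio.h>
#include <theora/codec.h>

/* Print every value stored under a tag, in stream order. */
static void print_tag(th_comment *tc,const char *tag){
  int n=th_comment_query_count(tc,tag);
  int i;
  for(i=0;i<n;i++)printf("%s=%s\n",tag,th_comment_query(tc,tag,i));
}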
+/**Clears a #th_comment structure.
+ * This should be called on a #th_comment structure after it is no longer
+ *  needed.
+ * It will free all memory used by the structure members.
+ * \param _tc The #th_comment struct to clear.*/
+extern void th_comment_clear(th_comment *_tc);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

+ 786 - 0
modules/theoraplayer/native/theora/include/theora/theora.h

@@ -0,0 +1,786 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.17 2003/12/06 18:06:19 arc Exp $
+
+ ********************************************************************/
+
+#ifndef _O_THEORA_H_
+#define _O_THEORA_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+#include <stddef.h>	/* for size_t */
+
+#include <ogg/ogg.h>
+
+/** \file
+ * The libtheora pre-1.0 legacy C API.
+ *
+ * \ingroup oldfuncs
+ *
+ * \section intro Introduction
+ *
+ * This is the documentation for the libtheora legacy C API, declared in
+ * the theora.h header, which describes the old interface used before
+ * the 1.0 release. This API was widely deployed for several years and
+ * remains supported, but for new code we recommend the cleaner API
+ * declared in theoradec.h and theoraenc.h.
+ *
+ * libtheora is the reference implementation for
+ * <a href="http://www.theora.org/">Theora</a>, a free video codec.
+ * Theora is derived from On2's VP3 codec with improved integration with
+ * Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
+ *
+ * \section overview Overview
+ *
+ * This library will both decode and encode theora packets to/from raw YUV
+ * frames.  In either case, the packets will most likely either come from or
+ * need to be embedded in an Ogg stream.  Use
+ * <a href="http://xiph.org/ogg/">libogg</a> or
+ * <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
+ * to extract/package these packets.
+ *
+ * \section decoding Decoding Process
+ *
+ * Decoding can be separated into the following steps:
+ * -# initialise theora_info and theora_comment structures using
+ *    theora_info_init() and theora_comment_init():
+ \verbatim
+ theora_info     info;
+ theora_comment  comment;
+
+ theora_info_init(&info);
+ theora_comment_init(&comment);
+ \endverbatim
+ * -# retrieve header packets from Ogg stream (there should be 3) and decode
+ *    into theora_info and theora_comment structures using
+ *    theora_decode_header().  See \ref identification for more information on
+ *    identifying which packets are theora packets.
+ \verbatim
+ int i;
+ for (i = 0; i < 3; i++)
+ {
+   (get a theora packet "op" from the Ogg stream)
+   theora_decode_header(&info, &comment, op);
+ }
+ \endverbatim
+ * -# initialise the decoder based on the information retrieved into the
+ *    theora_info struct by theora_decode_header().  You will need a
+ *    theora_state struct.
+ \verbatim
+ theora_state state;
+
+ theora_decode_init(&state, &info);
+ \endverbatim
+ * -# pass in packets and retrieve decoded frames!  See the yuv_buffer
+ *    documentation for information on how to retrieve raw YUV data.
+ \verbatim
+ yuv_buffer buffer;
+ while (last packet was not e_o_s) {
+   (get a theora packet "op" from the Ogg stream)
+   theora_decode_packetin(&state, op);
+   theora_decode_YUVout(&state, &buffer);
+ }
+ \endverbatim
+ *
+ *
+ * \subsection identification Identifying Theora Packets
+ *
+ * All streams inside an Ogg file have a unique serial_no attached to the
+ * stream.  Typically, you will want to
+ *  - retrieve the serial_no for each b_o_s (beginning of stream) page
+ *    encountered within the Ogg file;
+ *  - test the first (only) packet on that page to determine if it is a theora
+ *    packet;
+ *  - once you have found a theora b_o_s page then use the retrieved serial_no
+ *    to identify future packets belonging to the same theora stream.
+ *
+ * Note that you \e cannot use theora_packet_isheader() to determine if a
+ * packet is a theora packet or not, as this function does not perform any
+ * checking beyond whether a header bit is present.  Instead, use the
+ * theora_decode_header() function and check the return value; or examine the
+ * header bytes at the beginning of the Ogg page.
+ */
+
+
+/** \defgroup oldfuncs Legacy pre-1.0 C API */
+/*  @{ */
+
+/**
+ * A YUV buffer for passing uncompressed frames to and from the codec.
+ * This holds a Y'CbCr frame in planar format. The CbCr planes can be
+ * subsampled and have their own separate dimensions and row stride
+ * offsets. Note that the strides may be negative in some
+ * configurations. For theora the width and height of the largest plane
+ * must be a multiple of 16. The actual meaningful picture size and
+ * offset are stored in the theora_info structure; frames returned by
+ * the decoder may need to be cropped for display.
+ *
+ * All samples are 8 bits. Within each plane samples are ordered by
+ * row from the top of the frame to the bottom. Within each row samples
+ * are ordered from left to right.
+ *
+ * During decode, the yuv_buffer struct is allocated by the user, but all
+ * fields (including luma and chroma pointers) are filled by the library.
+ * These pointers address library-internal memory and their contents should
+ * not be modified.
+ *
+ * Conversely, during encode the user allocates the struct and fills out all
+ * fields.  The user also manages the data addressed by the luma and chroma
+ * pointers.  See the encoder_example.c and dump_video.c example files in
+ * theora/examples/ for more information.
+ */
+typedef struct {
+    int   y_width;      /**< Width of the Y' luminance plane */
+    int   y_height;     /**< Height of the luminance plane */
+    int   y_stride;     /**< Offset in bytes between successive rows */
+
+    int   uv_width;     /**< Width of the Cb and Cr chroma planes */
+    int   uv_height;    /**< Height of the chroma planes */
+    int   uv_stride;    /**< Offset between successive chroma rows */
+    unsigned char *y;   /**< Pointer to start of luminance data */
+    unsigned char *u;   /**< Pointer to start of Cb data */
+    unsigned char *v;   /**< Pointer to start of Cr data */
+
+} yuv_buffer;
+
+/**
+ * A Colorspace.
+ */
+typedef enum {
+  OC_CS_UNSPECIFIED,    /**< The colorspace is unknown or unspecified */
+  OC_CS_ITU_REC_470M,   /**< This is the best option for 'NTSC' content */
+  OC_CS_ITU_REC_470BG,  /**< This is the best option for 'PAL' content */
+  OC_CS_NSPACES         /**< This marks the end of the defined colorspaces */
+} theora_colorspace;
+
+/**
+ * A Chroma subsampling
+ *
+ * These enumerate the available chroma subsampling options supported
+ * by the theora format. See Section 4.4 of the specification for
+ * exact definitions.
+ */
+typedef enum {
+  OC_PF_420,    /**< Chroma subsampling by 2 in each direction (4:2:0) */
+  OC_PF_RSVD,   /**< Reserved value */
+  OC_PF_422,    /**< Horizontal chroma subsampling by 2 (4:2:2) */
+  OC_PF_444     /**< No chroma subsampling at all (4:4:4) */
+} theora_pixelformat;
+
+/**
+ * Theora bitstream info.
+ * Contains the basic playback parameters for a stream,
+ * corresponding to the initial 'info' header packet.
+ *
+ * Encoded theora frames must be a multiple of 16 in width and height.
+ * To handle other frame sizes, a crop rectangle is specified in
+ * frame_height and frame_width, offset_x and offset_y. The offset
+ * and size should still be a multiple of 2 to avoid chroma sampling
+ * shifts. Offset values in this structure are measured from the
+ * upper left of the image.
+ *
+ * Frame rate, in frames per second, is stored as a rational
+ * fraction. Aspect ratio is also stored as a rational fraction, and
+ * refers to the aspect ratio of the frame pixels, not of the
+ * overall frame itself.
+ *
+ * See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
+ * examples/encoder_example.c</a> for usage examples of the
+ * other parameters and good default settings for the encoder parameters.
+ */
+typedef struct {
+  ogg_uint32_t  width;		/**< encoded frame width  */
+  ogg_uint32_t  height;		/**< encoded frame height */
+  ogg_uint32_t  frame_width;	/**< display frame width  */
+  ogg_uint32_t  frame_height;	/**< display frame height */
+  ogg_uint32_t  offset_x;	/**< horizontal offset of the displayed frame */
+  ogg_uint32_t  offset_y;	/**< vertical offset of the displayed frame */
+  ogg_uint32_t  fps_numerator;	    /**< frame rate numerator **/
+  ogg_uint32_t  fps_denominator;    /**< frame rate denominator **/
+  ogg_uint32_t  aspect_numerator;   /**< pixel aspect ratio numerator */
+  ogg_uint32_t  aspect_denominator; /**< pixel aspect ratio denominator */
+  theora_colorspace colorspace;	    /**< colorspace */
+  int           target_bitrate;	    /**< nominal bitrate in bits per second */
+  int           quality;  /**< Nominal quality setting, 0-63 */
+  int           quick_p;  /**< Quick encode/decode */
+
+  /* decode only */
+  unsigned char version_major;
+  unsigned char version_minor;
+  unsigned char version_subminor;
+
+  void *codec_setup;
+
+  /* encode only */
+  int           dropframes_p;
+  int           keyframe_auto_p;
+  ogg_uint32_t  keyframe_frequency;
+  ogg_uint32_t  keyframe_frequency_force;  /* also used for decode init to
+                                              get granpos shift correct */
+  ogg_uint32_t  keyframe_data_target_bitrate;
+  ogg_int32_t   keyframe_auto_threshold;
+  ogg_uint32_t  keyframe_mindistance;
+  ogg_int32_t   noise_sensitivity;
+  ogg_int32_t   sharpness;
+
+  theora_pixelformat pixelformat;	/**< chroma subsampling mode to expect */
+
+} theora_info;
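
To make the naming above concrete, a hedged sketch of filling the legacy struct for encoding; note that frame_width/frame_height here are the displayed size, the opposite of the meaning those names carry in the 1.0 th_info struct. The 480x272 geometry and other settings are example values only.

#include <theora/theora.h>

static void example_legacy_info(theora_info *ti){
  theora_info_init(ti);
  ti->frame_width =480;                  /* displayed picture size      */
  ti->frame_height=272;
  ti->width =(ti->frame_width +15)&~0xF; /* encoded size, padded to 16s */
  ti->height=(ti->frame_height+15)&~0xF;
  ti->offset_x=0;
  ti->offset_y=0;
  ti->fps_numerator=25;
  ti->fps_denominator=1;
  ti->pixelformat=OC_PF_420;
  ti->colorspace=OC_CS_UNSPECIFIED;
  ti->target_bitrate=0;
  ti->quality=40;                        /* 0-63 */
}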
+
+/** Codec internal state and context.
+ */
+typedef struct{
+  theora_info *i;
+  ogg_int64_t granulepos;
+
+  void *internal_encode;
+  void *internal_decode;
+
+} theora_state;
+
+/**
+ * Comment header metadata.
+ *
+ * This structure holds the in-stream metadata corresponding to
+ * the 'comment' header packet.
+ *
+ * Meta data is stored as a series of (tag, value) pairs, in
+ * length-encoded string vectors. The first occurrence of the
+ * '=' character delimits the tag and value. A particular tag
+ * may occur more than once. The character set encoding for
+ * the strings is always UTF-8, but the tag names are limited
+ * to case-insensitive ASCII. See the spec for details.
+ *
+ * In filling in this structure, theora_decode_header() will
+ * null-terminate the user_comment strings for safety. However,
+ * the bitstream format itself treats them as 8-bit clean,
+ * and so the length array should be treated as authoritative
+ * for their length.
+ */
+typedef struct theora_comment{
+  char **user_comments;         /**< An array of comment string vectors */
+  int   *comment_lengths;       /**< An array of corresponding string vector lengths in bytes */
+  int    comments;              /**< The total number of comment string vectors */
+  char  *vendor;                /**< The vendor string identifying the encoder, null terminated */
+
+} theora_comment;
+
+
+/**\name theora_control() codes */
+/* \anchor decctlcodes_old
+ * These are the available request codes for theora_control()
+ * when called with a decoder instance.
+ * By convention decoder control codes are odd, to distinguish
+ * them from \ref encctlcodes_old "encoder control codes" which
+ * are even.
+ *
+ * Note that since the 1.0 release, both the legacy and the final
+ * implementation accept all the same control codes, but only the
+ * final API declares the newer codes.
+ *
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+
+/*@{*/
+
+/**Get the maximum post-processing level.
+ * The decoder supports a post-processing filter that can improve
+ * the appearance of the decoded images. This returns the highest
+ * level setting for this post-processor, corresponding to maximum
+ * improvement and computational expense.
+ */
+#define TH_DECCTL_GET_PPLEVEL_MAX (1)
+
+/**Set the post-processing level.
+ * Sets the level of post-processing to use when decoding the
+ * compressed stream. This must be a value between zero (off)
+ * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
+ */
+#define TH_DECCTL_SET_PPLEVEL (3)
+
+/**Sets the maximum distance between key frames.
+ * This can be changed during an encode, but will be bounded by
+ *  <tt>1<<th_info#keyframe_granule_shift</tt>.
+ * If it is set before encoding begins, th_info#keyframe_granule_shift will
+ *  be enlarged appropriately.
+ *
+ * \param[in]  buf <tt>ogg_uint32_t</tt>: The maximum distance between key
+ *                   frames.
+ * \param[out] buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
+
+/**Set the granule position.
+ * Call this after a seek, to update the internal granulepos
+ * in the decoder, to ensure that subsequent frames are marked
+ * properly. If you track timestamps yourself and do not use
+ * the granule position returned by the decoder, then you do
+ * not need to use this control.
+ */
+#define TH_DECCTL_SET_GRANPOS (5)
+
+/**\anchor encctlcodes_old */
+
+/**Sets the quantization parameters to use.
+ * The parameters are copied, not stored by reference, so they can be freed
+ *  after this call.
+ * <tt>NULL</tt> may be specified to revert to the default parameters.
+ *
+ * \param[in] buf #th_quant_info
+ * \retval OC_FAULT  \a theora_state is <tt>NULL</tt>.
+ * \retval OC_EINVAL Encoding has already begun, the quantization parameters
+ *                    are not acceptable to this version of the encoder,
+ *                    \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
+ *                    or \a buf is non-<tt>NULL</tt> and \a buf_sz is
+ *                    not <tt>sizeof(#th_quant_info)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUANT_PARAMS (2)
+
+/**Disables any encoder features that would prevent lossless transcoding back
+ *  to VP3.
+ * This primarily means disabling block-level QI values and not using 4MV mode
+ *  when any of the luma blocks in a macro block are not coded.
+ * It also includes using the VP3 quantization tables and Huffman codes; if you
+ *  set them explicitly after calling this function, the resulting stream will
+ *  not be VP3-compatible.
+ * If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
+ *  material, or when using a picture region smaller than the full frame (e.g.
+ *  a non-multiple-of-16 width or height), then non-VP3 bitstream features will
+ *  still be disabled, but the stream will still not be VP3-compatible, as VP3
+ *  was not capable of encoding such formats.
+ * If you call this after encoding has already begun, then the quantization
+ *  tables and codebooks cannot be changed, but the frame-level features will
+ *  be enabled or disabled as requested.
+ *
+ * \param[in]  buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
+ *                   or 0 to disable it (the default).
+ * \param[out] buf <tt>int</tt>: 1 if all bitstream features required for
+ *                   VP3-compatibility could be set, and 0 otherwise.
+ *                  The latter will be returned if the pixel format is not
+ *                   4:2:0, the picture region is smaller than the full frame,
+ *                   or if encoding has begun, preventing the quantization
+ *                   tables and codebooks from being set.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
+
+/**Gets the maximum speed level.
+ * Higher speed levels favor quicker encoding over better quality per bit.
+ * Depending on the encoding mode, and the internal algorithms used, quality
+ *  may actually improve, but in this case bitrate will also likely increase.
+ * In any case, overall rate/distortion performance will probably decrease.
+ * The maximum value, and the meaning of each value, may change depending on
+ *  the current encoding mode (VBR vs. CQI, etc.).
+ *
+ * \param[out] buf int: The maximum encoding speed level.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
+
+/**Sets the speed level.
+ * By default a speed value of 1 is used.
+ *
+ * \param[in] buf int: The new encoding speed level.
+ *                      0 is slowest, larger values use less CPU.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    encoding speed level is out of bounds.
+ *                   The maximum encoding speed level may be
+ *                    implementation- and encoding mode-specific, and can be
+ *                    obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ * \retval OC_IMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_SPLEVEL (14)
+
+/*@}*/
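
As a usage sketch of the two speed-level codes above via theora_control() (declared near the end of this header); error handling beyond propagating the return value is omitted, and the function name is hypothetical.

#include <theora/theora.h>

/* Query the maximum speed level and select it, trading quality for CPU. */
static int use_fastest_speed(theora_state *ts){
  int max_speed=0;
  int ret=theora_control(ts,TH_ENCCTL_GET_SPLEVEL_MAX,
   &max_speed,sizeof(max_speed));
  if(ret<0)return ret;
  return theora_control(ts,TH_ENCCTL_SET_SPLEVEL,
   &max_speed,sizeof(max_speed));
}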
+
+#define OC_FAULT       -1       /**< General failure */
+#define OC_EINVAL      -10      /**< Library encountered invalid internal data */
+#define OC_DISABLED    -11      /**< Requested action is disabled */
+#define OC_BADHEADER   -20      /**< Header packet was corrupt/invalid */
+#define OC_NOTFORMAT   -21      /**< Packet is not a theora packet */
+#define OC_VERSION     -22      /**< Bitstream version is not handled */
+#define OC_IMPL        -23      /**< Feature or action not implemented */
+#define OC_BADPACKET   -24      /**< Packet is corrupt */
+#define OC_NEWPACKET   -25      /**< Packet is an (ignorable) unhandled extension */
+#define OC_DUPFRAME    1        /**< Packet is a dropped frame */
+
+/**
+ * Retrieve a human-readable string to identify the encoder vendor and version.
+ * \returns A version string.
+ */
+extern const char *theora_version_string(void);
+
+/**
+ * Retrieve a 32-bit version number.
+ * This number is composed of a 16-bit major version, an 8-bit minor version
+ * and an 8-bit sub-version, packed as follows:
+<pre>
+   (VERSION_MAJOR<<16) + (VERSION_MINOR<<8) + (VERSION_SUB)
+</pre>
+ * \returns The version number.
+ */
+extern ogg_uint32_t theora_version_number(void);
+
+/**
+ * Initialize the theora encoder.
+ * \param th The theora_state handle to initialize for encoding.
+ * \param ti A theora_info struct filled with the desired encoding parameters.
+ * \retval 0 Success
+ */
+extern int theora_encode_init(theora_state *th, theora_info *ti);
+
+/**
+ * Submit a YUV buffer to the theora encoder.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param yuv A buffer of YUV data to encode.  Note that both the yuv_buffer
+ *            struct and the luma/chroma buffers within should be allocated by
+ *            the user.
+ * \retval OC_EINVAL Encoder is not ready, or is finished.
+ * \retval -1 The size of the given frame differs from those previously input
+ * \retval 0 Success
+ */
+extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
+
+/**
+ * Request the next packet of encoded video.
+ * The encoded data is placed in a user-provided ogg_packet structure.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param last_p whether this is the last packet the encoder should produce.
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to encoded
+ *           data. The memory for the encoded data is owned by libtheora.
+ * \retval 0 No internal storage exists OR no packet is ready
+ * \retval -1 The encoding process has completed
+ * \retval 1 Success
+ */
+extern int theora_encode_packetout( theora_state *t, int last_p,
+                                    ogg_packet *op);
+
+/**
+ * Request a packet containing the initial header.
+ * A pointer to the header data is placed in a user-provided ogg_packet
+ * structure.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to the header
+ *           data. The memory for the header data is owned by libtheora.
+ * \retval 0 Success
+ */
+extern int theora_encode_header(theora_state *t, ogg_packet *op);
+
+/**
+ * Request a comment header packet from provided metadata.
+ * A pointer to the comment data is placed in a user-provided ogg_packet
+ * structure.
+ * \param tc A theora_comment structure filled with the desired metadata
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to the encoded
+ *           comment data. The memory for the comment data is owned by
+ *           the application, and must be freed by it using _ogg_free().
+ *           On some systems (such as Windows when using dynamic linking), this
+ *           may mean the free is executed in a different module from the
+ *           malloc, which will crash; there is no way to free this memory on
+ *           such systems.
+ * \retval 0 Success
+ */
+extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
+
+/**
+ * Request a packet containing the codebook tables for the stream.
+ * A pointer to the codebook data is placed in a user-provided ogg_packet
+ * structure.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to the codebook
+ *           data. The memory for the header data is owned by libtheora.
+ * \retval 0 Success
+ */
+extern int theora_encode_tables(theora_state *t, ogg_packet *op);
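
The three header packets above must precede any video data. A hedged sketch of handing them to libogg, assuming an already-initialised ogg_stream_state; error handling is omitted and the function name is hypothetical.

#include <ogg/ogg.h>
#include <theora/theora.h>

/* Emit the info, comment and table headers, in that order. */
static void write_headers(theora_state *ts,theora_comment *tc,
 ogg_stream_state *os){
  ogg_packet op;
  theora_encode_header(ts,&op);
  ogg_stream_packetin(os,&op);
  theora_encode_comment(tc,&op);
  ogg_stream_packetin(os,&op);
  /* The comment packet's storage is owned by the application; see the
     caveat in theora_encode_comment() above. */
  theora_encode_tables(ts,&op);
  ogg_stream_packetin(os,&op);
}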
+
+/**
+ * Decode an Ogg packet, with the expectation that the packet contains
+ * an initial header, comment data or codebook tables.
+ *
+ * \param ci A theora_info structure to fill. This must have been previously
+ *           initialized with theora_info_init(). If \a op contains an initial
+ *           header, theora_decode_header() will fill \a ci with the
+ *           parsed header values. If \a op contains codebook tables,
+ *           theora_decode_header() will parse these and attach an internal
+ *           representation to \a ci->codec_setup.
+ * \param cc A theora_comment structure to fill. If \a op contains comment
+ *           data, theora_decode_header() will fill \a cc with the parsed
+ *           comments.
+ * \param op An ogg_packet structure which you expect contains an initial
+ *           header, comment data or codebook tables.
+ *
+ * \retval OC_BADHEADER \a op is NULL; OR the first byte of \a op->packet
+ *                      has the signature of an initial packet, but op is
+ *                      not a b_o_s packet; OR this packet has the signature
+ *                      of an initial header packet, but an initial header
+ *                      packet has already been seen; OR this packet has the
+ *                      signature of a comment packet, but the initial header
+ *                      has not yet been seen; OR this packet has the signature
+ *                      of a comment packet, but contains invalid data; OR
+ *                      this packet has the signature of codebook tables,
+ *                      but the initial header or comments have not yet
+ *                      been seen; OR this packet has the signature of codebook
+ *                      tables, but contains invalid data;
+ *                      OR the stream being decoded has a compatible version
+ *                      but this packet does not have the signature of a
+ *                      theora initial header, comments, or codebook packet
+ * \retval OC_VERSION   The packet data of \a op is an initial header with
+ *                      a version which is incompatible with this version of
+ *                      libtheora.
+ * \retval OC_NEWPACKET the stream being decoded has an incompatible (future)
+ *                      version and contains an unknown signature.
+ * \retval 0            Success
+ *
+ * \note The normal usage is that theora_decode_header() be called on the
+ *       first three packets of a theora logical bitstream in succession.
+ */
+extern int theora_decode_header(theora_info *ci, theora_comment *cc,
+                                ogg_packet *op);
+
+/**
+ * Initialize a theora_state handle for decoding.
+ * \param th The theora_state handle to initialize.
+ * \param c  A theora_info struct filled with the desired decoding parameters.
+ *           This is of course usually obtained from a previous call to
+ *           theora_decode_header().
+ * \retval 0 Success
+ */
+extern int theora_decode_init(theora_state *th, theora_info *c);
+
+/**
+ * Input a packet containing encoded data into the theora decoder.
+ * \param th A theora_state handle previously initialized for decoding.
+ * \param op An ogg_packet containing encoded theora data.
+ * \retval 0 Success
+ * \retval OC_BADPACKET \a op does not contain encoded video data
+ */
+extern int theora_decode_packetin(theora_state *th,ogg_packet *op);
+
+/**
+ * Output the next available frame of decoded YUV data.
+ * \param th A theora_state handle previously initialized for decoding.
+ * \param yuv A yuv_buffer in which libtheora should place the decoded data.
+ *            Note that the buffer struct itself is allocated by the user, but
+ *            that the luma and chroma pointers will be filled in by the
+ *            library.  Also note that these luma and chroma regions should be
+ *            considered read-only by the user.
+ * \retval 0 Success
+ */
+extern int theora_decode_YUVout(theora_state *th,yuv_buffer *yuv);
+
+/**
+ * Report whether a theora packet is a header or not
+ * This function does no verification beyond checking the header
+ * flag bit so it should not be used for bitstream identification;
+ * use theora_decode_header() for that.
+ *
+ * \param op An ogg_packet containing encoded theora data.
+ * \retval 1 The packet is a header packet
+ * \retval 0 The packet is not a header packet (and so contains frame data)
+ *
+ * This function was added in the 1.0alpha4 release.
+ */
+extern int theora_packet_isheader(ogg_packet *op);
+
+/**
+ * Report whether a theora packet is a keyframe or not
+ *
+ * \param op An ogg_packet containing encoded theora data.
+ * \retval 1 The packet contains a keyframe image
+ * \retval 0 The packet contains an interframe delta
+ * \retval -1 The packet is not an image data packet at all
+ *
+ * This function was added in the 1.0alpha4 release.
+ */
+extern int theora_packet_iskeyframe(ogg_packet *op);
+
+/**
+ * Report the granulepos shift radix
+ *
+ * When embedded in Ogg, Theora uses a two-part granulepos,
+ * splitting the 64-bit field into two pieces. The more-significant
+ * section represents the frame count at the last keyframe,
+ * and the less-significant section represents the count of
+ * frames since the last keyframe. In this way the overall
+ * field is still non-decreasing with time, but usefully encodes
+ * a pointer to the last keyframe, which is necessary for
+ * correctly restarting decode after a seek.
+ *
+ * This function reports the number of bits used to represent
+ * the distance to the last keyframe, and thus how the granulepos
+ * field must be shifted or masked to obtain the two parts.
+ *
+ * Since libtheora returns compressed data in an ogg_packet
+ * structure, this may be generally useful even if the Theora
+ * packets are not being used in an Ogg container.
+ *
+ * \param ti A previously initialized theora_info struct
+ * \returns The bit shift dividing the two granulepos fields
+ *
+ * This function was added in the 1.0alpha5 release.
+ */
+int theora_granule_shift(theora_info *ti);
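
A brief sketch of splitting a granulepos into the two fields described above, using the shift reported by theora_granule_shift(); the function name is hypothetical.

#include <theora/theora.h>

static void split_granulepos(theora_info *ti,ogg_int64_t granulepos,
 ogg_int64_t *keyframe_count,ogg_int64_t *frames_since){
  int shift=theora_granule_shift(ti);
  *keyframe_count=granulepos>>shift;
  *frames_since=granulepos-(*keyframe_count<<shift);
}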
+
+/**
+ * Convert a granulepos to an absolute frame index, starting at 0.
+ * The granulepos is interpreted in the context of a given theora_state handle.
+ *
+ * Note that while the granulepos encodes the frame count (i.e. starting
+ * from 1) this call returns the frame index, starting from zero. Thus
+ * one can calculate the presentation time by dividing the index by
+ * the frame rate.
+ *
+ * \param th A previously initialized theora_state handle (encode or decode)
+ * \param granulepos The granulepos to convert.
+ * \returns The frame index corresponding to \a granulepos.
+ * \retval -1 The given granulepos is undefined (i.e. negative)
+ *
+ * This function was added in the 1.0alpha4 release.
+ */
+extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos);
+
+/**
+ * Convert a granulepos to absolute time in seconds. The granulepos is
+ * interpreted in the context of a given theora_state handle, and gives
+ * the end time of a frame's presentation as used in Ogg mux ordering.
+ *
+ * \param th A previously initialized theora_state handle (encode or decode)
+ * \param granulepos The granulepos to convert.
+ * \returns The absolute time in seconds corresponding to \a granulepos.
+ *          This is the "end time" for the frame, or the latest time it should
+ *           be displayed.
+ *          It is not the presentation time.
+ * \retval -1 The given granulepos is undefined (i.e. negative).
+ */
+extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
+
+/**
+ * Initialize a theora_info structure. All values within the given theora_info
+ * structure are initialized, and space is allocated within libtheora for
+ * internal codec setup data.
+ * \param c A theora_info struct to initialize.
+ */
+extern void theora_info_init(theora_info *c);
+
+/**
+ * Clear a theora_info structure. All values within the given theora_info
+ * structure are cleared, and associated internal codec setup data is freed.
+ * \param c A theora_info struct to clear.
+ */
+extern void theora_info_clear(theora_info *c);
+
+/**
+ * Free all internal data associated with a theora_state handle.
+ * \param t A theora_state handle.
+ */
+extern void theora_clear(theora_state *t);
+
+/**
+ * Initialize an allocated theora_comment structure
+ * \param tc An allocated theora_comment structure
+ **/
+extern void theora_comment_init(theora_comment *tc);
+
+/**
+ * Add a comment to an initialized theora_comment structure
+ * \param tc A previously initialized theora comment structure
+ * \param comment A null-terminated string encoding the comment in the form
+ *                "TAG=the value"
+ *
+ * Neither theora_comment_add() nor theora_comment_add_tag() support
+ * comments containing null values, although the bitstream format
+ * supports this. To add such comments you will need to manipulate
+ * the theora_comment structure directly.
+ **/
+
+extern void theora_comment_add(theora_comment *tc, char *comment);
+
+/**
+ * Add a comment to an initialized theora_comment structure.
+ * \param tc A previously initialized theora comment structure
+ * \param tag A null-terminated string containing the tag
+ *            associated with the comment.
+ * \param value The corresponding value as a null-terminated string
+ *
+ * Neither theora_comment_add() nor theora_comment_add_tag() support
+ * comments containing null values, although the bitstream format
+ * supports this. To add such comments you will need to manipulate
+ * the theora_comment structure directly.
+ **/
+extern void theora_comment_add_tag(theora_comment *tc,
+                                       char *tag, char *value);
+
+/**
+ * Look up a comment value by tag.
+ * \param tc An initialized theora_comment structure
+ * \param tag The tag to look up
+ * \param count The instance of the tag. The same tag can appear multiple
+ *              times, each with a distinct and ordered value, so an index
+ *              is required to retrieve them all.
+ * \returns A pointer to the queried tag's value
+ * \retval NULL No matching tag is found
+ *
+ * \note Use theora_comment_query_count() to get the legal range for the
+ * count parameter.
+ **/
+
+extern char *theora_comment_query(theora_comment *tc, char *tag, int count);
+
+/** Look up the number of instances of a tag.
+ *  \param tc An initialized theora_comment structure
+ *  \param tag The tag to look up
+ *  \returns The number of instances of a particular tag.
+ *
+ *  Call this first when querying for a specific tag and then iterate
+ *  over the number of instances with separate calls to
+ *  theora_comment_query() to retrieve all instances in order.
+ **/
+extern int   theora_comment_query_count(theora_comment *tc, char *tag);
+
+/**
+ * Clear an allocated theora_comment struct so that it can be freed.
+ * \param tc An allocated theora_comment structure.
+ **/
+extern void  theora_comment_clear(theora_comment *tc);
+
+/**Encoder control function.
+ * This is used to provide advanced control of the encoding process.
+ * \param th     A #theora_state handle.
+ * \param req    The control code to process.
+ *                See \ref encctlcodes_old "the list of available
+ *                 control codes" for details.
+ * \param buf    The parameters for this control code.
+ * \param buf_sz The size of the parameter buffer.*/
+extern int theora_control(theora_state *th,int req,void *buf,size_t buf_sz);
+
+/* @} */ /* end oldfuncs doxygen group */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _O_THEORA_H_ */

+ 329 - 0
modules/theoraplayer/native/theora/include/theora/theoradec.h

@@ -0,0 +1,329 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\file
+ * The <tt>libtheoradec</tt> C decoding API.*/
+
+#if !defined(_O_THEORA_THEORADEC_H_)
+# define _O_THEORA_THEORADEC_H_ (1)
+# include <stddef.h>
+# include <ogg/ogg.h>
+# include "codec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name th_decode_ctl() codes
+ * \anchor decctlcodes
+ * These are the available request codes for th_decode_ctl().
+ * By convention, these are odd, to distinguish them from the
+ *  \ref encctlcodes "encoder control codes".
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+/*@{*/
+/**Gets the maximum post-processing level.
+ * The decoder supports a post-processing filter that can improve
+ * the appearance of the decoded images. This returns the highest
+ * level setting for this post-processor, corresponding to maximum
+ * improvement and computational expense.
+ *
+ * \param[out] _buf int: The maximum post-processing level.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_DECCTL_GET_PPLEVEL_MAX (1)
+/**Sets the post-processing level.
+ * By default, post-processing is disabled.
+ *
+ * Sets the level of post-processing to use when decoding the
+ * compressed stream. This must be a value between zero (off)
+ * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
+ *
+ * \param[in] _buf int: The new post-processing level.
+ *                      0 to disable; larger values use more CPU.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                     post-processing level is out of bounds.
+ *                    The maximum post-processing level may be
+ *                     implementation-specific, and can be obtained via
+ *                     #TH_DECCTL_GET_PPLEVEL_MAX.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_DECCTL_SET_PPLEVEL (3)
+/**Sets the granule position.
+ * Call this after a seek, before decoding the first frame, to ensure that the
+ *  proper granule position is returned for all subsequent frames.
+ * If you track timestamps yourself and do not use the granule position
+ *  returned by the decoder, then you need not call this function.
+ *
+ * \param[in] _buf <tt>ogg_int64_t</tt>: The granule position of the next
+ *                  frame.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not <tt>sizeof(ogg_int64_t)</tt>, or the
+ *                     granule position is negative.*/
+#define TH_DECCTL_SET_GRANPOS (5)
+/**Sets the striped decode callback function.
+ * If set, this function will be called as each piece of a frame is fully
+ *  decoded in th_decode_packetin().
+ * You can pass in a #th_stripe_callback with
+ *  th_stripe_callback#stripe_decoded set to <tt>NULL</tt> to disable the
+ *  callbacks at any point.
+ * Enabling striped decode does not prevent you from calling
+ *  th_decode_ycbcr_out() after the frame is fully decoded.
+ *
+ * \param[in]  _buf #th_stripe_callback: The callback parameters.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not
+ *                     <tt>sizeof(th_stripe_callback)</tt>.*/
+#define TH_DECCTL_SET_STRIPE_CB (7)
+
+/**Enables telemetry and sets the macroblock display mode */
+#define TH_DECCTL_SET_TELEMETRY_MBMODE (9)
+/**Enables telemetry and sets the motion vector display mode */
+#define TH_DECCTL_SET_TELEMETRY_MV (11)
+/**Enables telemetry and sets the adaptive quantization display mode */
+#define TH_DECCTL_SET_TELEMETRY_QI (13)
+/**Enables telemetry and sets the bitstream breakdown visualization mode */
+#define TH_DECCTL_SET_TELEMETRY_BITS (15)
+/*@}*/
+
+
+
+/**A callback function for striped decode.
+ * This is a function pointer to an application-provided function that will be
+ *  called each time a section of the image is fully decoded in
+ *  th_decode_packetin().
+ * This allows the application to process the section immediately, while it is
+ *  still in cache.
+ * Note that the frame is decoded bottom to top, so \a _yfrag0 will steadily
+ *  decrease with each call until it reaches 0, at which point the full frame
+ *  is decoded.
+ * The number of fragment rows made available in each call depends on the pixel
+ *  format and the number of post-processing filters enabled, and may not even
+ *  be constant for the entire frame.
+ * If a non-<tt>NULL</tt> \a _granpos pointer is passed to
+ *  th_decode_packetin(), the granule position for the frame will be stored
+ *  in it before the first callback is made.
+ * If an entire frame is dropped (a 0-byte packet), then no callbacks will be
+ *  made at all for that frame.
+ * \param _ctx       An application-provided context pointer.
+ * \param _buf       The image buffer for the decoded frame.
+ * \param _yfrag0    The Y coordinate of the first row of 8x8 fragments
+ *                    decoded.
+ *                   Multiply this by 8 to obtain the pixel row number in the
+ *                    luma plane.
+ *                   If the chroma planes are subsampled in the Y direction,
+ *                    this will always be divisible by two.
+ * \param _yfrag_end The Y coordinate of the first row of 8x8 fragments past
+ *                    the newly decoded section.
+ *                   If the chroma planes are subsampled in the Y direction,
+ *                    this will always be divisible by two.
+ *                   I.e., this section contains fragment rows
+ *                    <tt>\a _yfrag0 ...\a _yfrag_end -1</tt>.*/
+typedef void (*th_stripe_decoded_func)(void *_ctx,th_ycbcr_buffer _buf,
+ int _yfrag0,int _yfrag_end);
+
+/**The striped decode callback data to pass to #TH_DECCTL_SET_STRIPE_CB.*/
+typedef struct{
+  /**An application-provided context pointer.
+   * This will be passed back verbatim to the application.*/
+  void                   *ctx;
+  /**The callback function pointer.*/
+  th_stripe_decoded_func  stripe_decoded;
+}th_stripe_callback;
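
A minimal sketch (not part of this header) of registering a striped decode callback; process_stripe is a hypothetical application function.

#include <theora/theoradec.h>

/* Hypothetical application callback: consume fragment rows
   yfrag0..yfrag_end-1 of the decoded frame while they are still in cache. */
static void process_stripe(void *ctx, th_ycbcr_buffer buf,
 int yfrag0, int yfrag_end){
  /* Luma rows covered by this stripe: yfrag0*8 .. yfrag_end*8-1. */
  (void)ctx; (void)buf; (void)yfrag0; (void)yfrag_end;
}

static int install_stripe_cb(th_dec_ctx *dec, void *app_ctx){
  th_stripe_callback cb;
  cb.ctx = app_ctx;
  cb.stripe_decoded = process_stripe;
  return th_decode_ctl(dec, TH_DECCTL_SET_STRIPE_CB, &cb, sizeof(cb));
}
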
+
+
+
+/**\name Decoder state
+   The following data structures are opaque, and their contents are not
+    publicly defined by this API.
+   Referring to their internals directly is unsupported, and may break without
+    warning.*/
+/*@{*/
+/**The decoder context.*/
+typedef struct th_dec_ctx    th_dec_ctx;
+/**Setup information.
+   This contains auxiliary information (Huffman tables and quantization
+    parameters) decoded from the setup header by th_decode_headerin() to be
+    passed to th_decode_alloc().
+   It can be re-used to initialize any number of decoders, and can be freed
+    via th_setup_free() at any time.*/
+typedef struct th_setup_info th_setup_info;
+/*@}*/
+
+
+
+/**\defgroup decfuncs Functions for Decoding*/
+/*@{*/
+/**\name Functions for decoding
+ * You must link to <tt>libtheoradec</tt> if you use any of the
+ * functions in this section.
+ *
+ * The functions are listed in the order they are used in a typical decode.
+ * The basic steps are:
+ * - Parse the header packets by repeatedly calling th_decode_headerin().
+ * - Allocate a #th_dec_ctx handle with th_decode_alloc().
+ * - Call th_setup_free() to free any memory used for codec setup
+ *    information.
+ * - Perform any additional decoder configuration with th_decode_ctl().
+ * - For each video data packet:
+ *   - Submit the packet to the decoder via th_decode_packetin().
+ *   - Retrieve the uncompressed video data via th_decode_ycbcr_out().
+ * - Call th_decode_free() to release all decoder memory.*/
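
A sketch of the steps above (not part of this header): Ogg demuxing is omitted, next_packet() is a hypothetical helper that fills an ogg_packet from the Theora logical stream, and error handling is kept minimal.

#include <theora/theoradec.h>

extern int next_packet(ogg_packet *op);  /* hypothetical: fills op from the
                                             Theora logical stream, 0 at EOS */

static void decode_stream(void){
  th_info ti;
  th_comment tc;
  th_setup_info *ts = NULL;
  th_dec_ctx *dec = NULL;
  ogg_packet op;
  int ret;
  th_info_init(&ti);
  th_comment_init(&tc);
  /* Parse the header packets. */
  for(;;){
    if(!next_packet(&op)) goto cleanup;   /* stream truncated */
    ret = th_decode_headerin(&ti, &tc, &ts, &op);
    if(ret == 0) break;                   /* op is the first video data packet */
    if(ret < 0) goto cleanup;             /* not Theora, or a corrupt header */
  }
  /* Allocate the decoder, then release the setup information. */
  dec = th_decode_alloc(&ti, ts);
  th_setup_free(ts);
  ts = NULL;
  if(dec == NULL) goto cleanup;
  /* Feed video packets and pull out decoded frames. */
  do{
    ogg_int64_t granpos;
    if(th_decode_packetin(dec, &op, &granpos) == 0){
      th_ycbcr_buffer ycbcr;
      th_decode_ycbcr_out(dec, ycbcr);
      /* Display or store ycbcr here. A TH_DUPFRAME return above means the
         previous frame repeats and there is nothing new to fetch. */
    }
  }while(next_packet(&op));
cleanup:
  if(dec != NULL) th_decode_free(dec);
  if(ts != NULL) th_setup_free(ts);
  th_info_clear(&ti);
  th_comment_clear(&tc);
}
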
+/*@{*/
+/**Decodes the header packets of a Theora stream.
+ * This should be called on the initial packets of the stream, in succession,
+ *  until it returns <tt>0</tt>, indicating that all headers have been
+ *  processed, or an error is encountered.
+ * At least three header packets are required, and additional optional header
+ *  packets may follow.
+ * This can be used on the first packet of any logical stream to determine if
+ *  that stream is a Theora stream.
+ * \param _info  A #th_info structure to fill in.
+ *               This must have been previously initialized with
+ *                th_info_init().
+ *               The application may immediately begin using the contents of
+ *                this structure after the first header is decoded, though it
+ *                must continue to be passed in on all subsequent calls.
+ * \param _tc    A #th_comment structure to fill in.
+ *               The application may immediately begin using the contents of
+ *                this structure after the second header is decoded, though it
+ *                must continue to be passed in on all subsequent calls.
+ * \param _setup Returns a pointer to additional, private setup information
+ *                needed by the decoder.
+ *               The contents of this pointer must be initialized to
+ *                <tt>NULL</tt> on the first call, and the returned value must
+ *                continue to be passed in on all subsequent calls.
+ * \param _op    An <tt>ogg_packet</tt> structure which contains one of the
+ *                initial packets of an Ogg logical stream.
+ * \return A positive value indicates that a Theora header was successfully
+ *          processed.
+ * \retval 0             The first video data packet was encountered after all
+ *                        required header packets were parsed.
+ *                       The packet just passed in on this call should be saved
+ *                        and fed to th_decode_packetin() to begin decoding
+ *                        video data.
+ * \retval TH_EFAULT     One of \a _info, \a _tc, or \a _setup was
+ *                        <tt>NULL</tt>.
+ * \retval TH_EBADHEADER \a _op was <tt>NULL</tt>, the packet was not the next
+ *                        header packet in the expected sequence, or the format
+ *                        of the header data was invalid.
+ * \retval TH_EVERSION   The packet data was a Theora info header, but for a
+ *                        bitstream version not decodable with this version of
+ *                        <tt>libtheoradec</tt>.
+ * \retval TH_ENOTFORMAT The packet was not a Theora header.
+ */
+extern int th_decode_headerin(th_info *_info,th_comment *_tc,
+ th_setup_info **_setup,ogg_packet *_op);
+/**Allocates a decoder instance.
+ *
+ * <b>Security Warning:</b> The Theora format supports very large frame sizes,
+ *  potentially even larger than the address space of a 32-bit machine, and
+ *  creating a decoder context allocates the space for several frames of data.
+ * If the allocation fails here, your program will crash, possibly at some
+ *  future point because the OS kernel returned a valid memory range and will
+ *  only fail when it tries to map the pages in it the first time they are
+ *  used.
+ * Even if it succeeds, you may experience a denial of service if the frame
+ *  size is large enough to cause excessive paging.
+ * If you are integrating libtheora in a larger application where such things
+ *  are undesirable, it is highly recommended that you check the frame size in
+ *  \a _info before calling this function and refuse to decode streams where it
+ *  is larger than some reasonable maximum.
+ * libtheora will not check this for you, because there may be machines that
+ *  can handle such streams and applications that wish to.
+ * \param _info  A #th_info struct filled via th_decode_headerin().
+ * \param _setup A #th_setup_info handle returned via
+ *                th_decode_headerin().
+ * \return The initialized #th_dec_ctx handle.
+ * \retval NULL If the decoding parameters were invalid.*/
+extern th_dec_ctx *th_decode_alloc(const th_info *_info,
+ const th_setup_info *_setup);
+/**Releases all storage used for the decoder setup information.
+ * This should be called after you no longer want to create any decoders for
+ *  a stream whose headers you have parsed with th_decode_headerin().
+ * \param _setup The setup information to free.
+ *               This can safely be <tt>NULL</tt>.*/
+extern void th_setup_free(th_setup_info *_setup);
+/**Decoder control function.
+ * This is used to provide advanced control of the decoding process.
+ * \param _dec    A #th_dec_ctx handle.
+ * \param _req    The control code to process.
+ *                See \ref decctlcodes "the list of available control codes"
+ *                 for details.
+ * \param _buf    The parameters for this control code.
+ * \param _buf_sz The size of the parameter buffer.
+ * \return Possible return values depend on the control code used.
+ *          See \ref decctlcodes "the list of control codes" for
+ *          specific values. Generally 0 indicates success.*/
+extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
+ size_t _buf_sz);
+/**Submits a packet containing encoded video data to the decoder.
+ * \param _dec     A #th_dec_ctx handle.
+ * \param _op      An <tt>ogg_packet</tt> containing encoded video data.
+ * \param _granpos Returns the granule position of the decoded packet.
+ *                 If non-<tt>NULL</tt>, the granule position for this specific
+ *                  packet is stored in this location.
+ *                 This is computed incrementally from previously decoded
+ *                  packets.
+ *                 After a seek, the correct granule position must be set via
+ *                  #TH_DECCTL_SET_GRANPOS for this to work properly.
+ * \retval 0             Success.
+ *                       A new decoded frame can be retrieved by calling
+ *                        th_decode_ycbcr_out().
+ * \retval TH_DUPFRAME   The packet represented a dropped frame (either a
+ *                        0-byte frame or an INTER frame with no coded blocks).
+ *                       The player can skip the call to th_decode_ycbcr_out(),
+ *                        as the contents of the decoded frame buffer have not
+ *                        changed.
+ * \retval TH_EFAULT     \a _dec or \a _op was <tt>NULL</tt>.
+ * \retval TH_EBADPACKET \a _op does not contain encoded video data.
+ * \retval TH_EIMPL      The video data uses bitstream features which this
+ *                        library does not support.*/
+extern int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
+ ogg_int64_t *_granpos);
+/**Outputs the next available frame of decoded Y'CbCr data.
+ * If a striped decode callback has been set with #TH_DECCTL_SET_STRIPE_CB,
+ *  then the application does not need to call this function.
+ * \param _dec   A #th_dec_ctx handle.
+ * \param _ycbcr A video buffer structure to fill in.
+ *               <tt>libtheoradec</tt> will fill in all the members of this
+ *                structure, including the pointers to the uncompressed video
+ *                data.
+ *               The memory for this video data is owned by
+ *                <tt>libtheoradec</tt>.
+ *               It may be freed or overwritten without notification when
+ *                subsequent frames are decoded.
+ * \retval 0 Success
+ * \retval TH_EFAULT     \a _dec or \a _ycbcr was <tt>NULL</tt>.
+ */
+extern int th_decode_ycbcr_out(th_dec_ctx *_dec,
+ th_ycbcr_buffer _ycbcr);
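
As an illustration (a sketch, not part of this header), the luma plane of the buffer filled in above can be copied row by row into application-owned memory, assuming the th_img_plane fields (width, height, stride, data) declared in codec.h.

#include <string.h>
#include <theora/theoradec.h>

/* Copy the decoded luma plane into a tightly packed buffer owned by the
   caller; dst must hold ycbcr[0].width*ycbcr[0].height bytes. */
static void copy_luma(unsigned char *dst, const th_ycbcr_buffer ycbcr){
  int row;
  for(row = 0; row < ycbcr[0].height; row++){
    memcpy(dst + row*ycbcr[0].width,
     ycbcr[0].data + row*ycbcr[0].stride, ycbcr[0].width);
  }
}
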
+/**Frees an allocated decoder instance.
+ * \param _dec A #th_dec_ctx handle.*/
+extern void th_decode_free(th_dec_ctx *_dec);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

+ 548 - 0
modules/theoraplayer/native/theora/include/theora/theoraenc.h

@@ -0,0 +1,548 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\file
+ * The <tt>libtheoraenc</tt> C encoding API.*/
+
+#if !defined(_O_THEORA_THEORAENC_H_)
+# define _O_THEORA_THEORAENC_H_ (1)
+# include <stddef.h>
+# include <ogg/ogg.h>
+# include "codec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name th_encode_ctl() codes
+ * \anchor encctlcodes
+ * These are the available request codes for th_encode_ctl().
+ * By convention, these are even, to distinguish them from the
+ *  \ref decctlcodes "decoder control codes".
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+/*@{*/
+/**Sets the Huffman tables to use.
+ * The tables are copied, not stored by reference, so they can be freed after
+ *  this call.
+ * <tt>NULL</tt> may be specified to revert to the default tables.
+ *
+ * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun or one or more of the given
+ *                     tables is not full or prefix-free, \a _buf is
+ *                     <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
+ *                     non-<tt>NULL</tt> and \a _buf_sz is not
+ *                     <tt>sizeof(#th_huff_code)*#TH_NHUFFMAN_TABLES*#TH_NDCT_TOKENS</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_HUFFMAN_CODES (0)
+/**Sets the quantization parameters to use.
+ * The parameters are copied, not stored by reference, so they can be freed
+ *  after this call.
+ * <tt>NULL</tt> may be specified to revert to the default parameters.
+ *
+ * \param[in] _buf #th_quant_info
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun, \a _buf is
+ *                    <tt>NULL</tt> and \a _buf_sz is not zero,
+ *                    or \a _buf is non-<tt>NULL</tt> and
+ *                    \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUANT_PARAMS (2)
+/**Sets the maximum distance between key frames.
+ * This can be changed during an encode, but will be bounded by
+ *  <tt>1<<th_info#keyframe_granule_shift</tt>.
+ * If it is set before encoding begins, th_info#keyframe_granule_shift will
+ *  be enlarged appropriately.
+ *
+ * \param[in]  _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
+ *                   frames.
+ * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
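
For example (a sketch, not part of this header), forcing a keyframe at least every 64 frames and reading back the value the encoder actually accepted.

#include <theora/theoraenc.h>

static ogg_uint32_t force_keyframe_freq(th_enc_ctx *enc){
  ogg_uint32_t freq = 64;
  /* On success the encoder writes the value it actually set back into freq. */
  th_encode_ctl(enc, TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
   &freq, sizeof(freq));
  return freq;
}
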
+/**Disables any encoder features that would prevent lossless transcoding back
+ *  to VP3.
+ * This primarily means disabling block-adaptive quantization and always coding
+ *  all four luma blocks in a macro block when 4MV is used.
+ * It also includes using the VP3 quantization tables and Huffman codes; if you
+ *  set them explicitly after calling this function, the resulting stream will
+ *  not be VP3-compatible.
+ * If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
+ *  material, or when using a picture region smaller than the full frame (e.g.
+ *  a non-multiple-of-16 width or height), then non-VP3 bitstream features will
+ *  still be disabled, but the stream will still not be VP3-compatible, as VP3
+ *  was not capable of encoding such formats.
+ * If you call this after encoding has already begun, then the quantization
+ *  tables and codebooks cannot be changed, but the frame-level features will
+ *  be enabled or disabled as requested.
+ *
+ * \param[in]  _buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
+ *                   or 0 to disable it (the default).
+ * \param[out] _buf <tt>int</tt>: 1 if all bitstream features required for
+ *                   VP3-compatibility could be set, and 0 otherwise.
+ *                  The latter will be returned if the pixel format is not
+ *                   4:2:0, the picture region is smaller than the full frame,
+ *                   or if encoding has begun, preventing the quantization
+ *                   tables and codebooks from being set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
+/**Gets the maximum speed level.
+ * Higher speed levels favor quicker encoding over better quality per bit.
+ * Depending on the encoding mode, and the internal algorithms used, quality
+ *  may actually improve, but in this case bitrate will also likely increase.
+ * In any case, overall rate/distortion performance will probably decrease.
+ * The maximum value, and the meaning of each value, may change depending on
+ *  the current encoding mode (VBR vs. constant quality, etc.).
+ *
+ * \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
+/**Sets the speed level.
+ * The current speed level may be retrieved using #TH_ENCCTL_GET_SPLEVEL.
+ *
+ * \param[in] _buf <tt>int</tt>: The new encoding speed level.
+ *                 0 is slowest, larger values use less CPU.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    encoding speed level is out of bounds.
+ *                   The maximum encoding speed level may be
+ *                    implementation- and encoding mode-specific, and can be
+ *                    obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_SPLEVEL (14)
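
A small sketch (not part of this header) that trades quality for encoding speed by selecting the fastest level the implementation reports.

#include <theora/theoraenc.h>

static int use_fastest_speed_level(th_enc_ctx *enc){
  int splevel_max;
  int ret;
  ret = th_encode_ctl(enc, TH_ENCCTL_GET_SPLEVEL_MAX,
   &splevel_max, sizeof(splevel_max));
  if(ret < 0) return ret;  /* e.g. TH_EIMPL in the current encoding mode */
  return th_encode_ctl(enc, TH_ENCCTL_SET_SPLEVEL,
   &splevel_max, sizeof(splevel_max));
}
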
+/**Gets the current speed level.
+ * The default speed level may vary according to encoder implementation, but if
+ *  this control code is not supported (it returns #TH_EIMPL), the default may
+ *  be assumed to be the slowest available speed (0).
+ * The maximum encoding speed level may be implementation- and encoding
+ *  mode-specific, and can be obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ *
+ * \param[out] _buf <tt>int</tt>: The current encoding speed level.
+ *                  0 is slowest, larger values use less CPU.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL (16)
+/**Sets the number of duplicates of the next frame to produce.
+ * Although libtheora can encode duplicate frames very cheaply, it costs some
+ *  amount of CPU to detect them, and a run of duplicates cannot span a
+ *  keyframe boundary.
+ * This control code tells the encoder to produce the specified number of extra
+ *  duplicates of the next frame.
+ * This allows the encoder to make smarter keyframe placement decisions and
+ *  rate control decisions, and reduces CPU usage as well, when compared to
+ *  just submitting the same frame for encoding multiple times.
+ * This setting only applies to the next frame submitted for encoding.
+ * You MUST call th_encode_packetout() repeatedly until it returns 0, or the
+ *  extra duplicate frames will be lost.
+ *
+ * \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
+ *                 If this is negative or zero, no duplicates will be produced.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    number of duplicates is greater than or equal to the
+ *                    maximum keyframe interval.
+ *                   In the latter case, NO duplicate frames will be produced.
+ *                   You must ensure that the maximum keyframe interval is set
+ *                    larger than the maximum number of duplicates you will
+ *                    ever wish to insert prior to encoding.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_DUP_COUNT (18)
+/**Modifies the default bitrate management behavior.
+ * Use to allow or disallow frame dropping, and to enable or disable capping
+ *  bit reservoir overflows and underflows.
+ * See \ref ratectlflags "the list of available flags".
+ * The flags are set by default to
+ *  <tt>#TH_RATECTL_DROP_FRAMES|#TH_RATECTL_CAP_OVERFLOW</tt>.
+ *
+ * \param[in] _buf <tt>int</tt>: Any combination of
+ *                  \ref ratectlflags "the available flags":
+ *                 - #TH_RATECTL_DROP_FRAMES: Enable frame dropping.
+ *                 - #TH_RATECTL_CAP_OVERFLOW: Don't bank excess bits for later
+ *                    use.
+ *                 - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
+ *                    later.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
+ *                    is not enabled.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_RATE_FLAGS (20)
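
For example (a sketch, not part of this header, and only meaningful once a target bitrate is in effect), an application that never wants frames dropped but keeps the default overflow cap could set the flags like this.

#include <theora/theoraenc.h>

static int disable_frame_dropping(th_enc_ctx *enc){
  /* Keep the default overflow cap but turn off frame dropping. */
  int flags = TH_RATECTL_CAP_OVERFLOW;
  return th_encode_ctl(enc, TH_ENCCTL_SET_RATE_FLAGS, &flags, sizeof(flags));
}
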
+/**Sets the size of the bitrate management bit reservoir as a function
+ *  of number of frames.
+ * The reservoir size affects how quickly bitrate management reacts to
+ *  instantaneous changes in the video complexity.
+ * Larger reservoirs react more slowly, and provide better overall quality, but
+ *  require more buffering by a client, adding more latency to live streams.
+ * By default, libtheora sets the reservoir to the maximum distance between
+ *  keyframes, subject to a minimum and maximum limit.
+ * This call may be used to increase or decrease the reservoir, increasing or
+ *  decreasing the allowed temporary variance in bitrate.
+ * An implementation may impose some limits on the size of a reservoir it can
+ *  handle, in which case the actual reservoir size may not be exactly what was
+ *  requested.
+ * The actual value set will be returned.
+ *
+ * \param[in]  _buf <tt>int</tt>: Requested size of the reservoir measured in
+ *                   frames.
+ * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
+ *                    is not enabled.  The buffer has an implementation-defined
+ *                    minimum and maximum size, and the value in _buf will be
+ *                    adjusted to match the actual value set.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_RATE_BUFFER (22)
+/**Enable pass 1 of two-pass encoding mode and retrieve the first pass metrics.
+ * Pass 1 mode must be enabled before the first frame is encoded, and a target
+ *  bitrate must have already been specified to the encoder.
+ * Although this does not have to be the exact rate that will be used in the
+ *  second pass, closer values may produce better results.
+ * The first call returns the size of the two-pass header data, along with some
+ *  placeholder content, and sets the encoder into pass 1 mode implicitly.
+ * Then, a subsequent call must be made after each call to
+ *  th_encode_ycbcr_in() to retrieve the metrics for that frame.
+ * An additional, final call must be made to retrieve the summary data,
+ *  containing such information as the total number of frames, etc.
+ * This must be stored in place of the placeholder data that was returned
+ *  in the first call, before the frame metrics data.
+ * All of this data must be presented back to the encoder during pass 2 using
+ *  #TH_ENCCTL_2PASS_IN.
+ *
+ * \param[out] <tt>char *</tt>_buf: Returns a pointer to internal storage
+ *              containing the two pass metrics data.
+ *             This storage is only valid until the next call, or until the
+ *              encoder context is freed, and must be copied by the
+ *              application.
+ * \retval >=0       The number of bytes of metric data available in the
+ *                    returned buffer.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
+ *                    bitrate has been set, or the first call was made after
+ *                    the first frame was submitted for encoding.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_2PASS_OUT (24)
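
A pass-1 sketch (not part of this header): a target bitrate is assumed to be configured already, append_metrics() is a hypothetical helper that stores the data, and the placeholder returned by the very first call is later overwritten with the summary data exactly as described above.

#include <theora/theoraenc.h>

extern void append_metrics(const char *data, int len);  /* hypothetical */

/* Fetch the pass-1 metrics produced by the most recent frame (or, on the very
   first call, the placeholder header) and hand them to the application.
   Returns the number of bytes retrieved, or a negative error code. */
static int collect_pass1_metrics(th_enc_ctx *enc){
  char *buf;
  int bytes;
  bytes = th_encode_ctl(enc, TH_ENCCTL_2PASS_OUT, &buf, sizeof(buf));
  if(bytes > 0) append_metrics(buf, bytes);  /* copy now: buf is only valid
                                                until the next library call */
  return bytes;
}
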
+/**Submits two-pass encoding metric data collected during the first encoding
+ *  pass to the second pass.
+ * The first call must be made before the first frame is encoded, and a target
+ *  bitrate must have already been specified to the encoder.
+ * It sets the encoder to pass 2 mode implicitly; this cannot be disabled.
+ * The encoder may require reading data from some or all of the frames in
+ *  advance, depending on, e.g., the reservoir size used in the second pass.
+ * You must call this function repeatedly before each frame to provide data
+ *  until either a) it fails to consume all of the data presented or b) all of
+ *  the pass 1 data has been consumed.
+ * In the first case, you must save the remaining data to be presented after
+ *  the next frame.
+ * You can call this function with a NULL argument to get an upper bound on
+ *  the number of bytes that will be required before the next frame.
+ *
+ * When pass 2 is first enabled, the default bit reservoir is set to the entire
+ *  file; this gives maximum flexibility but can lead to very high peak rates.
+ * You can subsequently set it to another value with #TH_ENCCTL_SET_RATE_BUFFER
+ *  (e.g., to set it to the keyframe interval for non-live streaming), however,
+ *  you may then need to provide more data before the next frame.
+ *
+ * \param[in] _buf <tt>char[]</tt>: A buffer containing the data returned by
+ *                  #TH_ENCCTL_2PASS_OUT in pass 1.
+ *                 You may pass <tt>NULL</tt> for \a _buf to return an upper
+ *                  bound on the number of additional bytes needed before the
+ *                  next frame.
+ *                 The summary data returned at the end of pass 1 must be at
+ *                  the head of the buffer on the first call with a
+ *                  non-<tt>NULL</tt> \a _buf, and the placeholder data
+ *                  returned at the start of pass 1 should be omitted.
+ *                 After each call you should advance this buffer by the number
+ *                  of bytes consumed.
+ * \retval >0            The number of bytes of metric data required/consumed.
+ * \retval 0             No more data is required before the next frame.
+ * \retval TH_EFAULT     \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL     No target bitrate has been set, or the first call was
+ *                        made after the first frame was submitted for
+ *                        encoding.
+ * \retval TH_ENOTFORMAT The data did not appear to be pass 1 from a compatible
+ *                        implementation of this library.
+ * \retval TH_EBADHEADER The data was invalid; this may be returned when
+ *                        attempting to read an aborted pass 1 file that still
+ *                        has the placeholder data in place of the summary
+ *                        data.
+ * \retval TH_EIMPL       Not supported by this implementation.*/
+#define TH_ENCCTL_2PASS_IN (26)
+/**Sets the current encoding quality.
+ * This is only valid so long as no bitrate has been specified, either through
+ *  the #th_info struct used to initialize the encoder or through
+ *  #TH_ENCCTL_SET_BITRATE (this restriction may be relaxed in a future
+ *  version).
+ * If it is set before the headers are emitted, the target quality encoded in
+ *  them will be updated.
+ *
+ * \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
+ *                  inclusive.
+ * \retval 0             Success.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     A target bitrate has already been specified, or the
+ *                        quality index was not in the range 0...63.
+ * \retval TH_EIMPL       Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUALITY (28)
+/**Sets the current encoding bitrate.
+ * Once a bitrate is set, the encoder must use a rate-controlled mode for all
+ *  future frames (this restriction may be relaxed in a future version).
+ * If it is set before the headers are emitted, the target bitrate encoded in
+ *  them will be updated.
+ * Due to the buffer delay, the exact bitrate of each section of the encode is
+ *  not guaranteed.
+ * The encoder may have already used more bits than allowed for the frames it
+ *  has encoded, expecting to make them up in future frames, or it may have
+ *  used fewer, holding the excess in reserve.
+ * The exact transition between the two bitrates is not well-defined by this
+ *  API, but may be affected by flags set with #TH_ENCCTL_SET_RATE_FLAGS.
+ * After a number of frames equal to the buffer delay, one may expect further
+ *  output to average at the target bitrate.
+ *
+ * \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
+ * \retval 0             Success.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     The target bitrate was not positive.
+ *                       A future version of this library may allow passing 0
+ *                        to disable rate-controlled mode and return to a
+ *                        quality-based mode, in which case this function will
+ *                        not return an error for that value.
+ * \retval TH_EIMPL      Not supported by this implementation.*/
+#define TH_ENCCTL_SET_BITRATE (30)
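
As a sketch (not part of this header), retargeting the bitrate mid-stream, for example in response to changing network conditions.

#include <theora/theoraenc.h>

static int retarget_bitrate(th_enc_ctx *enc, long bits_per_second){
  /* Only meaningful once the encoder is already in a rate-controlled mode. */
  return th_encode_ctl(enc, TH_ENCCTL_SET_BITRATE,
   &bits_per_second, sizeof(bits_per_second));
}
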
+/**Sets the configuration to be compatible with that from the given setup
+ *  header.
+ * This sets the Huffman codebooks and quantization parameters to match those
+ *  found in the given setup header.
+ * This guarantees that packets encoded by this encoder will be decodable using
+ *  a decoder configured with the passed-in setup header.
+ * It does <em>not</em> guarantee that th_encode_flushheader() will produce a
+ *  bit-identical setup header, only that they will be compatible.
+ * If you need a bit-identical setup header, then use the one you passed into
+ *  this command, and not the one returned by th_encode_flushheader().
+ *
+ * This also does <em>not</em> enable or disable VP3 compatibility; that is not
+ *  signaled in the setup header (or anywhere else in the encoded stream), and
+ *  is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
+ * If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
+ *  and quantization parameters to match the given setup header, you should
+ *  enable VP3 compatibility before invoking this command, otherwise the
+ *  codebooks and quantization parameters will be reset to the VP3 defaults.
+ *
+ * The current encoder does not support Huffman codebooks which do not contain
+ *  codewords for all 32 tokens.
+ * Such codebooks are legal, according to the specification, but cannot be
+ *  configured with this function.
+ *
+ * \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
+ *                                            the configuration from.
+ *                                           This should be the original,
+ *                                            undecoded setup header packet,
+ *                                            and <em>not</em> a #th_setup_info
+ *                                            structure filled in by
+ *                                            th_decode_headerin().
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     Encoding has already begun, so the codebooks and
+ *                        quantization parameters cannot be changed, or the
+ *                        data in the setup header was not supported by this
+ *                        encoder.
+ * \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
+ * \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
+
+/*@}*/
+
+
+/**\name TH_ENCCTL_SET_RATE_FLAGS flags
+ * \anchor ratectlflags
+ * These are the flags available for use with #TH_ENCCTL_SET_RATE_FLAGS.*/
+/*@{*/
+/**Drop frames to keep within bitrate buffer constraints.
+ * This can have a severe impact on quality, but is the only way to ensure that
+ *  bitrate targets are met at low rates during sudden bursts of activity.
+ * It is enabled by default.*/
+#define TH_RATECTL_DROP_FRAMES   (0x1)
+/**Ignore bitrate buffer overflows.
+ * If the encoder uses so few bits that the reservoir of available bits
+ *  overflows, ignore the excess.
+ * The encoder will not try to use these extra bits in future frames.
+ * At high rates this may cause the result to be undersized, but allows a
+ *  client to play the stream using a finite buffer; it should normally be
+ *  enabled, which is the default.*/
+#define TH_RATECTL_CAP_OVERFLOW  (0x2)
+/**Ignore bitrate buffer underflows.
+ * If the encoder uses so many bits that the reservoir of available bits
+ *  underflows, ignore the deficit.
+ * The encoder will not try to make up these extra bits in future frames.
+ * At low rates this may cause the result to be oversized; it should normally
+ *  be disabled, which is the default.*/
+#define TH_RATECTL_CAP_UNDERFLOW (0x4)
+/*@}*/
+
+
+
+/**The quantization parameters used by VP3.*/
+extern const th_quant_info TH_VP31_QUANT_INFO;
+
+/**The Huffman tables used by VP3.*/
+extern const th_huff_code
+ TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+
+
+
+/**\name Encoder state
+   The following data structure is opaque, and its contents are not publicly
+    defined by this API.
+   Referring to its internals directly is unsupported, and may break without
+    warning.*/
+/*@{*/
+/**The encoder context.*/
+typedef struct th_enc_ctx    th_enc_ctx;
+/*@}*/
+
+
+
+/**\defgroup encfuncs Functions for Encoding*/
+/*@{*/
+/**\name Functions for encoding
+ * You must link to <tt>libtheoraenc</tt> and <tt>libtheoradec</tt>
+ *  if you use any of the functions in this section.
+ *
+ * The functions are listed in the order they are used in a typical encode.
+ * The basic steps are:
+ * - Fill in a #th_info structure with details on the format of the video you
+ *    wish to encode.
+ * - Allocate a #th_enc_ctx handle with th_encode_alloc().
+ * - Perform any additional encoder configuration required with
+ *    th_encode_ctl().
+ * - Repeatedly call th_encode_flushheader() to retrieve all the header
+ *    packets.
+ * - For each uncompressed frame:
+ *   - Submit the uncompressed frame via th_encode_ycbcr_in()
+ *   - Repeatedly call th_encode_packetout() to retrieve any video
+ *      data packets that are ready.
+ * - Call th_encode_free() to release all encoder memory.*/
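
A sketch of the steps above (not part of this header): Ogg muxing and I/O are omitted, fetch_frame() and write_packet() are hypothetical application helpers, and the th_info fields (declared in codec.h) are filled with illustrative values only.

#include <theora/theoraenc.h>

extern int fetch_frame(th_ycbcr_buffer ycbcr);   /* hypothetical: fills the
                                                     plane descriptors, 0 at end */
extern void write_packet(const ogg_packet *op);  /* hypothetical muxer hook */

static void encode_stream(int pic_w, int pic_h){
  th_info ti;
  th_comment tc;
  th_enc_ctx *enc;
  ogg_packet op;
  th_ycbcr_buffer ycbcr;
  int last;
  /* Describe the video to be encoded; frame sizes are multiples of 16. */
  th_info_init(&ti);
  ti.frame_width = (pic_w + 15) & ~15;
  ti.frame_height = (pic_h + 15) & ~15;
  ti.pic_width = pic_w;
  ti.pic_height = pic_h;
  ti.pic_x = 0;
  ti.pic_y = 0;
  ti.fps_numerator = 30;
  ti.fps_denominator = 1;
  ti.pixel_fmt = TH_PF_420;
  ti.target_bitrate = 0;  /* 0: use quality-based rate control instead */
  ti.quality = 48;
  /* Allocate the encoder. */
  enc = th_encode_alloc(&ti);
  th_info_clear(&ti);
  if(enc == NULL) return;
  /* Flush all of the header packets before any video data. */
  th_comment_init(&tc);
  while(th_encode_flushheader(enc, &tc, &op) > 0) write_packet(&op);
  th_comment_clear(&tc);
  /* Submit frames and flush the packets they produce. */
  last = !fetch_frame(ycbcr);
  while(!last){
    th_encode_ycbcr_in(enc, ycbcr);
    last = !fetch_frame(ycbcr);  /* was the submitted frame the final one? */
    while(th_encode_packetout(enc, last, &op) > 0) write_packet(&op);
  }
  th_encode_free(enc);
}
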
+/*@{*/
+/**Allocates an encoder instance.
+ * \param _info A #th_info struct filled with the desired encoding parameters.
+ * \return The initialized #th_enc_ctx handle.
+ * \retval NULL If the encoding parameters were invalid.*/
+extern th_enc_ctx *th_encode_alloc(const th_info *_info);
+/**Encoder control function.
+ * This is used to provide advanced control of the encoding process.
+ * \param _enc    A #th_enc_ctx handle.
+ * \param _req    The control code to process.
+ *                See \ref encctlcodes "the list of available control codes"
+ *                 for details.
+ * \param _buf    The parameters for this control code.
+ * \param _buf_sz The size of the parameter buffer.
+ * \return Possible return values depend on the control code used.
+ *          See \ref encctlcodes "the list of control codes" for
+ *          specific values. Generally 0 indicates success.*/
+extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
+/**Outputs the next header packet.
+ * This should be called repeatedly after encoder initialization until it
+ *  returns 0 in order to get all of the header packets, in order, before
+ *  encoding actual video data.
+ * \param _enc      A #th_enc_ctx handle.
+ * \param _comments The metadata to place in the comment header, when it is
+ *                   encoded.
+ * \param _op       An <tt>ogg_packet</tt> structure to fill.
+ *                  All of the elements of this structure will be set,
+ *                   including a pointer to the header data.
+ *                  The memory for the header data is owned by
+ *                   <tt>libtheoraenc</tt>, and may be invalidated when the
+ *                   next encoder function is called.
+ * \return A positive value indicates that a header packet was successfully
+ *          produced.
+ * \retval 0         No packet was produced, and no more header packets remain.
+ * \retval TH_EFAULT \a _enc, \a _comments, or \a _op was <tt>NULL</tt>.*/
+extern int th_encode_flushheader(th_enc_ctx *_enc,
+ th_comment *_comments,ogg_packet *_op);
+/**Submits an uncompressed frame to the encoder.
+ * \param _enc   A #th_enc_ctx handle.
+ * \param _ycbcr A buffer of Y'CbCr data to encode.
+ *               If the width and height of the buffer matches the frame size
+ *                the encoder was initialized with, the encoder will only
+ *                reference the portion inside the picture region.
+ *               Any data outside this region will be ignored, and need not map
+ *                to a valid address.
+ *               Alternatively, you can pass a buffer equal to the size of the
+ *                picture region, if this is less than the full frame size.
+ *               When using subsampled chroma planes, odd picture sizes or odd
+ *                picture offsets may require an unexpected chroma plane size,
+ *                and their use is generally discouraged, as they will not be
+ *                well-supported by players and other media frameworks.
+ *               See Section 4.4 of
+ *                <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *                specification</a> for details if you wish to use them anyway.
+ * \retval 0         Success.
+ * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
+ * \retval TH_EINVAL The buffer size matches neither the frame size nor the
+ *                    picture size the encoder was initialized with, or
+ *                    encoding has already completed.*/
+extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
+/**Retrieves encoded video data packets.
+ * This should be called repeatedly after each frame is submitted to flush any
+ *  encoded packets, until it returns 0.
+ * The encoder will not buffer these packets as subsequent frames are
+ *  compressed, so a failure to do so will result in lost video data.
+ * \note Currently the encoder operates in a one-frame-in, one-packet-out
+ *        manner.
+ *       However, this may be changed in the future.
+ * \param _enc  A #th_enc_ctx handle.
+ * \param _last Set this flag to a non-zero value if no more uncompressed
+ *               frames will be submitted.
+ *              This ensures that a proper EOS flag is set on the last packet.
+ * \param _op   An <tt>ogg_packet</tt> structure to fill.
+ *              All of the elements of this structure will be set, including a
+ *               pointer to the video data.
+ *              The memory for the video data is owned by
+ *               <tt>libtheoraenc</tt>, and may be invalidated when the next
+ *               encoder function is called.
+ * \return A positive value indicates that a video data packet was successfully
+ *          produced.
+ * \retval 0         No packet was produced, and no more encoded video data
+ *                    remains.
+ * \retval TH_EFAULT \a _enc or \a _op was <tt>NULL</tt>.*/
+extern int th_encode_packetout(th_enc_ctx *_enc,int _last,ogg_packet *_op);
+/**Frees an allocated encoder instance.
+ * \param _enc A #th_enc_ctx handle.*/
+extern void th_encode_free(th_enc_ctx *_enc);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

+ 53 - 0
modules/theoraplayer/native/theora/lib/Version_script

@@ -0,0 +1,53 @@
+#
+# Export file for libtheora
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_version_string;
+		theora_version_number;
+
+		theora_encode_init;
+		theora_encode_YUVin;
+		theora_encode_packetout;
+		theora_encode_header;
+		theora_encode_comment;
+		theora_encode_tables;
+
+		theora_decode_header;
+		theora_decode_init;
+		theora_decode_packetin;
+		theora_decode_YUVout;
+
+		theora_control;
+
+		theora_packet_isheader;
+		theora_packet_iskeyframe;
+
+		theora_granule_shift;
+		theora_granule_frame;
+		theora_granule_time;
+
+		theora_info_init;
+		theora_info_clear;
+
+		theora_clear;
+
+		theora_comment_init;
+		theora_comment_add;
+		theora_comment_add_tag;
+		theora_comment_query;
+		theora_comment_query_count;
+		theora_comment_clear;
+
+	local:
+		*;
+};

+ 82 - 0
modules/theoraplayer/native/theora/lib/Version_script-dec

@@ -0,0 +1,82 @@
+#
+# Export file for libtheoradec
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# The 1.x API
+libtheoradec_1.0
+{
+	global:
+		th_version_string;
+		th_version_number;
+
+		th_decode_headerin;
+		th_decode_alloc;
+		th_setup_free;
+		th_decode_ctl;
+		th_decode_packetin;
+		th_decode_ycbcr_out;
+		th_decode_free;
+
+		th_packet_isheader;
+		th_packet_iskeyframe;
+
+		th_granule_frame;
+		th_granule_time;
+
+		th_info_init;
+		th_info_clear;
+
+		th_comment_init;
+		th_comment_add;
+		th_comment_add_tag;
+		th_comment_query;
+		th_comment_query_count;
+		th_comment_clear;
+
+	local:
+		*;
+};
+
+# The deprecated legacy api from the libtheora alpha releases.
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_version_string;
+		theora_version_number;
+
+		theora_decode_header;
+		theora_decode_init;
+		theora_decode_packetin;
+		theora_decode_YUVout;
+
+		theora_control;
+
+		theora_packet_isheader;
+		theora_packet_iskeyframe;
+
+		theora_granule_shift;
+		theora_granule_frame;
+		theora_granule_time;
+
+		theora_info_init;
+		theora_info_clear;
+
+		theora_clear;
+
+		theora_comment_init;
+		theora_comment_add;
+		theora_comment_add_tag;
+		theora_comment_query;
+		theora_comment_query_count;
+		theora_comment_clear;
+
+	local:
+		*;
+};

+ 43 - 0
modules/theoraplayer/native/theora/lib/Version_script-enc

@@ -0,0 +1,43 @@
+#
+# Export file for libtheora
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# The 1.x encoder API
+libtheoraenc_1.0
+{
+	global:
+		th_encode_alloc;
+		th_encode_ctl;
+		th_encode_flushheader;
+		th_encode_ycbcr_in;
+		th_encode_packetout;
+		th_encode_free;
+
+		TH_VP31_QUANT_INFO;
+		TH_VP31_HUFF_CODES;
+
+	local:
+		*;
+};
+
+# The encoder portion of the deprecated alpha release api.
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_encode_init;
+		theora_encode_YUVin;
+		theora_encode_packetout;
+		theora_encode_header;
+		theora_encode_comment;
+		theora_encode_tables;
+
+	local:
+		*;
+};

+ 2712 - 0
modules/theoraplayer/native/theora/lib/analyze.c

@@ -0,0 +1,2712 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <limits.h>
+#include <string.h>
+#include "encint.h"
+#include "modedec.h"
+#if defined(OC_COLLECT_METRICS)
+# include "collect.c"
+#endif
+
+
+
+typedef struct oc_rd_metric          oc_rd_metric;
+typedef struct oc_mode_choice        oc_mode_choice;
+
+
+
+/*There are 8 possible schemes used to encode macro block modes.
+  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
+  The same set of Huffman codes is used for each of these 7 schemes, but the
+   mode assigned to each codeword varies.
+  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
+   while schemes 1-6 have a fixed mapping.
+  Scheme 7 just encodes each mode directly in 3 bits.*/
+
+/*The mode orderings for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.
+  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
+   decoder.*/
+static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
+  /*Last MV dominates.*/
+  /*L P M N I G GM 4*/
+  {3,4,2,0,1,5,6,7},
+  /*L P N M I G GM 4*/
+  {2,4,3,0,1,5,6,7},
+  /*L M P N I G GM 4*/
+  {3,4,1,0,2,5,6,7},
+  /*L M N P I G GM 4*/
+  {2,4,1,0,3,5,6,7},
+  /*No MV dominates.*/
+  /*N L P M I G GM 4*/
+  {0,4,3,1,2,5,6,7},
+  /*N G L P M I GM 4*/
+  {0,5,4,2,3,1,6,7},
+  /*Default ordering.*/
+  /*N I M L P G GM 4*/
+  {0,1,2,3,4,5,6,7}
+};
+
+
+
+/*Initialize the mode scheme chooser.
+  This need only be called once per encoder.*/
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
+  int si;
+  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
+  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
+}
+
+/*Reset the mode scheme chooser.
+  This needs to be called once for each frame, including the first.*/
+static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
+  int si;
+  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
+  /*Scheme 0 starts with 24 bits to store the mode list in.*/
+  _chooser->scheme_bits[0]=24;
+  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
+  for(si=0;si<8;si++){
+    /*Scheme 7 should always start first, and scheme 0 should always start
+       last.*/
+    _chooser->scheme_list[si]=7-si;
+    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
+  }
+}
+
+/*Return the cost of coding _mb_mode in the specified scheme.*/
+static int oc_mode_scheme_chooser_scheme_mb_cost(
+ const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
+  int codebook;
+  int ri;
+  codebook=_scheme+1>>3;
+  /*For any scheme except 0, we can just use the bit cost of the mode's rank
+     in that scheme.*/
+  ri=_chooser->mode_ranks[_scheme][_mb_mode];
+  if(_scheme==0){
+    int mc;
+    /*For scheme 0, incrementing the mode count could potentially change the
+       mode's rank.
+      Find the index where the mode would be moved to in the optimal list,
+       and use its bit cost instead of the one for the mode's current
+       position in the list.*/
+    /*We don't actually reorder the list; this is for computing opportunity
+       cost, not an update.*/
+    mc=_chooser->mode_counts[_mb_mode];
+    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
+  }
+  return OC_MODE_BITS[codebook][ri];
+}
+
+/*This is the real purpose of this data structure: not actually selecting a
+   mode scheme, but estimating the cost of coding a given mode given all the
+   modes selected so far.
+  This is done via opportunity cost: the cost is defined as the number of bits
+   required to encode all the modes selected so far including the current one
+   using the best possible scheme, minus the number of bits required to encode
+   all the modes selected so far not including the current one using the best
+   possible scheme.
+  The computational expense of doing this probably makes it overkill.
+  Just be happy we take a greedy approach instead of trying to solve the
+   global mode-selection problem (which is NP-hard).
+  _mb_mode: The mode to determine the cost of.
+  Return: The number of bits required to code this mode.*/
+static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int scheme0;
+  int scheme1;
+  int best_bits;
+  int mode_bits;
+  int si;
+  int scheme0_bits;
+  int scheme1_bits;
+  scheme0=_chooser->scheme_list[0];
+  scheme1=_chooser->scheme_list[1];
+  scheme0_bits=_chooser->scheme_bits[scheme0];
+  scheme1_bits=_chooser->scheme_bits[scheme1];
+  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
+  /*Typical case: If the difference between the best scheme and the next best
+     is greater than 6 bits, then adding just one mode cannot change which
+     scheme we use.*/
+  if(scheme1_bits-scheme0_bits>6)return mode_bits;
+  /*Otherwise, check to see if adding this mode selects a different scheme as
+     the best.*/
+  si=1;
+  best_bits=scheme0_bits+mode_bits;
+  do{
+    int cur_bits;
+    cur_bits=scheme1_bits+
+     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
+    if(cur_bits<best_bits)best_bits=cur_bits;
+    if(++si>=8)break;
+    scheme1=_chooser->scheme_list[si];
+    scheme1_bits=_chooser->scheme_bits[scheme1];
+  }
+  while(scheme1_bits-scheme0_bits<=6);
+  return best_bits-scheme0_bits;
+}
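
A toy illustration of the opportunity-cost idea described in the comment above (not part of libtheora; it uses flat hypothetical arrays instead of the chooser's incremental state and ignores scheme 0's adaptive ranking): the cost of a mode is the best achievable total with it minus the best achievable total without it.

#include <limits.h>

/* bits_so_far[s]: bits scheme s would have used for the modes chosen so far.
   mode_bits[s][m]: bits scheme s needs to code mode m.
   Returns the opportunity cost, in bits, of choosing mode m next. */
static int toy_mode_cost(const int *bits_so_far, const int (*mode_bits)[8],
 int nschemes, int m){
  int best_before;
  int best_after;
  int s;
  best_before = best_after = INT_MAX;
  for(s = 0; s < nschemes; s++){
    if(bits_so_far[s] < best_before) best_before = bits_so_far[s];
    if(bits_so_far[s] + mode_bits[s][m] < best_after){
      best_after = bits_so_far[s] + mode_bits[s][m];
    }
  }
  return best_after - best_before;
}
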
+
+/*Incrementally update the mode counts and per-scheme bit counts and re-order
+   the scheme lists once a mode has been selected.
+  _mb_mode: The mode that was chosen.*/
+static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int ri;
+  int si;
+  _chooser->mode_counts[_mb_mode]++;
+  /*Re-order the scheme0 mode list if necessary.*/
+  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
+    int pmode;
+    pmode=_chooser->scheme0_list[ri-1];
+    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
+    /*Reorder the mode ranking.*/
+    _chooser->scheme0_ranks[pmode]++;
+    _chooser->scheme0_list[ri]=pmode;
+  }
+  _chooser->scheme0_ranks[_mb_mode]=ri;
+  _chooser->scheme0_list[ri]=_mb_mode;
+  /*Now add the bit cost for the mode to each scheme.*/
+  for(si=0;si<8;si++){
+    _chooser->scheme_bits[si]+=
+     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
+  }
+  /*Finally, re-order the list of schemes.*/
+  for(si=1;si<8;si++){
+    int sj;
+    int scheme0;
+    int bits0;
+    sj=si;
+    scheme0=_chooser->scheme_list[si];
+    bits0=_chooser->scheme_bits[scheme0];
+    do{
+      int scheme1;
+      scheme1=_chooser->scheme_list[sj-1];
+      if(bits0>=_chooser->scheme_bits[scheme1])break;
+      _chooser->scheme_list[sj]=scheme1;
+    }
+    while(--sj>0);
+    _chooser->scheme_list[sj]=scheme0;
+  }
+}
+
+
+
+/*The number of bits required to encode a super block run.
+  _run_count: The desired run count; must be positive and less than 4130.*/
+static int oc_sb_run_bits(int _run_count){
+  int i;
+  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
+  return OC_SB_RUN_CODE_NBITS[i];
+}
+
+/*The number of bits required to encode a block run.
+  _run_count: The desired run count; must be positive and less than 30.*/
+static int oc_block_run_bits(int _run_count){
+  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
+}
+
+
+
+static void oc_fr_state_init(oc_fr_state *_fr){
+  _fr->bits=0;
+  _fr->sb_partial_count=0;
+  _fr->sb_full_count=0;
+  _fr->b_coded_count_prev=0;
+  _fr->b_coded_count=0;
+  _fr->b_count=0;
+  _fr->sb_prefer_partial=0;
+  _fr->sb_bits=0;
+  _fr->sb_partial=-1;
+  _fr->sb_full=-1;
+  _fr->b_coded_prev=-1;
+  _fr->b_coded=-1;
+}
+
+
+static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
+ int _sb_partial,int _sb_full){
+  int bits;
+  int sb_partial_count;
+  int sb_full_count;
+  bits=0;
+  sb_partial_count=_fr->sb_partial_count;
+  /*Extend the sb_partial run, or start a new one.*/
+  if(_fr->sb_partial==_sb_partial){
+    if(sb_partial_count>=4129){
+      bits++;
+      sb_partial_count=0;
+    }
+    else bits-=oc_sb_run_bits(sb_partial_count);
+  }
+  else sb_partial_count=0;
+  bits+=oc_sb_run_bits(++sb_partial_count);
+  if(!_sb_partial){
+    /*Extend the sb_full run, or start a new one.*/
+    sb_full_count=_fr->sb_full_count;
+    if(_fr->sb_full==_sb_full){
+      if(sb_full_count>=4129){
+        bits++;
+        sb_full_count=0;
+      }
+      else bits-=oc_sb_run_bits(sb_full_count);
+    }
+    else sb_full_count=0;
+    bits+=oc_sb_run_bits(++sb_full_count);
+  }
+  return bits;
+}
+
+static void oc_fr_state_advance_sb(oc_fr_state *_fr,
+ int _sb_partial,int _sb_full){
+  int sb_partial_count;
+  int sb_full_count;
+  sb_partial_count=_fr->sb_partial_count;
+  if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
+  sb_partial_count++;
+  if(!_sb_partial){
+    sb_full_count=_fr->sb_full_count;
+    if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
+    sb_full_count++;
+    _fr->sb_full_count=sb_full_count;
+    _fr->sb_full=_sb_full;
+    /*Roll back the partial block state.*/
+    _fr->b_coded=_fr->b_coded_prev;
+    _fr->b_coded_count=_fr->b_coded_count_prev;
+  }
+  else{
+    /*Commit back the partial block state.*/
+    _fr->b_coded_prev=_fr->b_coded;
+    _fr->b_coded_count_prev=_fr->b_coded_count;
+  }
+  _fr->sb_partial_count=sb_partial_count;
+  _fr->sb_partial=_sb_partial;
+  _fr->b_count=0;
+  _fr->sb_prefer_partial=0;
+  _fr->sb_bits=0;
+}
+
+/*Commit the state of the current super block and advance to the next.*/
+static void oc_fr_state_flush_sb(oc_fr_state *_fr){
+  int sb_partial;
+  int sb_full;
+  int b_coded_count;
+  int b_count;
+  b_count=_fr->b_count;
+  b_coded_count=_fr->b_coded_count;
+  sb_full=_fr->b_coded;
+  sb_partial=b_coded_count<b_count;
+  if(!sb_partial){
+    /*If the super block is fully coded/uncoded...*/
+    if(_fr->sb_prefer_partial){
+      /*So far coding this super block as partial was cheaper anyway.*/
+      if(b_coded_count>15||_fr->b_coded_prev<0){
+        int sb_bits;
+        /*If the block run is too long, this will limit how far it can be
+           extended into the next partial super block.
+          If we need to extend it farther, we don't want to have to roll all
+           the way back here (since there could be many full SBs between now
+           and then), so we disallow this.
+          Similarly, if this is the start of a stripe, we don't know the
+           length of the outstanding block run from the previous stripe.*/
+        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
+        _fr->bits+=sb_bits-_fr->sb_bits;
+        _fr->sb_bits=sb_bits;
+      }
+      else sb_partial=1;
+    }
+  }
+  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
+}
+
+static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
+  ptrdiff_t bits;
+  int       sb_bits;
+  int       b_coded_count;
+  int       b_count;
+  int       sb_prefer_partial;
+  sb_bits=_fr->sb_bits;
+  bits=_fr->bits-sb_bits;
+  b_count=_fr->b_count;
+  b_coded_count=_fr->b_coded_count;
+  sb_prefer_partial=_fr->sb_prefer_partial;
+  if(b_coded_count>=b_count){
+    int sb_partial_bits;
+    /*This super block is currently fully coded/uncoded.*/
+    if(b_count<=0){
+      /*This is the first block in this SB.*/
+      b_count=1;
+      /*Check to see whether it's cheaper to code it partially or fully.*/
+      if(_fr->b_coded==_b_coded){
+        sb_partial_bits=-oc_block_run_bits(b_coded_count);
+        sb_partial_bits+=oc_block_run_bits(++b_coded_count);
+      }
+      else{
+        b_coded_count=1;
+        sb_partial_bits=2;
+      }
+      sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
+      sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+      sb_prefer_partial=sb_partial_bits<sb_bits;
+      sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
+    }
+    else if(_fr->b_coded==_b_coded){
+      b_coded_count++;
+      if(++b_count<16){
+        if(sb_prefer_partial){
+          /*Check to see if it's cheaper to code it fully.*/
+          sb_partial_bits=sb_bits;
+          sb_partial_bits+=oc_block_run_bits(b_coded_count);
+          if(b_coded_count>0){
+            sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
+          }
+          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+          sb_prefer_partial=sb_partial_bits<sb_bits;
+          sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
+        }
+        /*There's no need to check the converse (whether it's cheaper to code
+           this SB partially if we were coding it fully), since the cost to
+           code a SB partially can only increase as we add more blocks, whereas
+           the cost to code it fully stays constant.*/
+      }
+      else{
+        /*If we get to the end and this SB is still full, then force it to be
+           coded full.
+          Otherwise we might not be able to extend the block run far enough
+           into the next partial SB.*/
+        if(sb_prefer_partial){
+          sb_prefer_partial=0;
+          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+        }
+      }
+    }
+    else{
+      /*This SB was full, but now must be made partial.*/
+      if(!sb_prefer_partial){
+        sb_bits=oc_block_run_bits(b_coded_count);
+        if(b_coded_count>b_count){
+          sb_bits-=oc_block_run_bits(b_coded_count-b_count);
+        }
+        sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
+      }
+      b_count++;
+      b_coded_count=1;
+      sb_prefer_partial=1;
+      sb_bits+=2;
+    }
+  }
+  else{
+    b_count++;
+    if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
+    else b_coded_count=0;
+    sb_bits+=oc_block_run_bits(++b_coded_count);
+  }
+  _fr->bits=bits+sb_bits;
+  _fr->b_coded_count=b_coded_count;
+  _fr->b_coded=_b_coded;
+  _fr->b_count=b_count;
+  _fr->sb_prefer_partial=sb_prefer_partial;
+  _fr->sb_bits=sb_bits;
+}
+
+static void oc_fr_skip_block(oc_fr_state *_fr){
+  oc_fr_state_advance_block(_fr,0);
+}
+
+static void oc_fr_code_block(oc_fr_state *_fr){
+  oc_fr_state_advance_block(_fr,1);
+}
+
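+/*oc_fr_cost1() returns the number of extra flag bits needed to code the next
+   block rather than skip it; oc_fr_cost4() returns the extra flag bits the
+   actual _post state spent compared to skipping all four luma blocks of a
+   macro block starting from the _pre state.
+  The *&tmp=*_fr assignments below are plain structure copies used to take a
+   scratch snapshot of the flag-coder state.*/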
+static int oc_fr_cost1(const oc_fr_state *_fr){
+  oc_fr_state tmp;
+  ptrdiff_t   bits;
+  *&tmp=*_fr;
+  oc_fr_skip_block(&tmp);
+  bits=tmp.bits;
+  *&tmp=*_fr;
+  oc_fr_code_block(&tmp);
+  return (int)(tmp.bits-bits);
+}
+
+static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
+  oc_fr_state tmp;
+  *&tmp=*_pre;
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  return (int)(_post->bits-tmp.bits);
+}
+
+
+
+static void oc_qii_state_init(oc_qii_state *_qs){
+  _qs->bits=0;
+  _qs->qi01_count=0;
+  _qs->qi01=-1;
+  _qs->qi12_count=0;
+  _qs->qi12=-1;
+}
+
+
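+/*Advances the quantizer-index flag coder from the state in _qs, writing the
+   result to _qd, for a block coded with quantizer list index _qii.
+  qii is signalled with up to two binary flags: qi01 distinguishes qii==0 from
+   qii>0 and, when qii>0, qi12 distinguishes qii==1 from qii==2; each flag
+   sequence is run-length coded with the same 4129-limited run codes used for
+   the super block flags.*/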
+static void oc_qii_state_advance(oc_qii_state *_qd,
+ const oc_qii_state *_qs,int _qii){
+  ptrdiff_t bits;
+  int       qi01;
+  int       qi01_count;
+  int       qi12;
+  int       qi12_count;
+  bits=_qs->bits;
+  qi01=_qii+1>>1;
+  qi01_count=_qs->qi01_count;
+  if(qi01==_qs->qi01){
+    if(qi01_count>=4129){
+      bits++;
+      qi01_count=0;
+    }
+    else bits-=oc_sb_run_bits(qi01_count);
+  }
+  else qi01_count=0;
+  qi01_count++;
+  bits+=oc_sb_run_bits(qi01_count);
+  qi12_count=_qs->qi12_count;
+  if(_qii){
+    qi12=_qii>>1;
+    if(qi12==_qs->qi12){
+      if(qi12_count>=4129){
+        bits++;
+        qi12_count=0;
+      }
+      else bits-=oc_sb_run_bits(qi12_count);
+    }
+    else qi12_count=0;
+    qi12_count++;
+    bits+=oc_sb_run_bits(qi12_count);
+  }
+  else qi12=_qs->qi12;
+  _qd->bits=bits;
+  _qd->qi01=qi01;
+  _qd->qi01_count=qi01_count;
+  _qd->qi12=qi12;
+  _qd->qi12_count=qi12_count;
+}
+
+
+
+static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
+  ptrdiff_t *coded_fragis;
+  unsigned   mcu_nvsbs;
+  ptrdiff_t  mcu_nfrags;
+  int        flimit;
+  int        hdec;
+  int        vdec;
+  int        pli;
+  int        nqis;
+  int        qii;
+  int        qi0;
+  int        qti;
+  /*Initialize the per-plane coded block flag trackers.
+    These are used for bit-estimation purposes only; the real flag bits span
+     all three planes, so we can't compute them in parallel.*/
+  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
+  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
+  /*Set up the per-plane skip SSD storage pointers.*/
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
+  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
+  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
+  /*Set up per-plane pointers to the coded and uncoded fragments lists.
+    Unlike the decoder, each plane's coded and uncoded fragment list is kept
+     separate during the analysis stage; we only make the coded list for all
+     three planes contiguous right before the final packet is output
+     (destroying the uncoded lists, which are no longer needed).*/
+  coded_fragis=_enc->state.coded_fragis;
+  for(pli=0;pli<3;pli++){
+    _pipe->coded_fragis[pli]=coded_fragis;
+    coded_fragis+=_enc->state.fplanes[pli].nfrags;
+    _pipe->uncoded_fragis[pli]=coded_fragis;
+  }
+  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
+  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
+  /*Set up condensed quantizer tables.*/
+  qi0=_enc->state.qis[0];
+  nqis=_enc->state.nqis;
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<nqis;qii++){
+      int qi;
+      qi=_enc->state.qis[qii];
+      for(qti=0;qti<2;qti++){
+        /*Set the DC coefficient in the dequantization table.*/
+        _enc->state.dequant_tables[qi][pli][qti][0]=
+         _enc->dequant_dc[qi0][pli][qti];
+        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+        /*Copy over the quantization table.*/
+        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
+         _enc->opt_data.enquant_table_size);
+      }
+    }
+  }
+  /*Fix up the DC coefficients in the quantization tables.*/
+  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
+  /*Initialize the tokenization state.*/
+  for(pli=0;pli<3;pli++){
+    _pipe->ndct_tokens1[pli]=0;
+    _pipe->eob_run1[pli]=0;
+  }
+  /*Initialize the bounding value array for the loop filter.*/
+  flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
+  /*Clear the temporary DCT scratch space.*/
+  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
+}
+
+/*Sets the current MCU stripe to super block row _sby.
+  Return: A non-zero value if this was not the last MCU.*/
+static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _sby){
+  const oc_fragment_plane *fplane;
+  unsigned                 mcu_nvsbs;
+  int                      sby_end;
+  int                      notdone;
+  int                      vdec;
+  int                      pli;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  sby_end=_enc->state.fplanes[0].nvsbs;
+  notdone=_sby+mcu_nvsbs<sby_end;
+  if(notdone)sby_end=_sby+mcu_nvsbs;
+  vdec=0;
+  for(pli=0;pli<3;pli++){
+    fplane=_enc->state.fplanes+pli;
+    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
+    _pipe->fragy0[pli]=_sby<<2-vdec;
+    _pipe->froffset[pli]=fplane->froffset
+     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
+    if(notdone){
+      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
+      _pipe->fragy_end[pli]=sby_end<<2-vdec;
+    }
+    else{
+      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
+      _pipe->fragy_end[pli]=fplane->nvfrags;
+    }
+    vdec=!(_enc->state.info.pixel_fmt&2);
+  }
+  return notdone;
+}
+
+static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
+  /*Copy over all the uncoded fragments from this plane and advance the uncoded
+     fragment list.*/
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_enc->state,
+     _enc->state.ref_frame_data[OC_FRAME_SELF],
+     _enc->state.ref_frame_data[OC_FRAME_PREV],
+     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
+    _pipe->nuncoded_fragis[_pli]=0;
+  }
+  /*Perform DC prediction.*/
+  oc_enc_pred_dc_frag_rows(_enc,_pli,
+   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
+  /*Finish DC tokenization.*/
+  oc_enc_tokenize_dc_frag_list(_enc,_pli,
+   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
+   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
+  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
+  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
+  /*And advance the coded fragment list.*/
+  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->ncoded_fragis[_pli]=0;
+  /*Apply the loop filter if necessary.*/
+  if(_pipe->loop_filter){
+    oc_state_loop_filter_frag_rows(&_enc->state,
+     _pipe->bounding_values,OC_FRAME_SELF,_pli,
+     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+  }
+  else _sdelay=_edelay=0;
+  /*To fill borders, we have an additional two pixel delay, since a fragment
+     in the next row could filter its top edge, using two pixels from a
+     fragment in this row.
+    But there's no reason to delay a full fragment between the two.*/
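+  /*(The <<3 terms below convert fragment rows to pixel rows; the additional
+     _sdelay<<1 and _edelay<<1 terms account for that extra two-pixel delay.)*/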
+  oc_state_borders_fill_rows(&_enc->state,
+   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
+   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
+   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
+}
+
+
+
+/*Cost information about the coded blocks in a MB.*/
+struct oc_rd_metric{
+  int uncoded_ac_ssd;
+  int coded_ac_ssd;
+  int ac_bits;
+  int dc_flag;
+};
+
+
+
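+/*Transforms, quantizes, and tokenizes a single fragment.
+  For delta frames (_fr non-NULL) it also decides whether the result is worth
+   keeping: if skipping the fragment is cheaper in R-D terms, the token log is
+   rolled back and the fragment is marked uncoded.
+  Return: 1 if the fragment was coded, or 0 if it was skipped.*/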
+static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
+ unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
+ oc_fr_state *_fr,oc_token_checkpoint **_stack){
+  ogg_int16_t            *data;
+  ogg_int16_t            *dct;
+  ogg_int16_t            *idct;
+  oc_qii_state            qs;
+  const ogg_uint16_t     *dequant;
+  ogg_uint16_t            dequant_dc;
+  ptrdiff_t               frag_offs;
+  int                     ystride;
+  const unsigned char    *src;
+  const unsigned char    *ref;
+  unsigned char          *dst;
+  int                     nonzero;
+  unsigned                uncoded_ssd;
+  unsigned                coded_ssd;
+  oc_token_checkpoint    *checkpoint;
+  oc_fragment            *frags;
+  int                     mb_mode;
+  int                     refi;
+  int                     mv_offs[2];
+  int                     nmv_offs;
+  int                     ac_bits;
+  int                     borderi;
+  int                     nqis;
+  int                     qti;
+  int                     qii;
+  int                     dc;
+  nqis=_enc->state.nqis;
+  frags=_enc->state.frags;
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  ystride=_enc->state.ref_ystride[_pli];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
+  borderi=frags[_fragi].borderi;
+  qii=frags[_fragi].qii;
+  data=_enc->pipe.dct_data;
+  dct=data+64;
+  idct=data+128;
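+  /*A qii value of 4 or more marks a block that the mode analysis preferred
+     to skip (oc_analyze_mb_mode_luma() and oc_analyze_mb_mode_chroma() add 4
+     to the block's qii when skipping looked cheaper); honor that hint when
+     early skip detection is enabled, and otherwise mask it off and try coding
+     the block anyway.*/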
+  if(qii&~3){
+#if !defined(OC_COLLECT_METRICS)
+    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
+      /*Enable early skip detection.*/
+      frags[_fragi].coded=0;
+      frags[_fragi].refi=OC_FRAME_NONE;
+      oc_fr_skip_block(_fr);
+      return 0;
+    }
+#endif
+    /*Try to code this block anyway.*/
+    qii&=3;
+  }
+  refi=frags[_fragi].refi;
+  mb_mode=frags[_fragi].mb_mode;
+  ref=_enc->state.ref_frame_data[refi]+frag_offs;
+  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
+  /*Motion compensation:*/
+  switch(mb_mode){
+    case OC_MODE_INTRA:{
+      nmv_offs=0;
+      oc_enc_frag_sub_128(_enc,data,src,ystride);
+    }break;
+    case OC_MODE_GOLDEN_NOMV:
+    case OC_MODE_INTER_NOMV:{
+      nmv_offs=1;
+      mv_offs[0]=0;
+      oc_enc_frag_sub(_enc,data,src,ref,ystride);
+    }break;
+    default:{
+      const oc_mv *frag_mvs;
+      frag_mvs=_enc->state.frag_mvs;
+      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
+       _pli,frag_mvs[_fragi]);
+      if(nmv_offs>1){
+        oc_enc_frag_copy2(_enc,dst,
+         ref+mv_offs[0],ref+mv_offs[1],ystride);
+        oc_enc_frag_sub(_enc,data,src,dst,ystride);
+      }
+      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
+    }break;
+  }
+#if defined(OC_COLLECT_METRICS)
+  {
+    unsigned sad;
+    unsigned satd;
+    switch(nmv_offs){
+      case 0:{
+        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
+        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
+      }break;
+      case 1:{
+        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
+        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
+        satd+=abs(dc);
+      }break;
+      default:{
+        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
+        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
+        satd+=abs(dc);
+      }break;
+    }
+    _enc->frag_sad[_fragi]=sad;
+    _enc->frag_satd[_fragi]=satd;
+  }
+#endif
+  /*Transform:*/
+  oc_enc_fdct8x8(_enc,dct,data);
+  /*Quantize:*/
+  qti=mb_mode!=OC_MODE_INTRA;
+  dequant=_enc->dequant[_pli][qii][qti];
+  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
+  dc=data[0];
+  /*Tokenize.*/
+  checkpoint=*_stack;
+  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
+  else{
+    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
+  /*Reconstruct.
+    TODO: nonzero may need to be adjusted after tokenization.*/
+  dequant_dc=dequant[0];
+  if(nonzero==0){
+    ogg_int16_t p;
+    int         ci;
+    int         qi01;
+    int         qi12;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)data[ci]=p;
+    /*We didn't code any AC coefficients, so don't change the quantizer.*/
+    qi01=_pipe->qs[_pli].qi01;
+    qi12=_pipe->qs[_pli].qi12;
+    if(qi01>0)qii=1+qi12;
+    else if(qi01>=0)qii=0;
+  }
+  else{
+    idct[0]=dc*dequant_dc;
+    /*Note: This clears idct[] back to zero for the next block.*/
+    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
+  }
+  frags[_fragi].qii=qii;
+  if(nqis>1){
+    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
+    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
+  }
+  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
+  else{
+    oc_enc_frag_recon_inter(_enc,dst,
+     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
+  }
+  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
+#if !defined(OC_COLLECT_METRICS)
+  if(_fr!=NULL)
+#endif
+  {
+    /*In retrospect, should we have skipped this block?*/
+    if(borderi<0){
+      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
+    }
+    else{
+      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
+       _enc->state.borders[borderi].mask);
+    }
+    /*Scale to match DCT domain.*/
+    coded_ssd<<=4;
+#if defined(OC_COLLECT_METRICS)
+    _enc->frag_ssd[_fragi]=coded_ssd;
+  }
+  if(_fr!=NULL){
+#endif
+    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
+    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
+    if(uncoded_ssd<UINT_MAX&&
+     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
+        is enabled.*/
+     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
+      int overhead_bits;
+      overhead_bits=oc_fr_cost1(_fr);
+      /*Although the fragment coding overhead determination is accurate, it is
+         greedy, using very coarse-grained local information.
+        Allowing it to mildly discourage coding turns out to be beneficial, but
+         it's not clear that allowing it to encourage coding through negative
+         coding overhead deltas is useful.
+        For that reason, we disallow negative coding overheads.*/
+      if(overhead_bits<0)overhead_bits=0;
+      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
+        /*Hm, not worth it; roll back.*/
+        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
+        *_stack=checkpoint;
+        frags[_fragi].coded=0;
+        frags[_fragi].refi=OC_FRAME_NONE;
+        oc_fr_skip_block(_fr);
+        return 0;
+      }
+    }
+    else _mo->dc_flag=1;
+    _mo->uncoded_ac_ssd+=uncoded_ssd;
+    _mo->coded_ac_ssd+=coded_ssd;
+    _mo->ac_bits+=ac_bits;
+    oc_fr_code_block(_fr);
+  }
+  /*GCC 4.4.4 generates a warning here because it can't tell that
+     the init code in the nqis check above will run anytime this
+     line runs.*/
+  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
+  frags[_fragi].dc=dc;
+  frags[_fragi].coded=1;
+  return 1;
+}
+
+static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
+ const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
+  /*Worst case token stack usage for 4 fragments.*/
+  oc_token_checkpoint  stack[64*4];
+  oc_token_checkpoint *stackptr;
+  const oc_sb_map     *sb_maps;
+  signed char         *mb_modes;
+  oc_fragment         *frags;
+  ptrdiff_t           *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t           *uncoded_fragis;
+  ptrdiff_t            nuncoded_fragis;
+  oc_rd_metric         mo;
+  oc_fr_state          fr_checkpoint;
+  oc_qii_state         qs_checkpoint;
+  int                  mb_mode;
+  int                  refi;
+  int                  ncoded;
+  ptrdiff_t            fragi;
+  int                  bi;
+  *&fr_checkpoint=*(_pipe->fr+0);
+  *&qs_checkpoint=*(_pipe->qs+0);
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_modes=_enc->state.mb_modes;
+  frags=_enc->state.frags;
+  coded_fragis=_pipe->coded_fragis[0];
+  ncoded_fragis=_pipe->ncoded_fragis[0];
+  uncoded_fragis=_pipe->uncoded_fragis[0];
+  nuncoded_fragis=_pipe->nuncoded_fragis[0];
+  mb_mode=mb_modes[_mbi];
+  refi=OC_FRAME_FOR_MODE(mb_mode);
+  ncoded=0;
+  stackptr=stack;
+  memset(&mo,0,sizeof(mo));
+  for(bi=0;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].refi=refi;
+    frags[fragi].mb_mode=mb_mode;
+    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
+     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
+      coded_fragis[ncoded_fragis++]=fragi;
+      ncoded++;
+    }
+    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+  }
+  if(ncoded>0&&!mo.dc_flag){
+    int cost;
+    /*Some individual blocks were worth coding.
+      See if that's still true when accounting for mode and MV overhead.*/
+    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
+     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
+    if(mo.uncoded_ac_ssd<=cost){
+      /*Taking macroblock overhead into account, it is not worth coding this
+         MB.*/
+      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
+      *(_pipe->fr+0)=*&fr_checkpoint;
+      *(_pipe->qs+0)=*&qs_checkpoint;
+      for(bi=0;bi<4;bi++){
+        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+        if(frags[fragi].coded){
+          *(uncoded_fragis-++nuncoded_fragis)=fragi;
+          frags[fragi].coded=0;
+          frags[fragi].refi=OC_FRAME_NONE;
+        }
+        oc_fr_skip_block(_pipe->fr+0);
+      }
+      ncoded_fragis-=ncoded;
+      ncoded=0;
+    }
+  }
+  /*If no luma blocks coded, the mode is forced.*/
+  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
+  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
+     with a single coded block.
+    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
+     skipped blocks, while a 1MV does not.*/
+  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
+    mb_modes[_mbi]=OC_MODE_INTER_MV;
+  }
+  _pipe->ncoded_fragis[0]=ncoded_fragis;
+  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
+  return ncoded;
+}
+
+static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
+  const ogg_uint16_t *mcu_rd_scale;
+  const ogg_uint16_t *mcu_rd_iscale;
+  const oc_sb_map    *sb_maps;
+  oc_sb_flags        *sb_flags;
+  oc_fr_state        *fr;
+  ptrdiff_t          *coded_fragis;
+  ptrdiff_t           ncoded_fragis;
+  ptrdiff_t          *uncoded_fragis;
+  ptrdiff_t           nuncoded_fragis;
+  ptrdiff_t           froffset;
+  int                 sbi;
+  fr=_pipe->fr+_pli;
+  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
+  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  uncoded_fragis=_pipe->uncoded_fragis[_pli];
+  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
+  froffset=_pipe->froffset[_pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    /*Worst case token stack usage for 1 fragment.*/
+    oc_token_checkpoint stack[64];
+    oc_rd_metric        mo;
+    int                 quadi;
+    int                 bi;
+    memset(&mo,0,sizeof(mo));
+    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
+      ptrdiff_t fragi;
+      fragi=sb_maps[sbi][quadi][bi];
+      if(fragi>=0){
+        oc_token_checkpoint *stackptr;
+        unsigned             rd_scale;
+        unsigned             rd_iscale;
+        rd_scale=mcu_rd_scale[fragi-froffset];
+        rd_iscale=mcu_rd_iscale[fragi-froffset];
+        stackptr=stack;
+        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
+         rd_scale,rd_iscale,&mo,fr,&stackptr)){
+          coded_fragis[ncoded_fragis++]=fragi;
+        }
+        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+      }
+    }
+    oc_fr_state_flush_sb(fr);
+    sb_flags[sbi].coded_fully=fr->sb_full;
+    sb_flags[sbi].coded_partially=fr->sb_partial;
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
+}
+
+/*Mode decision is done by exhaustively examining all potential choices.
+  Obviously, doing the motion compensation, fDCT, tokenization, and then
+   counting the bits each token uses is computationally expensive.
+  Theora's EOB runs can also split the cost of these tokens across multiple
+   fragments, and naturally we don't know what the optimal choice of Huffman
+   codes will be until we know all the tokens we're going to encode in all the
+   fragments.
+  So we use a simple approach to estimating the bit cost and distortion of each
+   mode based upon the SATD value of the residual before coding.
+  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
+   the process (modified somewhat from that of the paper) is very simple.
+  We build a non-linear regression of the mappings from
+   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
+   SSD for each qi.
+  A separate set of mappings is kept for each quantization type and color
+   plane.
+  The mappings are constructed by partitioning the SATD values into a small
+   number of bins (currently 24) and using a linear regression in each bin
+   (as opposed to the 0th-order regression used by Kim).
+  The bit counts and SSD measurements are obtained by examining actual encoded
+   frames, with appropriate lambda values and optimal Huffman codes selected.
+  EOB bits are assigned to the fragment that started the EOB run (as opposed to
+   dividing them among all the blocks in the run; the latter approach seems
+   more theoretically correct, but Monty's testing showed a small improvement
+   with the former, though that may have been merely statistical noise).
+
+  @ARTICLE{Kim03,
+    author="Hyun Mun Kim",
+    title="Adaptive Rate Control Using Nonlinear Regression",
+    journal="IEEE Transactions on Circuits and Systems for Video Technology",
+    volume=13,
+    number=5,
+    pages="432--439",
+    month=May,
+    year=2003
+  }*/
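+
+/*In the implementation below, OC_MODE_RD_SATD and OC_MODE_RD_SAD store one
+   such mapping per log-quantizer row, color plane, quantization type, and
+   SATD (or SAD) bin.
+  oc_enc_mode_rd_init() interpolates between the two log-quantizer rows that
+   bracket each active qi, and oc_dct_cost2() then interpolates between
+   adjacent SATD bins at lookup time.*/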
+
+/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
+   overflow for large lambda values.*/
+#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
+ ((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
+ +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
+ +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
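+/*Writing S=1<<OC_BIT_SCALE, the macro above evaluates
+   floor(_ssd/S)+floor(_rate/S)*_lambda
+    +floor((_ssd%S+(_rate%S)*_lambda+S/2)/S),
+  which equals floor((_ssd+_rate*_lambda+S/2)/S) exactly, but with
+   intermediate products roughly OC_BIT_SCALE bits smaller than forming
+   _ssd+_rate*_lambda directly.*/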
+
+static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
+#if !defined(OC_COLLECT_METRICS)
+  const
+#endif
+  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
+   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
+  int qii;
+#if defined(OC_COLLECT_METRICS)
+  oc_enc_mode_metrics_load(_enc);
+#endif
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    int qi;
+    int pli;
+    qi=_enc->state.qis[qii];
+    for(pli=0;pli<3;pli++){
+      int qti;
+      for(qti=0;qti<2;qti++){
+        int log_plq;
+        int modeline;
+        int bin;
+        int dx;
+        int dq;
+        log_plq=_enc->log_plq[qi][pli][qti];
+        /*Find the pair of rows in the mode table that bracket this quantizer.
+          If it falls outside the range the table covers, then we just use a
+           pair on the edge for linear extrapolation.*/
+        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
+         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
+        /*Interpolate a row for this quantizer.*/
+        dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
+        dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
+        if(dq==0)dq=1;
+        for(bin=0;bin<OC_COMP_BINS;bin++){
+          int y0;
+          int z0;
+          int dy;
+          int dz;
+          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
+          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
+          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
+          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
+          _enc->mode_rd[qii][pli][qti][bin].rate=
+           (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
+          _enc->mode_rd[qii][pli][qti][bin].rmse=
+           (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
+        }
+      }
+    }
+  }
+}
+
+/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
+   prediction.*/
+static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
+ int _qii,int _pli,int _qti,int _satd){
+  unsigned rmse;
+  int      shift;
+  int      bin;
+  int      dx;
+  int      y0;
+  int      z0;
+  int      dy;
+  int      dz;
+  /*SATD metrics for chroma planes vary much less than luma, so we scale them
+     by 4 to distribute them into the mode decision bins more evenly.*/
+  _satd<<=_pli+1&2;
+  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
+  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
+  dx=_satd-(bin<<shift);
+  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
+  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
+  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
+  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
+  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
+  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
+  return OC_MAXI(y0+(dy*dx>>shift),0);
+}
+
+/*activity_avg must be positive, or flat regions could get a zero weight, which
+   confounds analysis.
+  We set the minimum to this value so that it also avoids the need for divide
+   by zero checks in oc_mb_masking().*/
+# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
+
+static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4]){
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *sb_map;
+  unsigned             luma;
+  int                  ystride;
+  ptrdiff_t            frag_offs;
+  ptrdiff_t            fragi;
+  int                  bi;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  luma=0;
+  for(bi=0;bi<4;bi++){
+    const unsigned char *s;
+    unsigned             x;
+    unsigned             x2;
+    unsigned             act;
+    int                  i;
+    int                  j;
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    /*TODO: This could be replaced with SATD^2, since we already have to
+       compute SATD.*/
+    x=x2=0;
+    s=src+frag_offs;
+    for(i=0;i<8;i++){
+      for(j=0;j<8;j++){
+        unsigned c;
+        c=s[j];
+        x+=c;
+        x2+=c*c;
+      }
+      s+=ystride;
+    }
+    luma+=x;
+    act=(x2<<6)-x*x;
+    if(act<8<<12){
+      /*The region is flat.*/
+      act=OC_MINI(act,5<<12);
+    }
+    else{
+      unsigned e1;
+      unsigned e2;
+      unsigned e3;
+      unsigned e4;
+      /*Test for an edge.
+        TODO: There are probably much simpler ways to do this (e.g., it could
+         probably be combined with the SATD calculation).
+        Alternatively, we could split the block around the mean and compute the
+         reduction in variance in each half.
+        For a Gaussian source the reduction should be
+         (1-2/pi) ~= 0.36338022763241865692446494650994.
+        Significantly more reduction is a good indication of a bi-level image.
+        This has the advantage of identifying, in addition to straight edges,
+         small text regions, which would otherwise be classified as "texture".*/
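+      /*e1..e4 below accumulate [1,2,1]-weighted (Sobel-style) gradient
+         magnitudes over the block along the horizontal, vertical, and two
+         diagonal directions, respectively.*/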
+      e1=e2=e3=e4=0;
+      s=src+frag_offs-1;
+      for(i=0;i<8;i++){
+        for(j=0;j<8;j++){
+          e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
+           +(s+ystride)[j+2]-(s+ystride)[j]);
+          e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
+           +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
+          e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
+           +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
+          e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
+           +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
+        }
+        s+=ystride;
+      }
+      /*If the largest component of the edge energy is at least 40% of the
+         total, then classify the block as an edge block.*/
+      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
+         /*act=act_th*(act/act_th)**0.7
+              =exp(log(act_th)+0.7*(log(act)-log(act_th))).
+           Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
+         act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
+      }
+    }
+    _activity[bi]=act;
+  }
+  return luma;
+}
+
+static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4],const unsigned _intra_satd[12]){
+  int bi;
+  for(bi=0;bi<4;bi++){
+    unsigned act;
+    act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
+    if(act<8<<12){
+      /*The region is flat.*/
+      act=OC_MINI(act,5<<12);
+    }
+    _activity[bi]=act;
+  }
+}
+
+/*Compute the masking scales for the blocks in a macro block.
+  All masking is computed from the luma blocks.
+  We derive scaling factors for the chroma blocks from these, and use the same
+   ones for all chroma blocks, regardless of the subsampling.
+  It's possible for luma to be perfectly flat and yet have high chroma energy,
+   but this is unlikely in non-artificial images, and not a case that has been
+   addressed by any research to my knowledge.
+  The output of the masking process is two scale factors, which are fed into
+   the various R-D optimizations.
+  The first, rd_scale, is applied to D in the equation
+    D*rd_scale+lambda*R.
+  This is the form that must be used to properly combine scores from multiple
+   blocks, and can be interpreted as scaling distortions by their visibility.
+  The inverse, rd_iscale, is applied to lambda in the equation
+    D+rd_iscale*lambda*R.
+  This is equivalent to the first form within a single block, but much faster
+   to use when evaluating many possible distortions (e.g., during actual
+   quantization, where separate distortions are evaluated for every
+   coefficient).
+  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
+   used to perform the multiplications with the proper re-scaling for the range
+   of the scaling factors.
+  Many researchers apply masking values directly to the quantizers used, and
+   not to the R-D cost.
+  Since we generally use MSE for D, rd_scale must use the square of their
+   values to generate an equivalent effect.*/
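+/*As a rough sketch, ignoring the luminance term (disabled below) and the
+   OC_RD_SCALE_BITS/OC_RD_ISCALE_BITS fixed-point scaling, rd_scale[bi] tracks
+    (activity+4*activity_avg)/(4*activity+activity_avg)
+   and rd_iscale[bi] its reciprocal.
+  A block four times as active as the frame average thus gets its distortion
+   weighted by roughly 8/17~=0.47, while a block with a quarter of the average
+   activity gets roughly 17/8~=2.1.*/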
+static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
+ const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
+ unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
+  unsigned activity_sum;
+  unsigned la;
+  unsigned lb;
+  unsigned d;
+  int      bi;
+  int      bi_min;
+  int      bi_min2;
+  /*The ratio lb/la is meant to approximate
+     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
+     effective luminance masking from~\cite{LKW06} (including the self-masking
+     deflator).
+    The following actually turns out to be a pretty good approximation for
+     _luma>75 or so.
+    For smaller values luminance does not really follow Weber's Law anyway, and
+     this approximation gives a much less aggressive bitrate boost in this
+     region.
+    Though some researchers claim that contrast sensitivity actually decreases
+     for very low luminance values, in my experience excessive brightness on
+     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
+     of the CCIR 601 range) make artifacts in such regions extremely visible.
+    We substitute _luma_avg for 128 to allow the strength of the masking to
+     vary with the actual average image luminance, within certain limits (the
+     caller has clamped _luma_avg to the range [90,160], inclusive).
+    @ARTICLE{LKW06,
+      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
+      title="{JPEG2000} Encoding With Perceptual Distortion Control",
+      journal="{IEEE} Transactions on Image Processing",
+      volume=15,
+      number=7,
+      pages="1763--1778",
+      month=Jul,
+      year=2006
+    }*/
+#if 0
+  la=_luma+4*_luma_avg;
+  lb=4*_luma+_luma_avg;
+#else
+  /*Disable luminance masking.*/
+  la=lb=1;
+#endif
+  activity_sum=0;
+  for(bi=0;bi<4;bi++){
+    unsigned a;
+    unsigned b;
+    activity_sum+=_activity[bi];
+    /*Apply activity masking.*/
+    a=_activity[bi]+4*_activity_avg;
+    b=4*_activity[bi]+_activity_avg;
+    d=OC_RD_SCALE(b,1);
+    /*And luminance masking.*/
+    d=(a+(d>>1))/d;
+    _rd_scale[bi]=(d*la+(lb>>1))/lb;
+    /*And now the inverse.*/
+    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
+    d=(b+(d>>1))/d;
+    _rd_iscale[bi]=(d*lb+(la>>1))/la;
+  }
+  /*Now compute scaling factors for chroma blocks.
+    We start by finding the two smallest iscales from the luma blocks.*/
+  bi_min=_rd_iscale[1]<_rd_iscale[0];
+  bi_min2=1-bi_min;
+  for(bi=2;bi<4;bi++){
+    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
+      bi_min2=bi_min;
+      bi_min=bi;
+    }
+    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
+  }
+  /*If the minimum iscale is less than 1.0, use the second smallest instead,
+     and force the value to at least 1.0 (inflating chroma is a waste).*/
+  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
+  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
+  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
+  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
+  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
+  return activity_sum;
+}
+
+static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _frag_satd[12]){
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    ystride;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  unsigned               luma;
+  int                    dc;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  luma=0;
+  for(bi=0;bi<4;bi++){
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    luma+=dc;
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  return luma;
+}
+
+/*Select luma block-level quantizers for a MB in an INTRA frame.*/
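+/*This is a small Viterbi search: cost[bi][qii], ssd[bi][qii], rate[bi][qii],
+   and qs[bi][qii] track the best way to reach luma block bi with quantizer
+   list index qii, prev[][] stores the back-pointers, and the final loop walks
+   them backwards to assign each fragment's qii.*/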
+static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
+ const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const oc_sb_map     *sb_maps;
+  oc_fragment         *frags;
+  ptrdiff_t            frag_offs;
+  ptrdiff_t            fragi;
+  oc_qii_state         qs[4][3];
+  unsigned             cost[4][3];
+  unsigned             ssd[4][3];
+  unsigned             rate[4][3];
+  int                  prev[3][3];
+  unsigned             satd;
+  int                  dc;
+  unsigned             best_cost;
+  unsigned             best_ssd;
+  unsigned             best_rate;
+  int                  best_qii;
+  int                  qii;
+  int                  lambda;
+  int                  ystride;
+  int                  nqis;
+  int                  bi;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  fragi=sb_maps[_mbi>>2][_mbi&3][0];
+  frag_offs=frag_buf_offs[fragi];
+  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  else{
+    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+  }
+  nqis=_enc->state.nqis;
+  lambda=_enc->lambda;
+  for(qii=0;qii<nqis;qii++){
+    oc_qii_state_advance(qs[0]+qii,_qs,qii);
+    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
+     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
+    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
+    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
+  }
+  for(bi=1;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frag_offs=frag_buf_offs[fragi];
+    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    }
+    else{
+      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+    }
+    for(qii=0;qii<nqis;qii++){
+      oc_qii_state qt[3];
+      unsigned     cur_ssd;
+      unsigned     cur_rate;
+      int          best_qij;
+      int          qij;
+      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
+      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
+      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
+      best_ssd=ssd[bi-1][0]+cur_ssd;
+      best_rate=rate[bi-1][0]+cur_rate
+       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
+      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
+      best_qij=0;
+      for(qij=1;qij<nqis;qij++){
+        unsigned chain_ssd;
+        unsigned chain_rate;
+        unsigned chain_cost;
+        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
+        chain_ssd=ssd[bi-1][qij]+cur_ssd;
+        chain_rate=rate[bi-1][qij]+cur_rate
+         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
+        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
+        if(chain_cost<best_cost){
+          best_cost=chain_cost;
+          best_ssd=chain_ssd;
+          best_rate=chain_rate;
+          best_qij=qij;
+        }
+      }
+      *(qs[bi]+qii)=*(qt+best_qij);
+      cost[bi][qii]=best_cost;
+      ssd[bi][qii]=best_ssd;
+      rate[bi][qii]=best_rate;
+      prev[bi-1][qii]=best_qij;
+    }
+  }
+  best_qii=0;
+  best_cost=cost[3][0];
+  for(qii=1;qii<nqis;qii++){
+    if(cost[3][qii]<best_cost){
+      best_cost=cost[3][qii];
+      best_qii=qii;
+    }
+  }
+  frags=_enc->state.frags;
+  for(bi=3;;){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].qii=best_qii;
+    if(bi--<=0)break;
+    best_qii=prev[bi][best_qii];
+  }
+  return best_cost;
+}
+
+/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
+static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
+ const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
+  const unsigned char *src;
+  oc_fragment         *frags;
+  ptrdiff_t            frag_offs;
+  oc_qii_state         qt[3];
+  unsigned             cost[3];
+  unsigned             satd;
+  int                  dc;
+  unsigned             best_cost;
+  int                  best_qii;
+  int                  qii;
+  int                  lambda;
+  int                  ystride;
+  int                  nqis;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[_pli];
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  else{
+    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+  }
+  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
+     worth spending the bits to change the AC quantizer.
+    TODO: This may be worth revisiting when we separate out DC and AC
+     predictions from SATD.*/
+#if 0
+  nqis=_enc->state.nqis;
+#else
+  nqis=1;
+#endif
+  lambda=_enc->lambda;
+  best_qii=0;
+  for(qii=0;qii<nqis;qii++){
+    unsigned cur_rate;
+    unsigned cur_ssd;
+    oc_qii_state_advance(qt+qii,_qs,qii);
+    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
+     +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
+    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
+    cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
+  }
+  best_cost=cost[0];
+  for(qii=1;qii<nqis;qii++){
+    if(cost[qii]<best_cost){
+      best_cost=cost[qii];
+      best_qii=qii;
+    }
+  }
+  frags=_enc->state.frags;
+  frags[_fragi].qii=best_qii;
+  return best_cost;
+}
+
+static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,
+ const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
+  /*Worst case token stack usage for 4 fragments.*/
+  oc_token_checkpoint  stack[64*4];
+  oc_token_checkpoint *stackptr;
+  const oc_sb_map     *sb_maps;
+  oc_fragment         *frags;
+  ptrdiff_t           *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragi;
+  int                  bi;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  frags=_enc->state.frags;
+  coded_fragis=_pipe->coded_fragis[0];
+  ncoded_fragis=_pipe->ncoded_fragis[0];
+  stackptr=stack;
+  for(bi=0;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].refi=OC_FRAME_SELF;
+    frags[fragi].mb_mode=OC_MODE_INTRA;
+    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
+     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
+    coded_fragis[ncoded_fragis++]=fragi;
+  }
+  _pipe->ncoded_fragis[0]=ncoded_fragis;
+}
+
+static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
+  const ogg_uint16_t *mcu_rd_scale;
+  const ogg_uint16_t *mcu_rd_iscale;
+  const oc_sb_map    *sb_maps;
+  ptrdiff_t          *coded_fragis;
+  ptrdiff_t           ncoded_fragis;
+  ptrdiff_t           froffset;
+  int                 sbi;
+  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
+  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  froffset=_pipe->froffset[_pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    /*Worst case token stack usage for 1 fragment.*/
+    oc_token_checkpoint stack[64];
+    int                 quadi;
+    int                 bi;
+    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
+      ptrdiff_t fragi;
+      fragi=sb_maps[sbi][quadi][bi];
+      if(fragi>=0){
+        oc_token_checkpoint *stackptr;
+        unsigned             rd_scale;
+        unsigned             rd_iscale;
+        rd_scale=mcu_rd_scale[fragi-froffset];
+        rd_iscale=mcu_rd_iscale[fragi-froffset];
+        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
+        stackptr=stack;
+        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
+         rd_scale,rd_iscale,NULL,NULL,&stackptr);
+        coded_fragis[ncoded_fragis++]=fragi;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+}
+
+/*Analysis stage for an INTRA frame.*/
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
+  ogg_int64_t             activity_sum;
+  ogg_int64_t             luma_sum;
+  unsigned                activity_avg;
+  unsigned                luma_avg;
+  const ogg_uint16_t     *chroma_rd_scale;
+  ogg_uint16_t           *mcu_rd_scale;
+  ogg_uint16_t           *mcu_rd_iscale;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_mb_map        *mb_maps;
+  const oc_sb_map        *sb_maps;
+  oc_fragment            *frags;
+  unsigned                stripe_sby;
+  unsigned                mcu_nvsbs;
+  int                     notstart;
+  int                     notdone;
+  int                     refi;
+  int                     pli;
+  _enc->state.frame_type=OC_INTRA_FRAME;
+  oc_enc_tokenize_start(_enc);
+  oc_enc_pipeline_init(_enc,&_enc->pipe);
+  oc_enc_mode_rd_init(_enc);
+  activity_sum=luma_sum=0;
+  activity_avg=_enc->activity_avg;
+  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
+  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
+  mcu_rd_scale=_enc->mcu_rd_scale;
+  mcu_rd_iscale=_enc->mcu_rd_iscale;
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  frags=_enc->state.frags;
+  notstart=0;
+  notdone=1;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
+    ptrdiff_t cfroffset;
+    unsigned  sbi;
+    unsigned  sbi_end;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+    sbi_end=_enc->pipe.sbi_end[0];
+    cfroffset=_enc->pipe.froffset[1];
+    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        unsigned  activity[4];
+        unsigned  rd_scale[5];
+        unsigned  rd_iscale[5];
+        unsigned  luma;
+        unsigned  mbi;
+        int       mapii;
+        int       mapi;
+        int       bi;
+        ptrdiff_t fragi;
+        mbi=sbi<<2|quadi;
+        /*Activity masking.*/
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          luma=oc_mb_activity(_enc,mbi,activity);
+        }
+        else{
+          unsigned intra_satd[12];
+          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
+        }
+        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
+         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
+        luma_sum+=luma;
+        /*Motion estimation:
+          We do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not, unless we aren't using motion estimation at all.*/
+        if(!_recode&&_enc->state.curframe_num>0&&
+         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
+          oc_mcenc_search(_enc,mbi);
+        }
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
+        }
+        mb_modes[mbi]=OC_MODE_INTRA;
+        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
+         mbi,rd_scale,rd_iscale);
+        /*Propagate final MB mode and MVs to the chroma blocks.*/
+        for(mapii=4;mapii<nmap_idxs;mapii++){
+          mapi=map_idxs[mapii];
+          pli=mapi>>2;
+          bi=mapi&3;
+          fragi=mb_maps[mbi][pli][bi];
+          frags[fragi].refi=OC_FRAME_SELF;
+          frags[fragi].mb_mode=OC_MODE_INTRA;
+        }
+        /*Save masking scale factors for chroma blocks.*/
+        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][1][bi];
+          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
+          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
+        }
+      }
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
+       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
+    }
+    notstart=1;
+  }
+  /*Compute the average block activity and MB luma score for the frame.*/
+  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
+   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
+   _enc->state.fplanes[0].nfrags));
+  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
+}
+
+
+
+/*Cost information about a MB mode.*/
+struct oc_mode_choice{
+  unsigned      cost;
+  unsigned      ssd;
+  unsigned      rate;
+  unsigned      overhead;
+  unsigned char qii[12];
+};
+
+
+
+static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
+  _modec->cost=OC_MODE_RD_COST(_modec->ssd,
+   _modec->rate+_modec->overhead,_lambda);
+}
+
+/*A set of skip SSD's to use to disable early skipping.*/
+static const unsigned OC_NOSKIP[12]={
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
+};
+
+/*The estimated number of bits used by a coded chroma block to specify the AC
+   quantizer.
+  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
+   measurements suggest this is in the right ballpark, but it varies somewhat
+   with lambda.*/
+#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
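+/*0xCAE00D1D is log2(3) in Q31; shifting it down to Q(OC_BIT_SCALE) and
+   halving with rounding gives 0.5*log2(3)~=0.79 bits in the same fixed-point
+   format as the other rate estimates.*/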
+
+static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
+ oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[4],int _qti){
+  oc_fr_state  fr;
+  oc_qii_state qs;
+  unsigned     ssd;
+  unsigned     rate;
+  unsigned     satd;
+  unsigned     best_ssd;
+  unsigned     best_rate;
+  int          best_fri;
+  int          best_qii;
+  int          lambda;
+  int          nqis;
+  int          nskipped;
+  int          bi;
+  lambda=_enc->lambda;
+  nqis=_enc->state.nqis;
+  /*We could do a trellis optimization here, but we don't make final skip
+     decisions until after transform+quantization, so the result wouldn't be
+     optimal anyway.
+    Instead we just use a greedy approach; for most SATD values, the
+     differences between the qiis are large enough to drown out the cost to
+     code the flags, anyway.*/
+  *&fr=*_fr;
+  *&qs=*_qs;
+  ssd=rate=nskipped=0;
+  for(bi=0;bi<4;bi++){
+    oc_fr_state  ft[2];
+    oc_qii_state qt[3];
+    unsigned     best_cost;
+    unsigned     cur_cost;
+    unsigned     cur_ssd;
+    unsigned     cur_rate;
+    unsigned     cur_overhead;
+    int          qii;
+    satd=_frag_satd[bi];
+    *(ft+0)=*&fr;
+    oc_fr_code_block(ft+0);
+    cur_overhead=ft[0].bits-fr.bits;
+    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
+     +(cur_overhead<<OC_BIT_SCALE);
+    if(nqis>1){
+      oc_qii_state_advance(qt+0,&qs,0);
+      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
+    }
+    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
+    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
+    best_fri=0;
+    best_qii=0;
+    for(qii=1;qii<nqis;qii++){
+      oc_qii_state_advance(qt+qii,&qs,qii);
+      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
+       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
+      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
+      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
+      if(cur_cost<best_cost){
+        best_cost=cur_cost;
+        best_ssd=cur_ssd;
+        best_rate=cur_rate;
+        best_qii=qii;
+      }
+    }
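+    /*Consider skipping this block instead, provided its skip SSD is usable
+       and fewer than three blocks have already been skipped (so at least one
+       luma block stays coded); a skip is recorded by adding 4 to the block's
+       qii.*/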
+    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
+      *(ft+1)=*&fr;
+      oc_fr_skip_block(ft+1);
+      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
+      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
+      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
+      if(cur_cost<=best_cost){
+        best_ssd=cur_ssd;
+        best_rate=cur_overhead;
+        best_fri=1;
+        best_qii+=4;
+      }
+    }
+    rate+=best_rate;
+    ssd+=best_ssd;
+    *&fr=*(ft+best_fri);
+    if(best_fri==0)*&qs=*(qt+best_qii);
+    else nskipped++;
+    _modec->qii[bi]=best_qii;
+  }
+  _modec->ssd=ssd;
+  _modec->rate=rate;
+}
+
+static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
+ oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ unsigned _rd_scale,int _qti){
+  unsigned ssd;
+  unsigned rate;
+  unsigned satd;
+  unsigned best_ssd;
+  unsigned best_rate;
+  int      best_qii;
+  unsigned cur_cost;
+  unsigned cur_ssd;
+  unsigned cur_rate;
+  int      lambda;
+  int      nblocks;
+  int      nqis;
+  int      pli;
+  int      bi;
+  int      qii;
+  lambda=_enc->lambda;
+  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
+     worth spending the bits to change the AC quantizer.
+    TODO: This may be worth revisiting when we separate out DC and AC
+     predictions from SATD.*/
+#if 0
+  nqis=_enc->state.nqis;
+#else
+  nqis=1;
+#endif
+  ssd=_modec->ssd;
+  rate=_modec->rate;
+  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
+     order, we assume a constant overhead for coded block and qii flags.*/
+  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  nblocks=(nblocks-4>>1)+4;
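+  /*bi is not reset between planes: the first pass of the outer loop (pli==1)
+     covers the Cb blocks, and the nblocks update at the bottom extends the
+     range so the second pass (pli==2) covers the Cr blocks.*/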
+  bi=4;
+  for(pli=1;pli<3;pli++){
+    for(;bi<nblocks;bi++){
+      unsigned best_cost;
+      satd=_frag_satd[bi];
+      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
+       +OC_CHROMA_QII_RATE;
+      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
+      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
+      best_qii=0;
+      for(qii=1;qii<nqis;qii++){
+        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
+         +OC_CHROMA_QII_RATE;
+        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
+        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
+        if(cur_cost<best_cost){
+          best_cost=cur_cost;
+          best_ssd=cur_ssd;
+          best_rate=cur_rate;
+          best_qii=qii;
+        }
+      }
+      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
+        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
+        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
+        if(cur_cost<=best_cost){
+          best_ssd=cur_ssd;
+          best_rate=0;
+          best_qii+=4;
+        }
+      }
+      rate+=best_rate;
+      ssd+=best_ssd;
+      _modec->qii[bi]=best_qii;
+    }
+    nblocks=(nblocks-4<<1)+4;
+  }
+  _modec->ssd=ssd;
+  _modec->rate=rate;
+}
+
+static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
+ unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const oc_fragment     *frags;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  oc_mv                 *mvs;
+  int                    map_nidxs;
+  unsigned               uncoded_ssd;
+  int                    mapii;
+  int                    mapi;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    borderi;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  ystride=_enc->state.ref_ystride[0];
+  frags=_enc->state.frags;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  mvs=_enc->mb_info[_mbi].block_mv;
+  for(bi=0;bi<4;bi++){
+    fragi=sb_map[bi];
+    borderi=frags[fragi].borderi;
+    frag_offs=frag_buf_offs[fragi];
+    if(borderi<0){
+      uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
+    }
+    else{
+      uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+       src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
+    }
+    /*Scale to match DCT domain and RD.*/
+    uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
+    /*Motion is a special case; if there is more than a full-pixel motion
+       against the prior frame, penalize skipping.
+      TODO: The factor of two here is a kludge, but it tested out better than a
+       hard limit.*/
+    if(mvs[bi]!=0)uncoded_ssd*=2;
+    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=(map_nidxs-4>>1)+4;
+  mapii=4;
+  mvs=_enc->mb_info[_mbi].unref_mv;
+  for(pli=1;pli<3;pli++){
+    ystride=_enc->state.ref_ystride[pli];
+    for(;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      borderi=frags[fragi].borderi;
+      frag_offs=frag_buf_offs[fragi];
+      if(borderi<0){
+        uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
+      }
+      else{
+        uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+         src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
+      }
+      /*Scale to match DCT domain and RD.*/
+      uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
+      /*Motion is a special case; if there is more than a full-pixel motion
+         against the prior frame, penalize skipping.
+        TODO: The factor of two here is a kludge, but it tested out better than
+         a hard limit.*/
+      if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
+      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
+    }
+    map_nidxs=(map_nidxs-4<<1)+4;
+  }
+}
+
+
+static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[5]){
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   _frag_satd,_skip_ssd,_rd_scale[4],0);
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,oc_mv _mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
+  unsigned               frag_satd[12];
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    mv_offs[2];
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    dc;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
+  ystride=_enc->state.ref_ystride[0];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  _modec->rate=_modec->ssd=0;
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
+    for(bi=0;bi<4;bi++){
+      fragi=sb_map[bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+        frag_satd[bi]+=abs(dc);
+      }
+      else{
+        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      }
+    }
+  }
+  else{
+    for(bi=0;bi<4;bi++){
+      fragi=sb_map[bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+        frag_satd[bi]+=abs(dc);
+      }
+      else{
+        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+      }
+    }
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
+    for(mapii=4;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      pli=mapi>>2;
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+        frag_satd[mapii]+=abs(dc);
+      }
+      else{
+        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      }
+    }
+  }
+  else{
+    for(mapii=4;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      pli=mapi>>2;
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+        frag_satd[mapii]+=abs(dc);
+      }
+      else{
+        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+      }
+    }
+  }
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   frag_satd,_skip_ssd,_rd_scale[4],1);
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[4]){
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
+}
+
+static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,oc_mv _mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[4]){
+  int bits0;
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
+  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
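+  /*The extra overhead is the increase in the cheaper of the two MV coding
+     schemes' running totals when this MV is added: index 0 codes each
+     component with the OC_MV_BITS VLC, index 1 spends a flat 12 bits per MV.*/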
+  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+  return bits0;
+}
+
+/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/
+static const unsigned char OC_MB_PHASE[4][4]={
+  {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
+};
+
+static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
+  unsigned               frag_satd[12];
+  oc_mv                  lbmvs[4];
+  oc_mv                  cbmvs[4];
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const ptrdiff_t       *frag_buf_offs;
+  oc_mv                 *frag_mvs;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    nqis;
+  int                    mapii;
+  int                    mapi;
+  int                    mv_offs[2];
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    bits0;
+  int                    bits1;
+  unsigned               satd;
+  int                    dc;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  ystride=_enc->state.ref_ystride[0];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  frag_mvs=_enc->state.frag_mvs;
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  _modec->rate=_modec->ssd=0;
+  for(bi=0;bi<4;bi++){
+    fragi=mb_map[0][bi];
+    /*Save the block MVs as the current ones while we're here; we'll replace
+       them if we don't ultimately choose 4MV mode.*/
+    frag_mvs[fragi]=_mv[bi];
+    frag_offs=frag_buf_offs[fragi];
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
+      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+    }
+    else{
+      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
+    }
+    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
+  }
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
+   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
+  /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
+  bits0=0;
+  bits1=0;
+  nqis=_enc->state.nqis;
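+  /*oc_analyze_mb_mode_luma adds 4 to the qii of blocks it decided to skip, so
+     a qii of nqis or more marks a skipped block.*/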
+  for(bi=0;bi<4;bi++){
+    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
+    else{
+      lbmvs[bi]=_mv[bi];
+      bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
+       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
+      bits1+=12;
+    }
+  }
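+  /*Derive the chroma-plane MVs from the (possibly zeroed) luma block MVs
+     using the pixel-format specific helper (an average of the luma MVs for
+     the subsampled formats).*/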
+  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    /*TODO: We could save half these calls by re-using the results for the Cb
+       and Cr planes; is it worth it?*/
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
+      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+    }
+    else{
+      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
+    }
+    frag_satd[mapii]=satd+abs(dc);
+  }
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   frag_satd,_skip_ssd,_rd_scale[4],1);
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
+   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  oc_qii_state            intra_luma_qs;
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
+  ogg_int64_t             interbits;
+  ogg_int64_t             intrabits;
+  ogg_int64_t             activity_sum;
+  ogg_int64_t             luma_sum;
+  unsigned                activity_avg;
+  unsigned                luma_avg;
+  const ogg_uint16_t     *chroma_rd_scale;
+  ogg_uint16_t           *mcu_rd_scale;
+  ogg_uint16_t           *mcu_rd_iscale;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  unsigned               *coded_mbis;
+  unsigned               *uncoded_mbis;
+  size_t                  ncoded_mbis;
+  size_t                  nuncoded_mbis;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_sb_map        *sb_maps;
+  const oc_mb_map        *mb_maps;
+  oc_mb_enc_info         *embs;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  unsigned                stripe_sby;
+  unsigned                mcu_nvsbs;
+  int                     notstart;
+  int                     notdone;
+  unsigned                sbi;
+  unsigned                sbi_end;
+  int                     refi;
+  int                     pli;
+  int                     sp_level;
+  sp_level=_enc->sp_level;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
+  _enc->state.frame_type=OC_INTER_FRAME;
+  oc_mode_scheme_chooser_reset(&_enc->chooser);
+  oc_enc_tokenize_start(_enc);
+  oc_enc_pipeline_init(_enc,&_enc->pipe);
+  oc_enc_mode_rd_init(_enc);
+  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
+  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
+  interbits=intrabits=0;
+  activity_sum=luma_sum=0;
+  activity_avg=_enc->activity_avg;
+  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
+  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
+  mcu_rd_scale=_enc->mcu_rd_scale;
+  mcu_rd_iscale=_enc->mcu_rd_iscale;
+  last_mv=prior_mv=0;
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  coded_mbis=_enc->coded_mbis;
+  uncoded_mbis=coded_mbis+_enc->state.nmbs;
+  ncoded_mbis=0;
+  nuncoded_mbis=0;
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  embs=_enc->mb_info;
+  frags=_enc->state.frags;
+  frag_mvs=_enc->state.frag_mvs;
+  notstart=0;
+  notdone=1;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
+    ptrdiff_t cfroffset;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+    sbi_end=_enc->pipe.sbi_end[0];
+    cfroffset=_enc->pipe.froffset[1];
+    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        oc_mode_choice modes[8];
+        unsigned       activity[4];
+        unsigned       rd_scale[5];
+        unsigned       rd_iscale[5];
+        unsigned       skip_ssd[12];
+        unsigned       intra_satd[12];
+        unsigned       luma;
+        int            mb_mv_bits_0;
+        int            mb_gmv_bits_0;
+        int            inter_mv_pref;
+        int            mb_mode;
+        int            refi;
+        int            mv;
+        unsigned       mbi;
+        int            mapii;
+        int            mapi;
+        int            bi;
+        ptrdiff_t      fragi;
+        mbi=sbi<<2|quadi;
+        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+        /*Activity masking.*/
+        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          oc_mb_activity(_enc,mbi,activity);
+        }
+        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+        luma_sum+=luma;
+        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
+         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
+        /*Motion estimation:
+          We always do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not.*/
+        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
+        mv=0;
+        /*Find the block choice with the lowest estimated coding cost.
+          If a Cb or Cr block is coded but no Y' block from a macro block then
+           the mode MUST be OC_MODE_INTER_NOMV.
+          This is the default state to which the mode data structure is
+           initialised in encoder and decoder at the start of each frame.*/
+        /*Block coding cost is estimated from correlated SATD metrics.*/
+        /*At this point, all blocks that are in frame are still marked coded.*/
+        if(!_recode){
+          embs[mbi].unref_mv[OC_FRAME_GOLD]=
+           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
+          embs[mbi].unref_mv[OC_FRAME_PREV]=
+           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
+          embs[mbi].refined=0;
+        }
+        /*Estimate the cost of coding this MB in a keyframe.*/
+        if(_allow_keyframe){
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
+          intrabits+=modes[OC_MODE_INTRA].rate;
+          for(bi=0;bi<4;bi++){
+            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
+             modes[OC_MODE_INTRA].qii[bi]);
+          }
+        }
+        /*Estimate the cost in a delta frame for various modes.*/
+        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
+        if(sp_level<OC_SP_LEVEL_NOMC){
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
+           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
+           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+             refinement.
+            We choose the explicit MV mode that's already furthest ahead on
+             R-D cost and refine only that one.
+            We have to be careful to remember which ones we've refined so that
+             we don't refine it again if we re-encode this frame.*/
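+          /*Bias the final decision towards OC_MODE_INTER_MV: it wins unless
+             the best other mode beats it by this lambda-scaled margin (the
+             margin is dropped below when LAST or LAST2 is the best mode).*/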
+          inter_mv_pref=_enc->lambda*3;
+          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+             skip_ssd,rd_scale);
+          }
+          else{
+            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
+          }
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
+           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+            if(!(embs[mbi].refined&0x80)){
+              oc_mcenc_refine4mv(_enc,mbi);
+              embs[mbi].refined|=0x80;
+            }
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+             skip_ssd,rd_scale);
+          }
+          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
+           modes[OC_MODE_INTER_MV].cost){
+            if(!(embs[mbi].refined&0x40)){
+              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
+              embs[mbi].refined|=0x40;
+            }
+            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
+             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          }
+          if(!(embs[mbi].refined&0x04)){
+            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
+            embs[mbi].refined|=0x04;
+          }
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
+          mb_mode=OC_MODE_INTER_NOMV;
+          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
+            mb_mode=OC_MODE_INTRA;
+          }
+          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_LAST;
+          }
+          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_LAST2;
+          }
+          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_NOMV;
+          }
+          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_MV;
+          }
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_FOUR;
+          }
+          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
+          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
+            inter_mv_pref=0;
+          }
+          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
+            mb_mode=OC_MODE_INTER_MV;
+          }
+        }
+        else{
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          mb_mode=OC_MODE_INTER_NOMV;
+          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
+            mb_mode=OC_MODE_INTRA;
+          }
+          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_NOMV;
+          }
+          mb_mv_bits_0=mb_gmv_bits_0=0;
+        }
+        mb_modes[mbi]=mb_mode;
+        /*Propagate the MVs to the luma blocks.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
+            }break;
+            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
+            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
+            case OC_MODE_GOLDEN_MV:{
+              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
+            }break;
+          }
+          for(bi=0;bi<4;bi++){
+            fragi=mb_maps[mbi][0][bi];
+            frag_mvs[fragi]=mv;
+          }
+        }
+        for(bi=0;bi<4;bi++){
+          fragi=sb_maps[mbi>>2][mbi&3][bi];
+          frags[fragi].qii=modes[mb_mode].qii[bi];
+        }
+        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
+         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
+          int orig_mb_mode;
+          orig_mb_mode=mb_mode;
+          mb_mode=mb_modes[mbi];
+          refi=OC_FRAME_FOR_MODE(mb_mode);
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              prior_mv=last_mv;
+              /*If we're backing out from 4MV, find the MV we're actually
+                 using.*/
+              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
+                for(bi=0;;bi++){
+                  fragi=mb_maps[mbi][0][bi];
+                  if(frags[fragi].coded){
+                    mv=last_mv=frag_mvs[fragi];
+                    break;
+                  }
+                }
+                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
+                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
+              }
+              /*Otherwise we used the original analysis MV.*/
+              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
+              _enc->mv_bits[0]+=mb_mv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              oc_mv tmp_mv;
+              tmp_mv=prior_mv;
+              prior_mv=last_mv;
+              last_mv=tmp_mv;
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              _enc->mv_bits[0]+=mb_gmv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{
+              oc_mv lbmvs[4];
+              oc_mv cbmvs[4];
+              prior_mv=last_mv;
+              for(bi=0;bi<4;bi++){
+                fragi=mb_maps[mbi][0][bi];
+                if(frags[fragi].coded){
+                  lbmvs[bi]=last_mv=frag_mvs[fragi];
+                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
+                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
+                  _enc->mv_bits[1]+=12;
+                }
+                /*Replace the block MVs for not-coded blocks with (0,0).*/
+                else lbmvs[bi]=0;
+              }
+              (*set_chroma_mvs)(cbmvs,lbmvs);
+              for(mapii=4;mapii<nmap_idxs;mapii++){
+                mapi=map_idxs[mapii];
+                pli=mapi>>2;
+                bi=mapi&3;
+                fragi=mb_maps[mbi][pli][bi];
+                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
+                frags[fragi].refi=refi;
+                frags[fragi].mb_mode=mb_mode;
+                frag_mvs[fragi]=cbmvs[bi];
+              }
+            }break;
+          }
+          coded_mbis[ncoded_mbis++]=mbi;
+          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
+          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
+        }
+        else{
+          *(uncoded_mbis-++nuncoded_mbis)=mbi;
+          mb_mode=OC_MODE_INTER_NOMV;
+          refi=OC_FRAME_PREV;
+          mv=0;
+        }
+        /*Propagate final MB mode and MVs to the chroma blocks.
+          This has already been done for 4MV mode, since it requires individual
+           block motion vectors.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          for(mapii=4;mapii<nmap_idxs;mapii++){
+            mapi=map_idxs[mapii];
+            pli=mapi>>2;
+            bi=mapi&3;
+            fragi=mb_maps[mbi][pli][bi];
+            /*If we switched from 4MV mode to INTER_MV mode, then the qii
+               values won't have been chosen with the right MV, but it's
+               probably not worth re-estimating them.*/
+            frags[fragi].qii=modes[mb_mode].qii[mapii];
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mv;
+          }
+        }
+        /*Save masking scale factors for chroma blocks.*/
+        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][1][bi];
+          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
+          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
+        }
+      }
+      oc_fr_state_flush_sb(_enc->pipe.fr+0);
+      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
+      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
+       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
+    }
+    notstart=1;
+  }
+  /*Update the average block activity and MB luma score for the frame.
+    We could use a Bessel follower here, but fast reaction is probably almost
+     always best.*/
+  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
+   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
+   _enc->state.fplanes[0].nfrags));
+  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  /*Finish adding flagging overhead costs to inter bit counts to determine if
+     we should have coded a key frame instead.*/
+  if(_allow_keyframe){
+    /*Technically the chroma plane counts are over-estimations, because they
+       don't account for continuing runs from the luma planes, but the
+       inaccuracy is small.
+      We don't need to add the luma plane coding flag costs, because they are
+       already included in the MB rate estimates.*/
+    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
+    if(interbits>intrabits)return 1;
+  }
+  _enc->ncoded_mbis=ncoded_mbis;
+  /*Compact the coded fragment list.*/
+  {
+    ptrdiff_t ncoded_fragis;
+    ncoded_fragis=_enc->state.ncoded_fragis[0];
+    for(pli=1;pli<3;pli++){
+      memmove(_enc->state.coded_fragis+ncoded_fragis,
+       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
+       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
+      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    }
+    _enc->state.ntotal_coded_fragis=ncoded_fragis;
+  }
+  return 0;
+}

+ 166 - 0
modules/theoraplayer/native/theora/lib/apiwrapper.c

@@ -0,0 +1,166 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+
+
+
+const char *theora_version_string(void){
+  return th_version_string();
+}
+
+ogg_uint32_t theora_version_number(void){
+  return th_version_number();
+}
+
+void theora_info_init(theora_info *_ci){
+  memset(_ci,0,sizeof(*_ci));
+}
+
+void theora_info_clear(theora_info *_ci){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  memset(_ci,0,sizeof(*_ci));
+  if(api!=NULL){
+    if(api->clear!=NULL)(*api->clear)(api);
+    _ogg_free(api);
+  }
+}
+
+void theora_clear(theora_state *_th){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    (*((oc_state_dispatch_vtable *)_th->internal_decode)->clear)(_th);
+  }
+  if(_th->internal_encode!=NULL){
+    (*((oc_state_dispatch_vtable *)_th->internal_encode)->clear)(_th);
+  }
+  if(_th->i!=NULL)theora_info_clear(_th->i);
+  memset(_th,0,sizeof(*_th));
+}
+
+int theora_control(theora_state *_th,int _req,void *_buf,size_t _buf_sz){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->control)(_th,
+     _req,_buf,_buf_sz);
+  }
+  else if(_th->internal_encode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->control)(_th,
+     _req,_buf,_buf_sz);
+  }
+  else return TH_EINVAL;
+}
+
+ogg_int64_t theora_granule_frame(theora_state *_th,ogg_int64_t _gp){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_frame)(
+     _th,_gp);
+  }
+  else if(_th->internal_encode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_frame)(
+     _th,_gp);
+  }
+  else return -1;
+}
+
+double theora_granule_time(theora_state *_th, ogg_int64_t _gp){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_time)(
+     _th,_gp);
+  }
+  else if(_th->internal_encode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_time)(
+     _th,_gp);
+  }
+  else return -1;
+}
+
+void oc_theora_info2th_info(th_info *_info,const theora_info *_ci){
+  _info->version_major=_ci->version_major;
+  _info->version_minor=_ci->version_minor;
+  _info->version_subminor=_ci->version_subminor;
+  _info->frame_width=_ci->width;
+  _info->frame_height=_ci->height;
+  _info->pic_width=_ci->frame_width;
+  _info->pic_height=_ci->frame_height;
+  _info->pic_x=_ci->offset_x;
+  _info->pic_y=_ci->offset_y;
+  _info->fps_numerator=_ci->fps_numerator;
+  _info->fps_denominator=_ci->fps_denominator;
+  _info->aspect_numerator=_ci->aspect_numerator;
+  _info->aspect_denominator=_ci->aspect_denominator;
+  switch(_ci->colorspace){
+    case OC_CS_ITU_REC_470M:_info->colorspace=TH_CS_ITU_REC_470M;break;
+    case OC_CS_ITU_REC_470BG:_info->colorspace=TH_CS_ITU_REC_470BG;break;
+    default:_info->colorspace=TH_CS_UNSPECIFIED;break;
+  }
+  switch(_ci->pixelformat){
+    case OC_PF_420:_info->pixel_fmt=TH_PF_420;break;
+    case OC_PF_422:_info->pixel_fmt=TH_PF_422;break;
+    case OC_PF_444:_info->pixel_fmt=TH_PF_444;break;
+    default:_info->pixel_fmt=TH_PF_RSVD;
+  }
+  _info->target_bitrate=_ci->target_bitrate;
+  _info->quality=_ci->quality;
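+  /*The granule shift is the number of low granule position bits used to count
+     frames since the last keyframe; e.g., a forced keyframe frequency of 64
+     should yield a shift of 6 (oc_ilog(63)==6), i.e. up to 64 frames per
+     keyframe interval.*/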
+  _info->keyframe_granule_shift=_ci->keyframe_frequency_force>0?
+   OC_MINI(31,oc_ilog(_ci->keyframe_frequency_force-1)):0;
+}
+
+int theora_packet_isheader(ogg_packet *_op){
+  return th_packet_isheader(_op);
+}
+
+int theora_packet_iskeyframe(ogg_packet *_op){
+  return th_packet_iskeyframe(_op);
+}
+
+int theora_granule_shift(theora_info *_ci){
+  /*This breaks when keyframe_frequency_force is not positive or is larger than
+     2**31 (if your int is more than 32 bits), but that's what the original
+     function does.*/
+  return oc_ilog(_ci->keyframe_frequency_force-1);
+}
+
+void theora_comment_init(theora_comment *_tc){
+  th_comment_init((th_comment *)_tc);
+}
+
+char *theora_comment_query(theora_comment *_tc,char *_tag,int _count){
+  return th_comment_query((th_comment *)_tc,_tag,_count);
+}
+
+int theora_comment_query_count(theora_comment *_tc,char *_tag){
+  return th_comment_query_count((th_comment *)_tc,_tag);
+}
+
+void theora_comment_clear(theora_comment *_tc){
+  th_comment_clear((th_comment *)_tc);
+}
+
+void theora_comment_add(theora_comment *_tc,char *_comment){
+  th_comment_add((th_comment *)_tc,_comment);
+}
+
+void theora_comment_add_tag(theora_comment *_tc, char *_tag, char *_value){
+  th_comment_add_tag((th_comment *)_tc,_tag,_value);
+}

+ 54 - 0
modules/theoraplayer/native/theora/lib/apiwrapper.h

@@ -0,0 +1,54 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: apiwrapper.h 13596 2007-08-23 20:05:38Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_apiwrapper_H)
+# define _apiwrapper_H (1)
+# include <ogg/ogg.h>
+# include <theora/theora.h>
+# include "theora/theoradec.h"
+# include "theora/theoraenc.h"
+# include "state.h"
+
+typedef struct th_api_wrapper th_api_wrapper;
+typedef struct th_api_info    th_api_info;
+
+/*Provide an entry point for the codec setup to clear itself in case we ever
+   want to break pieces off into a common base library shared by encoder and
+   decoder.
+  In addition, this makes several other pieces of the API wrapper cleaner.*/
+typedef void (*oc_setup_clear_func)(void *_ts);
+
+/*Generally only one of these pointers will be non-NULL in any given instance.
+  Technically we do not even really need this struct, since we should be able
+   to figure out which one from "context", but doing it this way makes sure we
+   don't flub it up.*/
+struct th_api_wrapper{
+  oc_setup_clear_func  clear;
+  th_setup_info       *setup;
+  th_dec_ctx          *decode;
+  th_enc_ctx          *encode;
+};
+
+struct th_api_info{
+  th_api_wrapper api;
+  theora_info    info;
+};
+
+
+void oc_theora_info2th_info(th_info *_info,const theora_info *_ci);
+
+#endif

+ 304 - 0
modules/theoraplayer/native/theora/lib/arm/arm2gnu.pl

@@ -0,0 +1,304 @@
+#!/usr/bin/perl
+
+my $bigend;  # little/big endian
+my $nxstack;
+
+$nxstack = 0;
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+    if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+    $_ = shift;
+    last if /^--/;
+    if (/^-n/) {
+        $nflag++;
+        next;
+    }
+    die "I don't recognize this switch: $_\\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n";      # automatically add newline on print
+$n=0;
+
+$thumb = 0;     # ARM mode by default, not Thumb.
+@proc_stack = ();
+
+LINE:
+while (<>) {
+
+    # For ADRLs we need to add a new line after the substituted one.
+    $addPadding = 0;
+
+    # First, we do not dare to touch *anything* inside double quotes, do we?
+    # Second, if you want a dollar character in the string,
+    # insert two of them -- that's how ARM C and assembler treat strings.
+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };
+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };
+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };
+    # If there's nothing on a line but a comment, don't try to apply any further
+    #  substitutions (this is a cheap hack to avoid mucking up the license header)
+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };
+    # If substituted -- leave immediately !
+
+    s/@/,:/;
+    s/;/@/;
+    while ( /@.*'/ ) {
+      s/(@.*)'/$1/g;
+    }
+    s/\{FALSE\}/0/g;
+    s/\{TRUE\}/1/g;
+    s/\{(\w\w\w\w+)\}/$1/g;
+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+    s/\bIMPORT\b/.extern/;
+    s/\bEXPORT\b/.global/;
+    s/^(\s+)\[/$1IF/;
+    s/^(\s+)\|/$1ELSE/;
+    s/^(\s+)\]/$1ENDIF/;
+    s/IF *:DEF:/ .ifdef/;
+    s/IF *:LNOT: *:DEF:/ .ifndef/;
+    s/ELSE/ .else/;
+    s/ENDIF/ .endif/;
+
+    if( /\bIF\b/ ) {
+      s/\bIF\b/ .if/;
+      s/=/==/;
+    }
+    if ( $n == 2) {
+        s/\$/\\/g;
+    }
+    if ($n == 1) {
+        s/\$//g;
+        s/label//g;
+        $n = 2;
+    }
+    if ( /MACRO/ ) {
+      s/MACRO *\n/.macro/;
+      $n=1;
+    }
+    if ( /\bMEND\b/ ) {
+      s/\bMEND\b/.endm/;
+      $n=0;
+    }
+
+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+    #
+    if ( /\bAREA\b/ ) {
+        my $align;
+        $align = "2";
+        if ( /ALIGN=(\d+)/ ) {
+            $align = $1;
+        }
+        if ( /CODE/ ) {
+            $nxstack = 1;
+        }
+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;
+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata/;
+        s/^(.+)\|\|\.data\|\|(.+)/    .data/;
+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;
+        s/$/;   .p2align $align/;
+    }
+
+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+    s/^(\s+)\%(\s)/    .space $1/;
+
+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+    if (/\bPROC\b/)
+    {
+        my $prefix;
+        my $proc;
+        /^([A-Za-z_\.]\w+)\b/;
+        $proc = $1;
+        $prefix = "";
+        if ($proc)
+        {
+            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);
+            push(@proc_stack, $proc);
+            s/^[A-Za-z_\.]\w+/$&:/;
+        }
+        $prefix = $prefix."\t.thumb_func; " if ($thumb);
+        s/\bPROC\b/@ $&/;
+        $_ = $prefix.$_;
+    }
+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }
+    s/\bSUBT\b/@ $&/;
+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
+    s/\bKEEP\b/@ $&/;
+    s/\bEXPORTAS\b/@ $&/;
+    s/\|\|(.)+\bEQU\b/@ $&/;
+    s/\|\|([\w\$]+)\|\|/$1/;
+    s/\bENTRY\b/@ $&/;
+    s/\bASSERT\b/@ $&/;
+    s/\bGBLL\b/@ $&/;
+    s/\bGBLA\b/@ $&/;
+    s/^\W+OPT\b/@ $&/;
+    s/:OR:/|/g;
+    s/:SHL:/<</g;
+    s/:SHR:/>>/g;
+    s/:AND:/&/g;
+    s/:LAND:/&&/g;
+    s/CPSR/cpsr/;
+    s/SPSR/spsr/;
+    s/ALIGN$/.balign 4/;
+    s/ALIGN\s+([0-9x]+)$/.balign $1/;
+    s/psr_cxsf/psr_all/;
+    s/LTORG/.ltorg/;
+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed
+    s/\{PC\} \+/ \. +/;
+
+    # Single hex constant on the line !
+    #
+    # >>> NOTE <<<
+    #   Double-precision floats in gcc are always mixed-endian, which means
+    #   bytes in two words are little-endian, but words are big-endian.
+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+    #   and 0xfeed0000 at high address.
+    #
+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+    # Single hex constant on the line !
+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+    s/\bDCFS[ \t]+0x/.word 0x/;
+    s/\bDCFS\b/.float/;
+
+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+    s/\bDCD\b/.word/;
+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+    s/\bDCW\b/.short/;
+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+    s/\bDCB\b/.byte/;
+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+    s/^[A-Za-z_\.]\w+/$&:/;
+    s/^(\d+)/$1:/;
+    s/\%(\d+)/$1b_or_f/;
+    s/\%[Bb](\d+)/$1b/;
+    s/\%[Ff](\d+)/$1f/;
+    s/\%[Ff][Tt](\d+)/$1f/;
+    s/&([\dA-Fa-f]+)/0x$1/;
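+    # Convert ARM-style binary literals (2_0101...) to hex: the bits are
+    #  consumed four at a time from the least-significant end and re-emitted
+    #  as 0x... digits.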
+    if ( /\b2_[01]+\b/ ) {
+      s/\b2_([01]+)\b/conv$1&&&&/g;
+      while ( /[01][01][01][01]&&&&/ ) {
+        s/0000&&&&/&&&&0/g;
+        s/0001&&&&/&&&&1/g;
+        s/0010&&&&/&&&&2/g;
+        s/0011&&&&/&&&&3/g;
+        s/0100&&&&/&&&&4/g;
+        s/0101&&&&/&&&&5/g;
+        s/0110&&&&/&&&&6/g;
+        s/0111&&&&/&&&&7/g;
+        s/1000&&&&/&&&&8/g;
+        s/1001&&&&/&&&&9/g;
+        s/1010&&&&/&&&&A/g;
+        s/1011&&&&/&&&&B/g;
+        s/1100&&&&/&&&&C/g;
+        s/1101&&&&/&&&&D/g;
+        s/1110&&&&/&&&&E/g;
+        s/1111&&&&/&&&&F/g;
+      }
+      s/000&&&&/&&&&0/g;
+      s/001&&&&/&&&&1/g;
+      s/010&&&&/&&&&2/g;
+      s/011&&&&/&&&&3/g;
+      s/100&&&&/&&&&4/g;
+      s/101&&&&/&&&&5/g;
+      s/110&&&&/&&&&6/g;
+      s/111&&&&/&&&&7/g;
+      s/00&&&&/&&&&0/g;
+      s/01&&&&/&&&&1/g;
+      s/10&&&&/&&&&2/g;
+      s/11&&&&/&&&&3/g;
+      s/0&&&&/&&&&0/g;
+      s/1&&&&/&&&&1/g;
+      s/conv&&&&/0x/g;
+    }
+
+    if ( /commandline/)
+    {
+        if( /-bigend/)
+        {
+            $bigend=1;
+        }
+    }
+
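+    # DCDU (a word constant with no alignment requirement): re-emit the 32-bit
+    #  value as four .byte directives in the endianness implied by the -bigend
+    #  flag detected above.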
+    if ( /\bDCDU\b/ )
+    {
+        my $cmd=$_;
+        my $value;
+        my $prefix;
+        my $w1;
+        my $w2;
+        my $w3;
+        my $w4;
+
+        s/\s+DCDU\b/@ $&/;
+
+        $cmd =~ /\bDCDU\b\s+0x(\d+)/;
+        $value = $1;
+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+        $w1 = $1;
+        $w2 = $2;
+        $w3 = $3;
+        $w4 = $4;
+
+        if( $bigend ne "")
+        {
+            # big endian
+            $prefix = "\t.byte\t0x".$w1.";".
+                      "\t.byte\t0x".$w2.";".
+                      "\t.byte\t0x".$w3.";".
+                      "\t.byte\t0x".$w4."; ";
+        }
+        else
+        {
+            # little endian
+            $prefix = "\t.byte\t0x".$w4.";".
+                      "\t.byte\t0x".$w3.";".
+                      "\t.byte\t0x".$w2.";".
+                      "\t.byte\t0x".$w1."; ";
+        }
+        $_=$prefix.$_;
+    }
+
+    if ( /\badrl\b/i )
+    {
+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+        $addPadding = 1;
+    }
+    s/\bEND\b/@ END/;
+} continue {
+    printf ("%s", $_) if $printit;
+    if ($addPadding != 0)
+    {
+        printf ("   mov r0,r0\n");
+        $addPadding = 0;
+    }
+}
+#If we had a code section, mark that this object doesn't need an executable
+# stack.
+if ($nxstack) {
+    printf ("    .section\t.note.GNU-stack,\"\",\%\%progbits\n");
+}

+ 231 - 0
modules/theoraplayer/native/theora/lib/arm/armbits.asm

@@ -0,0 +1,231 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@
+@ function:
+@   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+@
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global oc_pack_read_arm
+	.global oc_pack_read1_arm
+	.global oc_huff_token_decode_arm
+
+	.type	oc_pack_read1_arm, %function; oc_pack_read1_arm: @ PROC
+	@ r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,#1          @ r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      @ r0 = window>>31
+	MOV r2,r2,LSL #1       @ r2 = window<<=1
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+	.size oc_pack_read1_arm, .-oc_pack_read1_arm	@ ENDP
+
+	.type	oc_pack_read_arm, %function; oc_pack_read_arm: @ PROC
+	@ r0 = oc_pack_buf *_b
+	@ r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,r1          @ r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+
+@ We need to refill window.
+oc_pack_read1_refill:
+	MOV r1,#1
+oc_pack_read_refill:
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     @ r10 = stop
+	                       @ r11 = ptr
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	RSB r3,r3,r0           @ r3 = 32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
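+@ The refill below is unrolled four times; each step pulls in one more byte as
+@  long as ptr<stop and there is still room for it (available<=24).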
+	CMP r10,r11            @ ptr<stop => HI
+	CMPHI r3,#7            @   available<=24 => HI
+	LDRHIB r14,[r11],#1    @     r14 = *ptr++
+	SUBHI r3,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @     r2 = window|=r14<<32-available
+	CMPHI r10,r11          @     ptr<stop => HI
+	CMPHI r3,#7            @       available<=24 => HI
+	LDRHIB r14,[r11],#1    @         r14 = *ptr++
+	SUBHI r3,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @         r2 = window|=r14<<32-available
+	CMPHI r10,r11          @         ptr<stop => HI
+	CMPHI r3,#7            @           available<=24 => HI
+	LDRHIB r14,[r11],#1    @             r14 = *ptr++
+	SUBHI r3,#8            @             available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @             r2 = window|=r14<<32-available
+	CMPHI r10,r11          @             ptr<stop => HI
+	CMPHI r3,#7            @               available<=24 => HI
+	LDRHIB r14,[r11],#1    @                 r14 = *ptr++
+	SUBHI r3,#8            @                 available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          @ r3 = available-=_bits, available<bits => GT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+@ Either we wanted to read more than 24 bits and didn't have enough room to
+@  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last:
+	CMP r11,r10            @ ptr<stop => LO
+@ If we didn't hit the end of the packet, then pull enough of the next byte
+@  to fill up the window.
+	LDRLOB r14,[r11]       @ (LO) r14 = *ptr
+@ Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           @ (HS) r14 = 1
+	ADDLO r10,r3,r1        @ (LO) r10 = available
+	STRHS r14,[r12,#8]     @ (HS) eof = 1
+	ANDLO r10,r10,#7       @ (LO) r10 = available&0x7
+	MOVHS r3,#1<<30        @ (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   @ (LO) r2 = window|=*ptr>>(available&0x7)
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+	.size oc_pack_read_arm, .-oc_pack_read_arm	@ ENDP
+
+
+
+	.type	oc_huff_token_decode_arm, %function; oc_huff_token_decode_arm: @ PROC
+	@ r0 = oc_pack_buf       *_b
+	@ r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         @ r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       @ r2 = stop
+	@ Stall...             ; r3 = ptr
+	@ Stall...             ; r4 = window
+	                       @ r5 = available
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  @ r14 = _tree+bits
+	LDRSH r12,[r14,#2]     @ r12 = node=_tree[1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+@ The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue:
+	ADD r12,r1,r12,LSL #1  @ r12 = _tree+node
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r5,r10          @ r5 = available-=n
+	LDRSH r10,[r12],#2     @ r10 = n=_tree[node]
+	@ Stall...             ; r12 = _tree+node+1
+	@ Stall...
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0:
+	ADD r12,r1,#2          @ r12 = _tree+1
+oc_huff_token_decode_refill:
+@ We can't possibly need more than 15 bits, so available must be <= 15.
+@ Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              @ ptr<stop => HI
+	LDRHIB r14,[r3],#1     @   r14 = *ptr++
+	RSBHI r5,r5,#24        @ (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        @ (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    @   r4 = window|=r14<<32-available
+	CMPHI r2,r3            @   ptr<stop => HI
+	LDRHIB r14,[r3],#1     @     r14 = *ptr++
+	SUBHI r5,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @     r4 = window|=r14<<32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
+	CMPHI r2,r3            @     ptr<stop => HI
+	CMPHI r5,#7            @       available<=24 => HI
+	LDRHIB r14,[r3],#1     @         r14 = *ptr++
+	SUBHI r5,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @         r4 = window|=r14<<32-available
+	CMP r2,r3              @ ptr<stop => HI
+	MOVLS r5,#-1<<30       @ (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            @ (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     @ (HI)   r14 = *ptr++
+	SUBHI r5,#8            @ (HI)   available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @ (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          @ r5 = available
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+	.size oc_huff_token_decode_arm, .-oc_huff_token_decode_arm	@ ENDP
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits
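oc_pack_read_arm and oc_huff_token_decode_arm above both work from a 32-bit window of buffered bits kept MSB-aligned, with available counting how many of them are valid: reads take bits off the top, refills append whole bytes underneath until the packet's stop pointer is reached, and at the end of the packet the eof flag is set and available is faked to OC_LOTS_OF_BITS so further reads just return zero bits. Below is a minimal C sketch of that scheme, assuming bit counts of at most 24; the struct and function names are made up for illustration, and the real oc_pack_buf lives in libtheora's bitpack.h.

#include <stdio.h>

typedef struct {
  const unsigned char *stop;       /* one past the last byte of the packet  */
  const unsigned char *ptr;        /* next byte to pull into the window     */
  unsigned long        window;     /* up to 32 buffered bits, MSB-aligned   */
  int                  available;  /* number of valid bits in window        */
  int                  eof;        /* set once we run off the packet's end  */
} pack_buf_sketch;

/* Read 1..24 bits MSB-first, refilling the window one byte at a time. */
static long pack_read_sketch(pack_buf_sketch *b, int bits) {
  while (b->available < bits) {
    if (b->ptr >= b->stop) {
      b->eof = 1;
      b->available = 1 << 30;   /* "lots of bits": later reads yield zeros */
      break;
    }
    /* The new byte slots in just below the bits already in the window. */
    b->window |= (unsigned long)*b->ptr++ << (32 - b->available - 8);
    b->available += 8;
  }
  long ret = (long)((b->window >> (32 - bits)) & ((1UL << bits) - 1));
  b->window = (b->window << bits) & 0xFFFFFFFFUL;
  b->available -= bits;
  return ret;
}

int main(void) {
  static const unsigned char pkt[2] = { 0xA5, 0x3C };  /* bits: 10100101 00111100 */
  pack_buf_sketch b = { pkt + 2, pkt, 0, 0, 0 };
  long a = pack_read_sketch(&b, 4);   /* 1010      -> 10 */
  long c = pack_read_sketch(&b, 8);   /* 01010011  -> 83 */
  long d = pack_read_sketch(&b, 4);   /* 1100      -> 12 */
  printf("%ld %ld %ld eof=%d\n", a, c, d, b.eof);
  return 0;
}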

+ 32 - 0
modules/theoraplayer/native/theora/lib/arm/armbits.h

@@ -0,0 +1,32 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_pack_read oc_pack_read_arm
+#  define oc_pack_read1 oc_pack_read1_arm
+#  define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
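armbits.h wires the assembly routines in through simple macro aliases: when OC_ARM_ASM is defined, code written against the generic bit-packer entry points compiles directly against the _arm versions declared here. A hypothetical call site (the helper name is invented for illustration):

/* With OC_ARM_ASM defined, the generic names below expand, via the
 * #define aliases in armbits.h, to the _arm routines. */
#include "armbits.h"

static int read_flag_and_3_bits(oc_pack_buf *b) {
  int  flag = oc_pack_read1(b);    /* -> oc_pack_read1_arm(b)   */
  long val  = oc_pack_read(b, 3);  /* -> oc_pack_read_arm(b, 3) */
  return flag ? (int)val : -1;
}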

+ 230 - 0
modules/theoraplayer/native/theora/lib/arm/armbits.s

@@ -0,0 +1,230 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	EXPORT oc_pack_read_arm
+	EXPORT oc_pack_read1_arm
+	EXPORT oc_huff_token_decode_arm
+
+oc_pack_read1_arm PROC
+	; r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      ; r0 = window>>31
+	MOV r2,r2,LSL #1       ; r2 = window<<=1
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+	ENDP
+
+oc_pack_read_arm PROC
+	; r0 = oc_pack_buf *_b
+	; r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+
+; We need to refill window.
+oc_pack_read1_refill
+	MOV r1,#1
+oc_pack_read_refill
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     ; r10 = stop
+	                       ; r11 = ptr
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	RSB r3,r3,r0           ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMP r10,r11            ; ptr<stop => HI
+	CMPHI r3,#7            ;   available<=24 => HI
+	LDRHIB r14,[r11],#1    ;     r14 = *ptr++
+	SUBHI r3,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;     r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;     ptr<stop => HI
+	CMPHI r3,#7            ;       available<=24 => HI
+	LDRHIB r14,[r11],#1    ;         r14 = *ptr++
+	SUBHI r3,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;         r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;         ptr<stop => HI
+	CMPHI r3,#7            ;           available<=24 => HI
+	LDRHIB r14,[r11],#1    ;             r14 = *ptr++
+	SUBHI r3,#8            ;             available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;             r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;             ptr<stop => HI
+	CMPHI r3,#7            ;               available<=24 => HI
+	LDRHIB r14,[r11],#1    ;                 r14 = *ptr++
+	SUBHI r3,#8            ;                 available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          ; r3 = available-=_bits, available<_bits => LT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+;  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+	CMP r11,r10            ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte
+;  to fill up the window.
+	LDRLOB r14,[r11]       ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           ; (HS) r14 = 1
+	ADDLO r10,r3,r1        ; (LO) r10 = available
+	STRHS r14,[r12,#8]     ; (HS) eof = 1
+	ANDLO r10,r10,#7       ; (LO) r10 = available&7
+	MOVHS r3,#1<<30        ; (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   ; (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+	ENDP
+
+
+
+oc_huff_token_decode_arm PROC
+	; r0 = oc_pack_buf       *_b
+	; r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         ; r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       ; r2 = stop
+	; Stall...             ; r3 = ptr
+	; Stall...             ; r4 = window
+	                       ; r5 = available
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  ; r14 = _tree+bits
+	LDRSH r12,[r14,#2]     ; r12 = node=_tree[1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+	ADD r12,r1,r12,LSL #1  ; r12 = _tree+node
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r5,r10          ; r5 = available-=n
+	LDRSH r10,[r12],#2     ; r10 = n=_tree[node]
+	; Stall...             ; r12 = _tree+node+1
+	; Stall...
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0
+	ADD r12,r1,#2          ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              ; ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;   r14 = *ptr++
+	RSBHI r5,r5,#24        ; (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        ; (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    ;   r4 = window|=r14<<32-available
+	CMPHI r2,r3            ;   ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;     r14 = *ptr++
+	SUBHI r5,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;     r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMPHI r2,r3            ;     ptr<stop => HI
+	CMPHI r5,#7            ;       available<=24 => HI
+	LDRHIB r14,[r3],#1     ;         r14 = *ptr++
+	SUBHI r5,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;         r4 = window|=r14<<32-available
+	CMP r2,r3              ; ptr<stop => HI
+	MOVLS r5,#-1<<30       ; (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            ; (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     ; (HI)   r14 = *ptr++
+	SUBHI r5,#8            ; (HI)   available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ; (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          ; r5 = available
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+	ENDP
+
+	END

+ 116 - 0
modules/theoraplayer/native/theora/lib/arm/armcpu.c

@@ -0,0 +1,116 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+  last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_MEDIA)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  flags=0;
+  /*MSVC has no inline __asm support for ARM, but it does let you __emit
+     instructions via their assembled hex code.
+    All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OC_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OC_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OC_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t  flags;
+  FILE         *fin;
+  flags=0;
+  /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+     Android.
+    This also means that detection will fail in Scratchbox.*/
+  fin=fopen("/proc/cpuinfo","r");
+  if(fin!=NULL){
+    /*512 should be enough for anybody (it's even enough for all the flags that
+       x86 has accumulated... so far).*/
+    char buf[512];
+    while(fgets(buf,511,fin)!=NULL){
+      if(memcmp(buf,"Features",8)==0){
+        char *p;
+        p=strstr(buf," edsp");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+        p=strstr(buf," neon");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+      }
+      if(memcmp(buf,"CPU architecture:",17)==0){
+        int version;
+        version=atoi(buf+17);
+        if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+      }
+    }
+    fclose(fin);
+  }
+  return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+   accessible in privileged modes only, so we can't have a general user-space
+   detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform.  Reconfigure with --disable-asm (or send patches)."
+#endif

+ 29 - 0
modules/theoraplayer/native/theora/lib/arm/armcpu.h

@@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA    (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP     (1<<7)
+#define OC_CPU_ARM_NEON     (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
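armcpu.c probes the processor at run time: on MSVC it __emits candidate instructions and traps EXCEPTION_ILLEGAL_INSTRUCTION, on Linux it parses /proc/cpuinfo, and elsewhere it fails the build. The result is a bitmask of the OC_CPU_ARM_* flags defined above. A hedged usage sketch, assuming the libtheora headers are on the include path; the reporting function and strings are invented for illustration, only oc_cpu_flags_get() and the flag names come from the code.

#include <stdio.h>
#include "armcpu.h"

static void report_arm_features(void) {
  ogg_uint32_t flags = oc_cpu_flags_get();
  if (flags & OC_CPU_ARM_EDSP)  printf("EDSP instructions available\n");
  if (flags & OC_CPU_ARM_MEDIA) printf("ARMv6 media instructions available\n");
  if (flags & OC_CPU_ARM_NEON)  printf("NEON available\n");
}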

+ 57 - 0
modules/theoraplayer/native/theora/lib/arm/armenc.c

@@ -0,0 +1,57 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add EDSP functions here.*/
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add Media functions here.*/
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_neon;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_neon;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_neon;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+    _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+#    endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+  }
+#   endif
+#  endif
+# endif
+}
+#endif

+ 51 - 0
modules/theoraplayer/native/theora/lib/arm/armenc.h

@@ -0,0 +1,51 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armenc_H)
+# define _arm_armenc_H (1)
+# include "armint.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_arm
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+
+# include "../encint.h"
+
+# if defined(OC_ARM_ASM)
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc);
+
+#  if defined(OC_ARM_ASM_EDSP)
+#   if defined(OC_ARM_ASM_MEDIA)
+#    if defined(OC_ARM_ASM_NEON)
+unsigned oc_enc_frag_satd_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_neon(int *_dc,
+ const unsigned char *_src,int _ystride);
+
+void oc_enc_enquant_table_init_neon(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_neon(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_neon(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 220 - 0
modules/theoraplayer/native/theora/lib/arm/armencfrag.s

@@ -0,0 +1,220 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_enc_frag_satd_neon
+	EXPORT	oc_enc_frag_satd2_neon
+	EXPORT	oc_enc_frag_intra_satd_neon
+
+oc_enc_frag_satd_neon PROC
+	; r0 = int                 *_dc
+	; r1 = const unsigned char *_src
+	; r2 = const unsigned char *_ref
+	; r3 = int                  _ystride
+	; Load src and subtract ref, expanding to 16 bits.
+	VLD1.64		{D16},[r1@64],r3
+	VLD1.64		{D0}, [r2],r3
+	VSUBL.U8	Q8, D16,D0
+	VLD1.64		{D18},[r1@64],r3
+	VLD1.64		{D1}, [r2],r3
+	VSUBL.U8	Q9, D18,D1
+	VLD1.64		{D20},[r1@64],r3
+	VLD1.64		{D2}, [r2],r3
+	VSUBL.U8	Q10,D20,D2
+	VLD1.64		{D22},[r1@64],r3
+	VLD1.64		{D3}, [r2],r3
+	VSUBL.U8	Q11,D22,D3
+	VLD1.64		{D24},[r1@64],r3
+	VLD1.64		{D4}, [r2],r3
+	VSUBL.U8	Q12,D24,D4
+	VLD1.64		{D26},[r1@64],r3
+	VLD1.64		{D5}, [r2],r3
+	VSUBL.U8	Q13,D26,D5
+	VLD1.64		{D28},[r1@64],r3
+	VLD1.64		{D6}, [r2],r3
+	VSUBL.U8	Q14,D28,D6
+	VLD1.64		{D30},[r1@64]
+	VLD1.64		{D7}, [r2]
+	VSUBL.U8	Q15,D30,D7
+oc_int_frag_satd_neon
+	; Hadamard Stage A
+	VADD.I16	Q0, Q8, Q12
+	VSUB.I16	Q12,Q8, Q12
+	VSUB.I16	Q1, Q9, Q13
+	VADD.I16	Q9, Q9, Q13
+	VSUB.I16	Q2, Q10,Q14
+	VADD.I16	Q10,Q10,Q14
+	VADD.I16	Q3, Q11,Q15
+	VSUB.I16	Q15,Q11,Q15
+	; Hadamard Stage B
+	VADD.I16	Q8, Q0, Q10
+	VSUB.I16	Q0, Q0, Q10
+	VSUB.I16	Q11,Q9, Q3
+	VADD.I16	Q3, Q9, Q3
+	VSUB.I16	Q14,Q12,Q2
+	VADD.I16	Q2, Q12,Q2
+	VADD.I16	Q13,Q1, Q15
+	VSUB.I16	Q1, Q1, Q15
+	; Hadamard Stage C & Start 8x8 Transpose
+	VSUB.I16	Q9, Q8, Q3
+	VADD.I16	Q8, Q8, Q3
+	VTRN.16		Q8, Q9
+	VADD.I16	Q10,Q0, Q11
+	VSUB.I16	Q11,Q0, Q11
+	VTRN.16		Q10,Q11
+	VADD.I16	Q12,Q2, Q13
+	VTRN.32		Q8, Q10
+	VSUB.I16	Q13,Q2, Q13
+	VTRN.32		Q9, Q11
+	VSUB.I16	Q15,Q14,Q1
+	VTRN.16		Q12,Q13
+	VADD.I16	Q14,Q14,Q1
+	VTRN.16		Q14,Q15
+	VTRN.32		Q12,Q14
+	VSWP		D17,D24
+	; Hadamard Stage A & Finish 8x8 Transpose
+	VADD.I16	Q0, Q8, Q12
+	VTRN.32		Q13,Q15
+	VSUB.I16	Q12,Q8, Q12
+	VSWP		D19,D26
+	VSUB.I16	Q1, Q9, Q13
+	VSWP		D21,D28
+	VADD.I16	Q9, Q9, Q13
+	VSWP		D23,D30
+	VSUB.I16	Q2, Q10,Q14
+	VADD.I16	Q10,Q10,Q14
+	VADD.I16	Q3, Q11,Q15
+	VSUB.I16	Q15,Q11,Q15
+	; Hadamard Stage B
+	VADD.I16	Q8, Q0, Q10
+	VSUB.I16	Q0, Q0, Q10
+	VSUB.I16	Q11,Q9, Q3
+	VADD.I16	Q3, Q9, Q3
+	VSUB.I16	Q14,Q12,Q2
+	VADD.I16	Q2, Q12,Q2
+	VADD.I16	Q13,Q1, Q15
+	VSUB.I16	Q1, Q1, Q15
+	; Hadamard Stage C & abs & accum
+	VNEG.S16	Q9, Q3
+	; Compute the (signed) DC component and save it off.
+	VADDL.S16	Q10,D16,D6
+	VABD.S16	Q12,Q8, Q9
+	VABD.S16	Q15,Q11,Q0
+	VST1.32		D20[0],[r0]
+	; Remove the (abs) DC component from the total.
+	MOV	r3,#0
+	VMOV.I16	D24[0],r3
+	VABA.S16	Q12,Q13,Q2
+	VABA.S16	Q15,Q14,Q1
+	VNEG.S16	Q0, Q0
+	VNEG.S16	Q2, Q2
+	VNEG.S16	Q1, Q1
+	VABA.S16	Q12,Q8, Q3
+	VABA.S16	Q15,Q11,Q0
+	VABA.S16	Q12,Q13,Q2
+	VABA.S16	Q15,Q14,Q1
+	; We're now using all 16 bits of each value.
+	VPADDL.U16	Q12,Q12
+	VPADAL.U16	Q12,Q15
+	VADD.U32	D24,D24,D25
+	VPADDL.U32	D24,D24
+	VMOV.U32	r0, D24[0]
+	MOV	PC, r14
+	ENDP
+
+oc_enc_frag_satd2_neon PROC
+	; r0 = int                 *_dc
+	; r1 = const unsigned char *_src
+	; r2 = const unsigned char *_ref1
+	; r3 = const unsigned char *_ref2
+	; r12= int                  _ystride
+	LDR	r12,[r13]
+	; Load src and subtract (ref1+ref2>>1), expanding to 16 bits.
+	VLD1.64		{D0}, [r2],r12
+	VLD1.64		{D1}, [r3],r12
+	VLD1.64		{D16},[r1@64],r12
+	VHADD.U8	D0, D0, D1
+	VLD1.64		{D2}, [r2],r12
+	VLD1.64		{D3}, [r3],r12
+	VSUBL.U8	Q8, D16,D0
+	VLD1.64		{D18},[r1@64],r12
+	VHADD.U8	D2, D2, D3
+	VLD1.64		{D4}, [r2],r12
+	VLD1.64		{D5}, [r3],r12
+	VSUBL.U8	Q9, D18,D2
+	VLD1.64		{D20},[r1@64],r12
+	VHADD.U8	D4, D4, D5
+	VLD1.64		{D6}, [r2],r12
+	VLD1.64		{D7}, [r3],r12
+	VSUBL.U8	Q10,D20,D4
+	VLD1.64		{D22},[r1@64],r12
+	VHADD.U8	D6, D6, D7
+	VLD1.64		{D0}, [r2],r12
+	VLD1.64		{D1}, [r3],r12
+	VSUBL.U8	Q11,D22,D6
+	VLD1.64		{D24},[r1@64],r12
+	VHADD.U8	D0, D0, D1
+	VLD1.64		{D2}, [r2],r12
+	VLD1.64		{D3}, [r3],r12
+	VSUBL.U8	Q12,D24,D0
+	VLD1.64		{D26},[r1@64],r12
+	VHADD.U8	D2, D2, D3
+	VLD1.64		{D4}, [r2],r12
+	VLD1.64		{D5}, [r3],r12
+	VSUBL.U8	Q13,D26,D2
+	VLD1.64		{D28},[r1@64],r12
+	VHADD.U8	D4, D4, D5
+	VLD1.64		{D6}, [r2]
+	VSUBL.U8	Q14,D28,D4
+	VLD1.64		{D7}, [r3]
+	VHADD.U8	D6, D6, D7
+	VLD1.64		{D30},[r1@64]
+	VSUBL.U8	Q15,D30,D6
+	B	oc_int_frag_satd_neon
+	ENDP
+
+oc_enc_frag_intra_satd_neon PROC
+	; r0 = int                 *_dc
+	; r1 = const unsigned char *_src
+	; r2 = int                  _ystride
+	; Load and subtract 128 from src, expanding to 16 bits.
+	VMOV.I8		D0,#128
+	VLD1.64		{D16},[r1@64],r2
+	VSUBL.U8	Q8, D16,D0
+	VLD1.64		{D18},[r1@64],r2
+	VSUBL.U8	Q9, D18,D0
+	VLD1.64		{D20},[r1@64],r2
+	VSUBL.U8	Q10,D20,D0
+	VLD1.64		{D22},[r1@64],r2
+	VSUBL.U8	Q11,D22,D0
+	VLD1.64		{D24},[r1@64],r2
+	VSUBL.U8	Q12,D24,D0
+	VLD1.64		{D26},[r1@64],r2
+	VSUBL.U8	Q13,D26,D0
+	VLD1.64		{D28},[r1@64],r2
+	VSUBL.U8	Q14,D28,D0
+	VLD1.64		{D30},[r1@64]
+	VSUBL.U8	Q15,D30,D0
+	B	oc_int_frag_satd_neon
+	ENDP
+ ]
+
+	END
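oc_enc_frag_satd_neon and its companions above compute a SATD score: subtract the prediction (or the constant 128 for intra), run an 8x8 Hadamard transform one dimension at a time, then sum the absolute coefficients, storing the signed DC term through the _dc pointer and excluding its absolute value from the returned total. Below is a plain-C sketch of the same measure, with made-up names and none of the assembly's transpose tricks; it mirrors the structure, not the exact scaling or return convention.

#include <stdlib.h>

static unsigned satd_8x8_sketch(int *dc, const unsigned char *src,
                                const unsigned char *ref, int ystride) {
  int buf[64];
  int i, j;
  /* Difference block. */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      buf[i * 8 + j] = src[i * ystride + j] - ref[i * ystride + j];
  /* 1-D Hadamard on rows, then on columns (butterflies only, no scaling). */
  for (int pass = 0; pass < 2; pass++) {
    for (i = 0; i < 8; i++) {
      int v[8];
      for (j = 0; j < 8; j++)
        v[j] = pass ? buf[j * 8 + i] : buf[i * 8 + j];
      for (int len = 1; len < 8; len <<= 1)
        for (j = 0; j < 8; j += len << 1)
          for (int k = 0; k < len; k++) {
            int a = v[j + k], b = v[j + k + len];
            v[j + k] = a + b;
            v[j + k + len] = a - b;
          }
      for (j = 0; j < 8; j++)
        if (pass) buf[j * 8 + i] = v[j]; else buf[i * 8 + j] = v[j];
    }
  }
  unsigned total = 0;
  for (i = 0; i < 64; i++) total += (unsigned)abs(buf[i]);
  *dc = buf[0];                          /* signed DC, reported separately  */
  return total - (unsigned)abs(buf[0]);  /* drop the (abs) DC from the sum  */
}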

+ 162 - 0
modules/theoraplayer/native/theora/lib/arm/armenquant.s

@@ -0,0 +1,162 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_enc_enquant_table_init_neon
+	EXPORT	oc_enc_enquant_table_fixup_neon
+	EXPORT	oc_enc_quantize_neon
+
+oc_enc_enquant_table_init_neon PROC
+	; r0 = void               *_enquant
+	; r1 = const ogg_uint16_t  _dequant[64]
+	STMFD r13!,{r0,r14}
+	; Initialize the table using the C routine
+	BLX	oc_enc_enquant_table_init_c
+	LDR	r0, [r13],#4
+	MOV	r1, #2
+	; Now partially de-interleave it, so that the first row is all
+	;  multipliers, the second row is all shift factors, etc.
+	; Also, negate the shifts for use by VSHL.
+oeeti_neon_lp
+	SUBS	r1, r1, #1
+	VLDMIA		r0, {D16-D31}
+	VUZP.16		Q8, Q9
+	VNEG.S16	Q9, Q9
+	VUZP.16		Q10,Q11
+	VNEG.S16	Q11,Q11
+	VUZP.16		Q12,Q13
+	VNEG.S16	Q13,Q13
+	VUZP.16		Q14,Q15
+	VNEG.S16	Q15,Q15
+	VSTMIA		r0!,{D16-D31}
+	BNE	oeeti_neon_lp
+	LDR	PC, [r13],#4
+	ENDP
+
+oc_enc_enquant_table_fixup_neon PROC
+	; r0 = void *_enquant[3][3][2]
+	; r1 = int   _nqis
+	STR	r14,[r13,#-4]!
+oeetf_neon_lp1
+	SUBS	r1, r1, #1
+	BEQ	oeetf_neon_end1
+	MOV	r14,#3
+oeetf_neon_lp2
+	LDR	r2, [r0]
+	SUBS	r14,r14,#1
+	LDRH	r3, [r2]
+	LDRH	r12,[r2,#16]
+	LDR	r2, [r0,#8]
+	STRH	r3, [r2]
+	STRH	r12,[r2,#16]
+	LDR	r2, [r0,#4]
+	LDRH	r3, [r2]
+	LDRH	r12,[r2,#16]
+	LDR	r2, [r0,#12]
+	ADD	r0, r0, #24
+	STRH	r3, [r2]
+	STRH	r12,[r2,#16]
+	BNE	oeetf_neon_lp2
+	SUB	r0, r0, #64
+	B	oeetf_neon_lp1
+oeetf_neon_end1
+	LDR	PC, [r13],#4
+	ENDP
+
+oc_enc_quantize_neon PROC
+	; r0 = ogg_int16_t        _qdct[64]
+	; r1 = const ogg_int16_t  _dct[64]
+	; r2 = const ogg_int16_t  _dequant[64]
+	; r3 = const void        *_enquant
+	STMFD	r13!,{r4,r5,r14}
+	; The loop counter goes in the high half of r14.
+	MOV	r14,#0xFFFCFFFF
+oeq_neon_lp
+	; Load the next two rows of the data and the quant matrices.
+	VLD1.64		{D16,D17,D18,D19},[r1@128]!
+	VLD1.64		{D20,D21,D22,D23},[r2@128]!
+	; Add in the signed rounding bias from the quantizers.
+	; Note that the VHADD relies on the fact that the quantizers are all
+	;  even (they're in fact multiples of four) in order to round correctly
+	;  on the entries being negated.
+	VSHR.S16	Q0, Q8, #15
+	VSHR.S16	Q1, Q9, #15
+	VLD1.64		{D24,D25,D26,D27},[r3@128]!
+	VHADD.S16	Q10,Q0, Q10
+	VHADD.S16	Q11,Q1, Q11
+	VLD1.64		{D28,D29,D30,D31},[r3@128]!
+	ADDS	r14,r14,#1<<16
+	VEOR.S16	Q10,Q0, Q10
+	VEOR.S16	Q11,Q1, Q11
+	VADD.S16	Q8, Q8, Q10
+	VADD.S16	Q9, Q9, Q11
+	; Perform the actual division and save the result.
+	VQDMULH.S16	Q12,Q8, Q12
+	VQDMULH.S16	Q14,Q9, Q14
+	VADD.S16	Q8, Q8, Q8
+	VADD.S16	Q9, Q9, Q9
+	VADD.S16	Q8, Q8, Q12
+	VADD.S16	Q9, Q9, Q14
+	VSHL.S16	Q8, Q13
+	VSHL.S16	Q9, Q15
+	VSUB.S16	Q8, Q8, Q0
+	VSUB.S16	Q9, Q9, Q1
+	VST1.64		{D16,D17,D18,D19},[r0@128]!
+	; Now pull out a bitfield marking the non-zero coefficients.
+	VQMOVN.S16	D16,Q8
+	VQMOVN.S16	D17,Q9
+	VCEQ.S8		Q8, #0
+	; Sadly, NEON has no PMOVMSKB; emulating it requires 6 instructions.
+	VNEG.S8		Q8, Q8          ; D16=.......3.......2.......1.......0
+	                                ;     .......7.......6.......5.......4
+	                                ; D17=.......B.......A.......9.......8
+	                                ;     .......F.......E.......D.......C
+	VZIP.8		D16,D17         ; D16=.......9.......1.......8.......0
+	                                ;     .......B.......3.......A.......2
+	                                ; D17=.......D.......5.......C.......4
+	                                ;     .......F.......7.......E.......6
+	VSLI.8		D16,D17,#4      ; D16=...D...9...5...1...C...8...4...0
+	                                ;     ...F...B...7...3...E...A...6...2
+	; Shift over the bitfields from previous iterations and
+	;  finish compacting the bitfield from the last iteration.
+	ORR	r4, r4, r5, LSL #2      ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+	ORR	r4, r4, r4, LSR #15     ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+	PKHTB	r14,r14,r12,ASR #16     ; r14=i|A
+	PKHBT	r12,r4, r12,LSL #16     ; r12=B|C
+	VMOV		r4, r5, D16
+	BLT	oeq_neon_lp
+	; Start with the low half while the NEON register transfers.
+	PKHBT	r0, r14,r12             ; r0 =B|A
+	MVNS	r0, r0
+	CLZNE	r0, r0
+	RSBNE	r0, r0, #31
+	; Stall 8-10 more cycles waiting for the last transfer.
+	ORR	r4, r4, r5, LSL #2      ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+	ORR	r4, r4, r4, LSR #15     ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+	PKHBT	r1, r12,r4, LSL #16     ; r1 = D|C
+	MVNS	r1, r1
+	CLZNE	r1, r1
+	RSBNE	r0, r1, #63
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+ ]
+
+	END
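oc_enc_quantize_neon above divides each coefficient by its quantizer without a divide instruction: the table built by oc_enc_enquant_table_init_neon holds, per coefficient, a multiplier and a negated shift (negated for VSHL), and VQDMULH performs the multiply-high after a sign-aware rounding bias is folded in. The sketch below only shows why a precomputed multiplier and shift can replace an exact unsigned divide; all names in it are invented, and it omits libtheora's signed handling and exact table layout.

#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t mul; int shift; } recip_t;

/* Build a multiplier/shift pair for a quantizer q (1 <= q <= 65535):
 * pick the smallest s with 2^s >= q and m = ceil(2^(16+s)/q). */
static recip_t recip_make(uint32_t q) {
  recip_t r = { 0, 0 };
  while ((1u << r.shift) < q) r.shift++;
  r.mul = (uint32_t)(((1ull << (16 + r.shift)) + q - 1) / q);
  return r;
}

/* floor(x/q) for any 16-bit x, using only a multiply-high and a shift. */
static uint32_t recip_div(uint32_t x, recip_t r) {
  return (uint32_t)(((uint64_t)x * r.mul) >> (16 + r.shift));
}

int main(void) {
  static const uint32_t qs[] = { 2, 3, 24, 255, 65535 };
  for (int i = 0; i < 5; i++) {
    recip_t r = recip_make(qs[i]);
    for (uint32_t x = 0; x < 65536; x++)
      if (recip_div(x, r) != x / qs[i]) { printf("mismatch\n"); return 1; }
  }
  printf("reciprocal divide matches plain division for all tested cases\n");
  return 0;
}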

+ 656 - 0
modules/theoraplayer/native/theora/lib/arm/armfrag.asm

@@ -0,0 +1,656 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.include "armopts-gnu.s"
+
+@ Vanilla ARM v4 versions
+	.global	oc_frag_copy_list_arm
+	.global	oc_frag_recon_intra_arm
+	.global	oc_frag_recon_inter_arm
+	.global	oc_frag_recon_inter2_arm
+
+	.type	oc_frag_copy_list_arm, %function; oc_frag_copy_list_arm: @ PROC
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		@ r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp:
+	LDR	r11,[r14,r4,LSL #2]	@ r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	ADD	r4, r1, r11		@ r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		@ r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		@ r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end:
+	LDMFD	r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm:
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	.size oc_frag_copy_list_arm, .-oc_frag_copy_list_arm	@ ENDP
+
+	.type	oc_frag_recon_inter_arm, %function; oc_frag_recon_inter_arm: @ PROC
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src
+	@ r2 =       int            ystride
+	@ r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm:
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	.size oc_frag_recon_inter_arm, .-oc_frag_recon_inter_arm	@ ENDP
+
+	.type	oc_frag_recon_inter2_arm, %function; oc_frag_recon_inter2_arm: @ PROC
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src1
+	@ r2 = const unsigned char *src2
+	@ r3 =       int            ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm:
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	.size oc_frag_recon_inter2_arm, .-oc_frag_recon_inter2_arm	@ ENDP
+
+  .if OC_ARM_ASM_EDSP
+	.global	oc_frag_copy_list_edsp
+
+	.type	oc_frag_copy_list_edsp, %function; oc_frag_copy_list_edsp: @ PROC
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		@ r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp:
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	@ r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	LDRD	r6, [r4, r5]!		@ r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	@ Stall
+	STRD	r6, [r5, r0]!		@ r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	@ Stall
+	LDRD	r6, [r4, r2]!	@ On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	@ loads causes a stall, but thats no worse
+	LDRD	r10,[r4, r2]!	@ than us only doing 2, and having to do
+				@ another pair of LDRD/STRD later on.
+	@ Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end:
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_frag_copy_list_edsp, .-oc_frag_copy_list_edsp	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_MEDIA
+	.global	oc_frag_recon_intra_v6
+	.global	oc_frag_recon_inter_v6
+	.global	oc_frag_recon_inter2_v6
+
+	.type	oc_frag_recon_intra_v6, %function; oc_frag_recon_intra_v6: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	LDR	r6, =0x00800080
+ofrintra_v6_lp:
+	LDRD	r2, [r12],#8	@ r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	@ r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		@ r2 = __11__00
+	USAT16	r3, #8, r3		@ r3 = __33__22
+	USAT16	r4, #8, r4		@ r4 = __55__44
+	USAT16	r5, #8, r5		@ r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	@ r2 = __111100
+	ORR	r3, r3, r3, LSR #8	@ r3 = __333322
+	ORR	r4, r4, r4, LSR #8	@ r4 = __555544
+	ORR	r5, r5, r5, LSR #8	@ r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     @ r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     @ r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	.size oc_frag_recon_intra_v6, .-oc_frag_recon_intra_v6	@ ENDP
+
+	.type	oc_frag_recon_inter_v6, %function; oc_frag_recon_inter_v6: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp:
+	LDRD	r6, [r3], #8		@ r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+  .if OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	@ Unaligned ; r4 = 33221100 r5 = 77665544
+  .else
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+  .endif
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r6,r4			@ r6 = __22__00
+	UXTB16	r4,r4, ROR #8		@ r4 = __33__11
+	QADD16	r12,r12,r6		@ r12= xx22xx00
+	QADD16	r4, r7, r4		@ r4 = xx33xx11
+	LDRD	r6, [r3], #8		@ r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		@ r4 = __33__11
+	USAT16	r12,#8,r12		@ r12= __22__00
+	ORR	r4, r12,r4, LSL #8	@ r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 77775555
+	UXTB16	r6,r5			@ r6 = __66__44
+	UXTB16	r5,r5, ROR #8		@ r5 = __77__55
+	QADD16	r12,r12,r6		@ r12= xx66xx44
+	QADD16	r5, r7, r5		@ r5 = xx77xx55
+	USAT16	r12,#8, r12		@ r12= __66__44
+	USAT16	r5, #8, r5		@ r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	@ r5 = 77665544
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	.size oc_frag_recon_inter_v6, .-oc_frag_recon_inter_v6	@ ENDP
+
+	.type	oc_frag_recon_inter2_v6, %function; oc_frag_recon_inter2_v6: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp:
+	LDRD	r6, [r12,#8]	@ r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	@ Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	@ Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	@ r9 = 77775555
+	UHADD8	r4, r4, r5	@ r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			@ r5 = __66__44
+	UXTB16	r4, r4, ROR #8		@ r4 = __77__55
+	QADD16	r8, r8, r5		@ r8 = xx66xx44
+	QADD16	r9, r9, r4		@ r9 = xx77xx55
+	LDRD	r6,[r12],#16	@ r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		@ r8 = __66__44
+	LDR	r4, [r1], r3	@ Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		@ r9 = __77__55
+	LDR	r5, [r2], r3	@ Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	@ r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 22220000
+	UHADD8	r4, r4, r5	@ r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r5, r4			@ r5 = __22__00
+	UXTB16	r4, r4, ROR #8		@ r4 = __33__11
+	QADD16	r8, r8, r5		@ r8 = xx22xx00
+	QADD16	r7, r7, r4		@ r7 = xx33xx11
+	USAT16	r8, #8, r8		@ r8 = __22__00
+	USAT16	r7, #8, r7		@ r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	@ r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	.size oc_frag_recon_inter2_v6, .-oc_frag_recon_inter2_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	oc_frag_copy_list_neon
+	.global	oc_frag_recon_intra_neon
+	.global	oc_frag_recon_inter_neon
+	.global	oc_frag_recon_inter2_neon
+
+	.type	oc_frag_copy_list_neon, %function; oc_frag_copy_list_neon: @ PROC
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		@ r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	@ Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	@ Stall (on XScale)
+	MOV	r7, r6			@ Guarantee PLD points somewhere valid.
+ofcl_neon_lp:
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4,:64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4,:64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4,:64], r2
+	LDRGT	r6, [r3,#4]!		@ r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4,:64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4,:64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4,:64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4,:64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4,:64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5,:64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5,:64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5,:64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5,:64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end:
+	LDMFD	r13!,{r4-r7,PC}
+	.size oc_frag_copy_list_neon, .-oc_frag_copy_list_neon	@ ENDP
+
+	.type	oc_frag_recon_intra_neon, %function; oc_frag_recon_intra_neon: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	VMOV.I16	Q0, #128
+	VLDMIA	r2,  {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	@ D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	@ D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	@ D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0,:64], r1
+	VQMOVUN.S16	D19,Q11	@ D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0,:64], r1
+	VQMOVUN.S16	D20,Q12	@ D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0,:64], r1
+	VQMOVUN.S16	D21,Q13	@ D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0,:64], r1
+	VQMOVUN.S16	D22,Q14	@ D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0,:64], r1
+	VQMOVUN.S16	D23,Q15	@ D23= !!,:@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0,:64], r1
+	VST1.64	{D22},[r0,:64], r1
+	VST1.64	{D23},[r0,:64], r1
+	MOV	PC,R14
+	.size oc_frag_recon_intra_neon, .-oc_frag_recon_intra_neon	@ ENDP
+
+	.type	oc_frag_recon_inter_neon, %function; oc_frag_recon_inter_neon: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	@ etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r2
+	VST1.64	{D22},[r0,:64], r2
+	VST1.64	{D23},[r0,:64], r2
+	MOV	PC,R14
+	.size oc_frag_recon_inter_neon, .-oc_frag_recon_inter_neon	@ ENDP
+
+	.type	oc_frag_recon_inter2_neon, %function; oc_frag_recon_inter2_neon: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	@ Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		@ etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r3
+	VST1.64	{D22},[r0,:64], r3
+	VST1.64	{D23},[r0,:64], r3
+	MOV	PC,R14
+	.size oc_frag_recon_inter2_neon, .-oc_frag_recon_inter2_neon	@ ENDP
+  .endif
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits
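The reconstruction routines in this file all follow the pattern spelled out in their register comments: add the 16-bit residue block to a predictor (a flat 128 for intra, one reference for inter, the rounded-down average of two references for inter2) and clamp each result to 0..255 before storing. A hedged scalar sketch with hypothetical names; the assembly's types are ogg_int16_t, shown here as plain short, and this is illustrative rather than libtheora's C fallback.

static unsigned char clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void frag_recon_intra_sketch(unsigned char *dst, int ystride,
                                    const short residue[64]) {
  for (int i = 0; i < 8; i++, dst += ystride)
    for (int j = 0; j < 8; j++)
      dst[j] = clamp255(residue[i * 8 + j] + 128);
}

static void frag_recon_inter2_sketch(unsigned char *dst,
                                     const unsigned char *src1,
                                     const unsigned char *src2,
                                     int ystride, const short residue[64]) {
  for (int i = 0; i < 8; i++, dst += ystride, src1 += ystride, src2 += ystride)
    for (int j = 0; j < 8; j++)
      dst[j] = clamp255(residue[i * 8 + j] + ((src1[j] + src2[j]) >> 1));
}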

+ 655 - 0
modules/theoraplayer/native/theora/lib/arm/armfrag.s

@@ -0,0 +1,655 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+; Vanilla ARM v4 versions
+	EXPORT	oc_frag_copy_list_arm
+	EXPORT	oc_frag_recon_intra_arm
+	EXPORT	oc_frag_recon_inter_arm
+	EXPORT	oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		; r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp
+	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end
+	LDMFD	r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+
+oc_frag_recon_inter_arm PROC
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src
+	; r2 =       int            ystride
+	; r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	ENDP
+
+oc_frag_recon_inter2_arm PROC
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src1
+	; r2 = const unsigned char *src2
+	; r3 =       int            ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
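Taken together, the three vanilla routines above implement the usual Theora fragment reconstruction over an 8x8 block: intra adds 128 to each residual, inter adds the residual to one predictor, and inter2 adds it to the truncated average of two predictors, with every result clamped to a byte. A compact C sketch of the inter2 case (illustrative only; clamp255 is the helper sketched earlier):

    #include <stdint.h>

    static void frag_recon_inter2_sketch(unsigned char *dst,
                                         const unsigned char *src1,
                                         const unsigned char *src2,
                                         int ystride, const int16_t *residue) {
      int i, j;
      for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
          /* UHADD8 in the v6 path below computes this truncated byte average. */
          dst[j] = clamp255(((src1[j] + src2[j]) >> 1) + residue[8 * i + j]);
        dst += ystride; src1 += ystride; src2 += ystride;
      }
    }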
+ [ OC_ARM_ASM_EDSP
+	EXPORT	oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		; r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	LDRD	r6, [r4, r5]!		; r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	; Stall
+	STRD	r6, [r5, r0]!		; r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	; Stall
+	LDRD	r6, [r4, r2]!	; On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	; loads causes a stall, but that's no worse
+	LDRD	r10,[r4, r2]!	; than us only doing 2, and having to do
+				; another pair of LDRD/STRD later on.
+	; Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_frag_recon_intra_v6
+	EXPORT	oc_frag_recon_inter_v6
+	EXPORT	oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	LDR	r6, =0x00800080
+ofrintra_v6_lp
+	LDRD	r2, [r12],#8	; r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	; r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		; r2 = __11__00
+	USAT16	r3, #8, r3		; r3 = __33__22
+	USAT16	r4, #8, r4		; r4 = __55__44
+	USAT16	r5, #8, r5		; r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	; r2 = __111100
+	ORR	r3, r3, r3, LSR #8	; r3 = __333322
+	ORR	r4, r4, r4, LSR #8	; r4 = __555544
+	ORR	r5, r5, r5, LSR #8	; r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	ENDP
+
+oc_frag_recon_inter_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp
+	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+ ]
+	PKHBT	r12,r6, r7, LSL #16	; r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r6,r4			; r6 = __22__00
+	UXTB16	r4,r4, ROR #8		; r4 = __33__11
+	QADD16	r12,r12,r6		; r12= xx22xx00
+	QADD16	r4, r7, r4		; r4 = xx33xx11
+	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		; r4 = __33__11
+	USAT16	r12,#8,r12		; r12= __22__00
+	ORR	r4, r12,r4, LSL #8	; r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	; r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
+	UXTB16	r6,r5			; r6 = __66__44
+	UXTB16	r5,r5, ROR #8		; r5 = __77__55
+	QADD16	r12,r12,r6		; r12= xx66xx44
+	QADD16	r5, r7, r5		; r5 = xx77xx55
+	USAT16	r12,#8, r12		; r12= __66__44
+	USAT16	r5, #8, r5		; r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	; r5 = 77665544
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_inter2_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp
+	LDRD	r6, [r12,#8]	; r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
+	UHADD8	r4, r4, r5	; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			; r5 = __66__44
+	UXTB16	r4, r4, ROR #8		; r4 = __77__55
+	QADD16	r8, r8, r5		; r8 = xx66xx44
+	QADD16	r9, r9, r4		; r9 = xx77xx55
+	LDRD	r6,[r12],#16	; r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		; r8 = __66__44
+	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		; r9 = __77__55
+	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
+	UHADD8	r4, r4, r5	; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r5, r4			; r5 = __22__00
+	UXTB16	r4, r4, ROR #8		; r4 = __33__11
+	QADD16	r8, r8, r5		; r8 = xx22xx00
+	QADD16	r7, r7, r4		; r7 = xx33xx11
+	USAT16	r8, #8, r8		; r8 = __22__00
+	USAT16	r7, #8, r7		; r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_frag_copy_list_neon
+	EXPORT	oc_frag_recon_intra_neon
+	EXPORT	oc_frag_recon_inter_neon
+	EXPORT	oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	; Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	; Stall (on XScale)
+	MOV	r7, r6			; Guarantee PLD points somewhere valid.
+ofcl_neon_lp
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4@64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4@64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4@64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4@64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4@64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5@64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5@64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5@64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5@64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_intra_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	VMOV.I16	Q0, #128
+	VLDMIA	r2,  {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	; D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	; D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	; D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0@64], r1
+	VQMOVUN.S16	D19,Q11	; D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0@64], r1
+	VQMOVUN.S16	D20,Q12	; D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0@64], r1
+	VQMOVUN.S16	D21,Q13	; D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0@64], r1
+	VQMOVUN.S16	D22,Q14	; D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0@64], r1
+	VQMOVUN.S16	D23,Q15	; D23= !!@@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0@64], r1
+	VST1.64	{D22},[r0@64], r1
+	VST1.64	{D23},[r0@64], r1
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	; etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r2
+	VST1.64	{D22},[r0@64], r2
+	VST1.64	{D23},[r0@64], r2
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter2_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		; etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r3
+	VST1.64	{D22},[r0@64], r3
+	VST1.64	{D23},[r0@64], r3
+	MOV	PC,R14
+	ENDP
+ ]
+
+	END

+ 1854 - 0
modules/theoraplayer/native/theora/lib/arm/armidct.asm

@@ -0,0 +1,1854 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.include "armopts-gnu.S"
+
+	.global	oc_idct8x8_1_arm
+	.global	oc_idct8x8_arm
+
+	.type	oc_idct8x8_1_arm, %function; oc_idct8x8_1_arm: @ PROC
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	.size oc_idct8x8_1_arm, .-oc_idct8x8_1_arm	@ ENDP
+
+	.type	oc_idct8x8_arm, %function; oc_idct8x8_arm: @ PROC
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	SUB	r2, r1, #8*16
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_arm, .-oc_idct8x8_arm	@ ENDP
+
+	.type	oc_idct8x8_10_arm, %function; oc_idct8x8_10_arm: @ PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+oc_idct8x8_10_arm_cols:
+@ Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_10_arm, .-oc_idct8x8_10_arm	@ ENDP
+
+	.type	oc_idct8x8_6_arm, %function; oc_idct8x8_6_arm: @ PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	.size oc_idct8x8_6_arm, .-oc_idct8x8_6_arm	@ ENDP
+
+	.type	oc_idct8x8_3_arm, %function; oc_idct8x8_3_arm: @ PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	.size oc_idct8x8_3_arm, .-oc_idct8x8_3_arm	@ ENDP
+
+	.type	idct1core_arm, %function; idct1core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	.size idct1core_arm, .-idct1core_arm	@ ENDP
+
+	.type	idct2core_arm, %function; idct2core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		@ y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	@ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	@ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	@ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	@ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct2core_arm, .-idct2core_arm	@ ENDP
+
+	.type	idct2core_down_arm, %function; idct2core_down_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	@r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	@r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	@r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	@r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		@ y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct2core_down_arm, .-idct2core_down_arm	@ ENDP
+
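The "TODO: This is wrong" notes in the *_down cores above and below all describe the same discrepancy: the reference C code stores each intermediate into a 16-bit temporary array (truncating it) and shifts on the next pass, whereas this code keeps the value in a 32-bit register and shifts it directly, so up to 4 extra high bits survive. A minimal C illustration of the difference (variable names are made up here; only the arithmetic is the point):

    #include <stdint.h>

    /* v stands for an intermediate such as t[0]+t[7]+8. */
    static void truncation_difference(int32_t v, int16_t *out_c, int16_t *out_asm) {
      /* C path: the value is stored to 16-bit RAM first, then shifted later. */
      *out_c = (int16_t)((int16_t)v >> 4);
      /* ARM path above: the full 32-bit value is shifted, then stored. */
      *out_asm = (int16_t)(v >> 4);
      /* The two only differ when v overflows the int16 range. */
    }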
+	.type	idct3core_arm, %function; idct3core_arm: @ PROC
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2] = t[0]-t[2]
+					@ r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7]
+	ADD	r5, r10,r5		@ r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		@ r12= t[2]+t2[5]
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	@ r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	@ r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	@ r12= t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	@ r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	.size idct3core_arm, .-idct3core_arm	@ ENDP
+
+	.type	idct3core_down_arm, %function; idct3core_down_arm: @ PROC
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2]+8 = t[0]-t[2]+8
+					@ r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		@ r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		@ r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	@ r3 = t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	@ r10= t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	@ r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	@ r9 = t2[3] - t[4]  + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	.size idct3core_down_arm, .-idct3core_down_arm	@ ENDP
+
+	.type	idct4core_arm, %function; idct4core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		@ r11= t[0]+t2[7]
+	ADD	r6, r4, r6		@ r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		@ r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		@ r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	@ r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	@ r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	@ r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	@ r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	.size idct4core_arm, .-idct4core_arm	@ ENDP
+
+	.type	idct4core_down_arm, %function; idct4core_down_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		@ r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		@ r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		@ r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		@ r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	@ r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	@ r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	@ r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	@ r7 = t[3]-t2[4]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	.size idct4core_down_arm, .-idct4core_down_arm	@ ENDP
+
+	.type	idct8core_arm, %function; idct8core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ Stage 2
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3]
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2]
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7]
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6]
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5]
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	.size idct8core_arm, .-idct8core_arm	@ ENDP
+
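The register comments in idct8core_arm spell out the classic four-stage 8-point Theora transform. Written out in C it is roughly the following (a sketch built from those comments, not the libtheora reference code; it assumes 32-bit int and an arithmetic right shift of negative values, and it ignores the transposed store order and the +8 / >>4 rounding that the *_down variant adds):

    #include <stdint.h>

    #define OC_C1S7 64277
    #define OC_C2S6 60547
    #define OC_C3S5 54491
    #define OC_C4S4 46341
    #define OC_C5S3 36410
    #define OC_C6S2 25080
    #define OC_C7S1 12785

    static void idct8_sketch(int16_t *y, const int16_t *x) {
      /* Stage 1: scaled rotations (MUL followed by ASR #16 above). */
      int32_t t0 = OC_C4S4 * (int16_t)(x[0] + x[4]) >> 16;
      int32_t t1 = OC_C4S4 * (int16_t)(x[0] - x[4]) >> 16;
      int32_t t2 = (OC_C6S2 * x[2] >> 16) - (OC_C2S6 * x[6] >> 16);
      int32_t t3 = (OC_C2S6 * x[2] >> 16) + (OC_C6S2 * x[6] >> 16);
      int32_t t4 = (OC_C7S1 * x[1] >> 16) - (OC_C1S7 * x[7] >> 16);
      int32_t t7 = (OC_C1S7 * x[1] >> 16) + (OC_C7S1 * x[7] >> 16);
      int32_t t5 = (OC_C3S5 * x[5] >> 16) - (OC_C5S3 * x[3] >> 16);
      int32_t t6 = (OC_C5S3 * x[5] >> 16) + (OC_C3S5 * x[3] >> 16);
      /* Stage 2: 4-5 and 7-6 butterflies.  The spec truncates the differences
       * to 16 bits before the multiply; the asm above skips that (its TODO). */
      int32_t u4 = t4 + t5;
      int32_t u5 = OC_C4S4 * (int16_t)(t4 - t5) >> 16;
      int32_t u7 = t7 + t6;
      int32_t u6 = OC_C4S4 * (int16_t)(t7 - t6) >> 16;
      /* Stage 3. */
      int32_t v0 = t0 + t3, v3 = t0 - t3;
      int32_t v1 = t1 + t2, v2 = t1 - t2;
      int32_t w6 = u6 + u5, w5 = u6 - u5;
      /* Stage 4: final butterflies. */
      y[0] = (int16_t)(v0 + u7);  y[7] = (int16_t)(v0 - u7);
      y[1] = (int16_t)(v1 + w6);  y[6] = (int16_t)(v1 - w6);
      y[2] = (int16_t)(v2 + w5);  y[5] = (int16_t)(v2 - w5);
      y[3] = (int16_t)(v3 + u4);  y[4] = (int16_t)(v3 - u4);
    }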
+	.type	idct8core_down_arm, %function; idct8core_down_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+	@ Stage 2
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	ADD	r2, r2, #8<<16		@ r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		@ r6 = t[1]+8<<16
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3] + 8
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2] + 8
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4] + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	.size idct8core_down_arm, .-idct8core_down_arm	@ ENDP
+
+  .if OC_ARM_ASM_MEDIA
+	.global	oc_idct8x8_1_v6
+	.global	oc_idct8x8_v6
+
+	.type	oc_idct8x8_1_v6, %function; oc_idct8x8_1_v6: @ PROC
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	MOV	PC, r14
+	.size oc_idct8x8_1_v6, .-oc_idct8x8_1_v6	@ ENDP
+
+	.type	oc_idct8x8_v6, %function; oc_idct8x8_v6: @ PROC
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	@CMP	r2, #6
+	@BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-8*16]!
+	STRD	r4, [r1,#8]
+	STRD	r4, [r1,#16]
+	STRD	r4, [r1,#24]
+	STRD	r4, [r1,#32]
+	STRD	r4, [r1,#40]
+	STRD	r4, [r1,#48]
+	STRD	r4, [r1,#56]
+	STRD	r4, [r1,#64]
+	STRD	r4, [r1,#72]
+	STRD	r4, [r1,#80]
+	STRD	r4, [r1,#88]
+	STRD	r4, [r1,#96]
+	STRD	r4, [r1,#104]
+	STRD	r4, [r1,#112]
+	STRD	r4, [r1,#120]
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_v6, .-oc_idct8x8_v6	@ ENDP
+
+	.type	oc_idct8x8_10_v6, %function; oc_idct8x8_10_v6: @ PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	@ Align the stack.
+	ADD	r0, r0, r2	@ Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-4*16]!
+	STRD	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	@ Align the stack.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_10_v6, .-oc_idct8x8_10_v6	@ ENDP
+
+	.type	oc_idct8x8_3_v6, %function; oc_idct8x8_3_v6: @ PROC
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2_1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r8		@ Write to the final destination.
+@ Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	.size oc_idct8x8_3_v6, .-oc_idct8x8_3_v6	@ ENDP
+
+	.type	idct2_1core_v6, %function; idct2_1core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		@ r6 = x[1,0]
+	SMULWB	r12,r3, r2		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		@ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		@ r7 = <0|t[0,7]>
+@ Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	@ r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		@ r4 = <0|t[0,4]>
+	SADDSUBX	r5, r5, r5		@ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+@ Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	@ r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		@ r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		@ r3 = t[0]+t[7]
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		@ r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		@ r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		@ r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct2_1core_v6, .-idct2_1core_v6	@ ENDP
+  .endif
+
+	.balign 8
+OC_C7S1:
+	.word	12785 @ 31F1
+OC_C1S7:
+	.word	64277 @ FB15
+OC_C6S2:
+	.word	25080 @ 61F8
+OC_C2S6:
+	.word	60547 @ EC83
+OC_C5S3:
+	.word	36410 @ 8E3A
+OC_C3S5:
+	.word	54491 @ D4DB
+OC_C4S4:
+	.word	46341 @ B505
+
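The seven .word constants above are the Theora DCT factors in Q16 fixed point: OC_CkS(8-k) is round(2^16 * cos(k*pi/16)), so OC_C4S4 = 46341 is roughly 2^16/sqrt(2), and MUL followed by ASR #16 approximates a multiply by that cosine. A quick self-contained check (illustrative only, compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Prints 64277 60547 54491 46341 36410 25080 12785 for k = 1..7,
       * matching OC_C1S7, OC_C2S6, OC_C3S5, OC_C4S4, OC_C5S3, OC_C6S2, OC_C7S1. */
      const double pi = 3.14159265358979323846;
      int k;
      for (k = 1; k <= 7; k++)
        printf("%d\n", (int)floor(65536.0 * cos(k * pi / 16.0) + 0.5));
      return 0;
    }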
+  .if OC_ARM_ASM_MEDIA
+	.type	idct2_2core_down_v6, %function; idct2_2core_down_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			@ r7  = 8
+	LDR	r6, [r1], #16		@ r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		@ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		@ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	@ r7 = <t[0,7]|t[0,7]>
+@ Stage 2:
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	@ r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		@ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		@ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	@ r2 = <t[1,5]|t[0,5]>
+@ Stage 3:
+	SSUB16	r5, r6, r2		@ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		@ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+@ Stage 4:
+	SADD16	r2, r12,r7		@ r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		@ r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		@ r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		@ r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	@ r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		@ y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	@ r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		@ y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	@ r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		@ y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	@ r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	.size idct2_2core_down_v6, .-idct2_2core_down_v6	@ ENDP
+
+@ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+@  pay for increased branch mis-prediction to get here, but in practice it
+@  doesn't seem to slow anything down to take it out, and it's less code this
+@  way.
+  .if 0
+	.type	oc_idct8x8_6_v6, %function; oc_idct8x8_6_v6: @ PROC
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	@ Align the stack.
+	ADD	r0, r0, r13	@ Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	@ Align the stack.
+	MOV	r0, r8		@ Write to the final destination.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	.size oc_idct8x8_6_v6, .-oc_idct8x8_6_v6	@ ENDP
+
+	.type	idct1core_v6, %function; idct1core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	@ Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	.size idct1core_v6, .-idct1core_v6	@ ENDP
+
+	.type	idct3_2core_v6, %function; idct3_2core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r4, [r1], #16		@ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	@ r10= OC_C6S2; r11= OC_C2S6
+	@ Stall
+	SMULWB	r3, r11,r5		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		@ r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		@ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		@ r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		@ r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		@ r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		@ r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_3core_stage3_v6
+	.size idct3_2core_v6, .-idct3_2core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_3_v6:
+	.word	12785 @ 31F1
+OC_C1S7_3_v6:
+	.word	64277 @ FB15
+OC_C6S2_3_v6:
+	.word	25080 @ 61F8
+OC_C2S6_3_v6:
+	.word	60547 @ EC83
+
+	.type	idct3_3core_down_v6, %function; idct3_3core_down_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,[r1], #16		@ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		@ r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+@ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	@ r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	@ r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_4core_down_stage3_v6
+	.size idct3_3core_down_v6, .-idct3_3core_down_v6	@ ENDP
+  .endif
+
+	.type	idct4_3core_v6, %function; idct4_3core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		@ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		@ r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		@ r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		@ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_3core_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]=t[0]+t[3]
+	SSUB16	r3, r12,r3		@ r3 = t[3]=t[0]-t[3]
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]
+	STR	r12,[r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]
+	STR	r12,[r0, #12]		@ y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]
+	STR	r12,[r0, #28]		@ y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]
+	STR	r12,[r0, #44]		@ y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct4_3core_v6, .-idct4_3core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_4_v6:
+	.word	12785 @ 31F1
+OC_C1S7_4_v6:
+	.word	64277 @ FB15
+OC_C6S2_4_v6:
+	.word	25080 @ 61F8
+OC_C2S6_4_v6:
+	.word	60547 @ EC83
+OC_C5S3_4_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_4_v6:
+	.word	54491 @ D4DB
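+@ The .word constants above are the Theora iDCT cosines cos(k*pi/16), scaled by
+@  65536 and rounded to 16 bits (e.g. OC_C1S7 = 64277 ~= cos(pi/16)*65536,
+@  OC_C7S1 = 12785 ~= cos(7*pi/16)*65536); the hex in each comment is the same
+@  value.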
+
+	.type	idct4_4core_down_v6, %function; idct4_4core_down_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16	@ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+@ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	@ r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	@ r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_4core_down_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	.size idct4_4core_down_v6, .-idct4_4core_down_v6	@ ENDP
+
+	.type	idct8_8core_v6, %function; idct8_8core_v6: @ PROC
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		@ r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		@ r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		@ r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		@ r8 = t[1,1]=OC_C4S4*r4T>>16
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	.size idct8_8core_v6, .-idct8_8core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_8_v6:
+	.word	12785 @ 31F1
+OC_C1S7_8_v6:
+	.word	64277 @ FB15
+OC_C6S2_8_v6:
+	.word	25080 @ 61F8
+OC_C2S6_8_v6:
+	.word	60547 @ EC83
+OC_C5S3_8_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_8_v6:
+	.word	54491 @ D4DB
+
+	.type	idct8_8core_down_v6, %function; idct8_8core_down_v6: @ PROC
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		@ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		@ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		@ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		@ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		@ r3 = t[3]+8=t[0]-t[3]+8
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		@ y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		@ y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		@ y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	@ r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		@ y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	@ r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		@ y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	@ r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		@ y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	@ r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	.size idct8_8core_down_v6, .-idct8_8core_down_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	oc_idct8x8_1_neon
+	.global	oc_idct8x8_neon
+
+	.balign 16
+OC_IDCT_CONSTS_NEON:
+	.short	    8
+	.short	64277 @ FB15 (C1S7)
+	.short	60547 @ EC83 (C2S6)
+	.short	54491 @ D4DB (C3S5)
+	.short	46341 @ B505 (C4S4)
+	.short	36410 @ 8E3A (C5S3)
+	.short	25080 @ 61F8 (C6S2)
+	.short	12785 @ 31F1 (C7S1)
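+@ The NEON code keeps the same cos(k*pi/16) constants packed into D0/D1 so they
+@  can be used as scalar operands (D0[1]..D1[3]); values >= 0x8000 read back as
+@  negative in VMULL.S16, which the stage code compensates for by adding the
+@  unscaled input back after the narrowing shift (see the "-x[n]" comments).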
+
+	.type	oc_idct8x8_1_neon, %function; oc_idct8x8_1_neon: @ PROC
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
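+	@ DC-only case: replicate _dc into all eight lanes of Q0/Q1 and write it
+	@  to all 64 output coefficients with four 32-byte stores.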
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]
+	MOV	PC, r14
+	.size oc_idct8x8_1_neon, .-oc_idct8x8_1_neon	@ ENDP
+
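+@ Full iDCT entry point: if _last_zzi (the index of the last nonzero
+@  coefficient in zig-zag order) is at most 10, the reduced oc_idct8x8_10_neon
+@  path below is taken; otherwise the full row pass runs on the pre-transposed
+@  input, followed by an in-register 8x8 transpose, the column pass, and the
+@  final +8>>4 rounding via VRSHR.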
+	.type	oc_idct8x8_neon, %function; oc_idct8x8_neon: @ PROC
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon:
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	@ Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2,:128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2,:128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2,:128]!
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2,:128]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3,:128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	@ 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	@ Column transforms
+	BL	oc_idct8x8_stage123_neon
+	@ We have to put the return address back in the LR, or the branch
+	@  predictor will not recognize the function return and mis-predict the
+	@  entire call stack.
+	MOV	r14, r12
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	.size oc_idct8x8_neon, .-oc_idct8x8_neon	@ ENDP
+
+	.type	oc_idct8x8_stage123_neon, %function; oc_idct8x8_stage123_neon: @ PROC
+@ Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	@ Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	@ Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	@ Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	@ Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	@ Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	@ Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	@ Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	@ Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	@ Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	@ Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	@ Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	@ Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	@ Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	@ Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	@ Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	@ Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	@ Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	@ Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	@ Q4 = t[4]=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	@ Q7 = t[7]=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	@ Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	@ Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	@ Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	@ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	@ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
+@ Stage 3
+	VSUB.S16	Q11,Q8, Q3	@ Q11 = t[3]=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	@ Q8  = t[0]=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	@ Q9  = t[1]=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	@ Q3  = t[6]=t[6]+t[5]
+	VSUB.S16	Q10,Q1, Q2	@ Q10 = t[2]=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	@ Q5  = t[5]=t[6]-t[5]
+	MOV	PC, r14
+	.size oc_idct8x8_stage123_neon, .-oc_idct8x8_stage123_neon	@ ENDP
+
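+@ Reduced path for _last_zzi<=10: only the low-frequency corner of the
+@  (pre-transposed) input is loaded, the row pass is followed by an 8x4
+@  transpose, and the input block is zeroed on the way out through the D2
+@  stores.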
+	.type	oc_idct8x8_10_neon, %function; oc_idct8x8_10_neon: @ PROC
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3,:128]
+	MOV	r2, r1
+	@ Row transforms (input is pre-transposed)
+@ Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2,:128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	@ Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2,:64], r12
+	VMULL.S16	Q2, D18,D0[1]	@ Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2,:64]
+	VMULL.S16	Q14,D17,D0[2]	@ Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	@ Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	@ Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	@ Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	@ Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	@ D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	@ D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	@ D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	@ D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	@ D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	@ D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	@ D2 = t[2]
+	VADD.S16	D4, D4, D18	@ D4 = t[7]
+	VADD.S16	D6, D6, D19	@ D6 = t[6]
+	VADD.S16	D7, D7, D19	@ D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	@ D30= t[0]
+					@ D31= t[3]
+@ Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	@ D24= t[7]-t[6]
+					@ D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	@ D26= t[7]=t[7]+t[6]
+					@ D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	@ Q11= OC_C4S4*(t[7]-t[6])
+					@       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	@ Q14= OC_C4S4*(t[4]-t[5])
+					@       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	@ D16= t[0]=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	@ D17= t[2]=t[0]-t[2]
+	VADD.S16	D18,D30,D2	@ D18= t[1]=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	@ D22= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	@ D23= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	@ D19= t[3]=t[0]-t[3]
+	VADD.S16	D22,D22,D24	@ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	@ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	@ D27= t[5]=t[6]-t[5]
+	VADD.S16	D24,D22,D23	@ D24= t[6]=t[6]+t[5]
+@ Stage 4
+	VSUB.S16	Q11,Q8, Q13	@ D22= y[7]=t[0]-t[7]
+					@ D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	@ D20= y[6]=t[1]-t[6]
+					@ D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	@ D16= y[0]=t[0]+t[7]
+					@ D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	@ D18= y[1]=t[1]+t[6]
+					@ D19= y[3]=t[3]'+t[4]''
+	@ 8x4 transpose
+	VTRN.16		Q10,Q11		@ Q10= c5c4a5a4 c7c6a7a6
+					@ Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		@ Q8 = c3c2a3a2 c1c0a1a0
+					@ Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		@ Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		@ Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		@ Q9 = b7b6b5b4 b3b2b1b0
+					@ Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		@ Q8 = a7a6a5a4 a3a2a1a0
+					@ Q10= c7c6c5c4 c3c2c1c0
+	@ Column transforms
+@ Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	@ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	@ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	@  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	@ Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	@ Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	@ Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	@ Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	@ Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	@ Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	@  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	@  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	@ Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	@ Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	@  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	@ Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	@ Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	@ Q15= t[7]=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	@ Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	@ Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	@ Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	@ Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q10= OC_C4S4*(t[4]-t[5])
+					@           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	@ Q12= t[4]=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	@ Q13:Q14= OC_C4S4*(t[7]-t[6])
+					@           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	@ Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	@ Q11= t[0]=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	@ Q3 = t[3]=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	@ Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	@ Q10=t[5]
+	VADD.S16	Q14,Q14,Q9	@ Q14=t[6]
+	VSUB.S16	Q13,Q14,Q10	@ Q13=t[5]=t[6]-t[5]
+	VADD.S16	Q14,Q14,Q10	@ Q14=t[6]=t[6]+t[5]
+	VADD.S16	Q10,Q1, Q2	@ Q10= t[1]=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	@ Q2 = t[2]=t[0]-t[2]
+@ Stage 4
+	VADD.S16	Q8, Q11,Q15	@ Q8  = y[0]=t[0]+t[7]
+	VADD.S16	Q9, Q10,Q14	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q15,Q11,Q15	@ Q15 = y[7]=t[0]-t[7]
+	VSUB.S16	Q14,Q10,Q14	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q10,Q2, Q13	@ Q10 = y[2]=t[2]+t[5]
+	VADD.S16	Q11,Q3, Q12	@ Q11 = y[3]=t[3]+t[4]
+	VSUB.S16	Q12,Q3, Q12	@ Q12 = y[4]=t[3]-t[4]
+	VSUB.S16	Q13,Q2, Q13	@ Q13 = y[5]=t[2]-t[5]
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1,:64]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	.size oc_idct8x8_10_neon, .-oc_idct8x8_10_neon	@ ENDP
+  .endif
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits

+ 1853 - 0
modules/theoraplayer/native/theora/lib/arm/armidct.s

@@ -0,0 +1,1853 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_idct8x8_1_arm
+	EXPORT	oc_idct8x8_arm
+
+oc_idct8x8_1_arm PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	ENDP
+
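+; Dispatch on _last_zzi: blocks with at most 3, 6 or 10 nonzero coefficients
+;  use progressively larger sparse row kernels (idct2/idct3/idct4 cores);
+;  everything else takes the full 8x8 path below.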
+oc_idct8x8_arm PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	; Write to the final destination.
+	SUB	r2, r1, #8*16
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	MOV	r1, r13		; And read from temp storage.
+; Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_10_arm PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_10_arm_cols
+; Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_6_arm PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r2		; Write to the final destination
+; Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
+
+oc_idct8x8_3_arm PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r2		; Write to the final destination
+; Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
+
+idct1core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
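+	; r12 = 0xB505 = 46341 = OC_C4S4, built inline rather than loaded from
+	;  the literal pool.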
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	ENDP
+
+idct2core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]
+	ADD	r11,r11,r9		; r11= t[0]+t[7]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		; y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	; r1 = t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct2core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		; r11= t[0]+t[7]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		; y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct3core_arm PROC
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2] = t[0]-t[2]
+					; r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7]
+	ADD	r5, r10,r5		; r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		; r12= t[2]+t2[5]
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	; r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	; r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	; r12= t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	; r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	ENDP
+
+idct3core_down_arm PROC
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2]+8 = t[0]-t[2]+8
+					; r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		; r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		; r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	; r3 = t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	; r10= t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	; r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	; r9 = t2[3] - t[4]  + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	ENDP
+
+idct4core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		; r11= t[0]+t2[7]
+	ADD	r6, r4, r6		; r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		; r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		; r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	; r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	; r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	; r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	; r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct4core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		; r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		; r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		; r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		; r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	; r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	; r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	; r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	; r7 = t[3]-t2[4]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct8core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; Stage 2
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3]
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2]
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7]
+	ADD	r6, r6, r10		; r6 = t[1] + t[6]
+	ADD	r3, r3, r14		; r3 = t[2] + t[5]
+	ADD	r4, r4, r9		; r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	ENDP
+
+idct8core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+	; Stage 2
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	ADD	r2, r2, #8<<16		; r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		; r6 = t[1]+8<<16
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3] + 8
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2] + 8
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		; r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		; r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		; r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4] + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_idct8x8_1_v6
+	EXPORT	oc_idct8x8_v6
+
+oc_idct8x8_1_v6 PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
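+	; r2 = r3 = <_dc|_dc>, so each STRD below writes four output coefficients
+	;  (two registers, eight bytes); sixteen stores cover the whole block.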
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_v6 PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	;CMP	r2, #6
+	;BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-8*16]!
+	STRD	r4, [r1,#8]
+	STRD	r4, [r1,#16]
+	STRD	r4, [r1,#24]
+	STRD	r4, [r1,#32]
+	STRD	r4, [r1,#40]
+	STRD	r4, [r1,#48]
+	STRD	r4, [r1,#56]
+	STRD	r4, [r1,#64]
+	STRD	r4, [r1,#72]
+	STRD	r4, [r1,#80]
+	STRD	r4, [r1,#88]
+	STRD	r4, [r1,#96]
+	STRD	r4, [r1,#104]
+	STRD	r4, [r1,#112]
+	STRD	r4, [r1,#120]
+	MOV	r1, r13		; And read from temp storage.
+; Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_10_v6 PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	; Align the stack.
+	ADD	r0, r0, r2	; Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-4*16]!
+	STRD	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	; Align the stack.
+	ADD	r1, r1, r13	; And read from temp storage.
+; Column transforms
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_3_v6 PROC
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2_1core_v6
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r8		; Write to the final destination.
+; Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
+idct2_1core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		; r6 = x[1,0]
+	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+; Stage 2:
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]>
+; Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
+	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+; Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+ ]
+
+	ALIGN 8
+OC_C7S1
+	DCD	12785 ; 31F1
+OC_C1S7
+	DCD	64277 ; FB15
+OC_C6S2
+	DCD	25080 ; 61F8
+OC_C2S6
+	DCD	60547 ; EC83
+OC_C5S3
+	DCD	36410 ; 8E3A
+OC_C3S5
+	DCD	54491 ; D4DB
+OC_C4S4
+	DCD	46341 ; B505
+
+ [ OC_ARM_ASM_MEDIA
+idct2_2core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			; r7  = 8
+	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	ENDP
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+;  pay for increased branch mis-prediction to get here, but in practice it
+;  doesn't seem to slow anything down to take it out, and it's less code this
+;  way.
+ [ 0
+oc_idct8x8_6_v6 PROC
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	; Align the stack.
+	ADD	r0, r0, r13	; Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	; Align the stack.
+	MOV	r0, r8		; Write to the final destination.
+	ADD	r1, r1, r13	; And read from temp storage.
+; Column transforms
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	ENDP
+
+idct1core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	; Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	ENDP
+
+idct3_2core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
+	; Stall
+	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_3core_stage3_v6
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_3_v6
+	DCD	12785 ; 31F1
+OC_C1S7_3_v6
+	DCD	64277 ; FB15
+OC_C6S2_3_v6
+	DCD	25080 ; 61F8
+OC_C2S6_3_v6
+	DCD	60547 ; EC83
+
+idct3_3core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_4core_down_stage3_v6
+	ENDP
+ ]
+
+idct4_3core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
+	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]
+	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]
+	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]
+	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]
+	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
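For readers following the t[] bookkeeping in the comments, the four stages above are the usual Theora 1-D inverse transform. A scalar C sketch of the same butterfly (one 8-sample vector; the real code interleaves two rows per pass, stores the row pass transposed, and adds the +8>>4 rounding in the column pass, all omitted here):

    /* Scalar sketch of the 4-stage 8-point transform the stage comments
       describe.  M() is the SMULW-style 0.16 multiply; constants as in the
       OC_CxSy tables above.  Illustrative only. */
    #include <stdint.h>
    #define M(c,x) ((int)(((int64_t)(c)*(x))>>16))

    static void idct8_sketch(int16_t y[8], const int16_t x[8]){
      int t0,t1,t2,t3,t4,t5,t6,t7,r;
      /* Stage 1 */
      t0=M(46341,x[0]+x[4]);            t1=M(46341,x[0]-x[4]);
      t2=M(25080,x[2])-M(60547,x[6]);   t3=M(60547,x[2])+M(25080,x[6]);
      t4=M(12785,x[1])-M(64277,x[7]);   t7=M(64277,x[1])+M(12785,x[7]);
      t5=M(54491,x[5])-M(36410,x[3]);   t6=M(36410,x[5])+M(54491,x[3]);
      /* Stage 2 */
      r=t4+t5; t5=M(46341,t4-t5); t4=r;
      r=t7+t6; t6=M(46341,t7-t6); t7=r;
      /* Stage 3 */
      r=t0+t3; t3=t0-t3; t0=r;
      r=t1+t2; t2=t1-t2; t1=r;
      r=t6+t5; t5=t6-t5; t6=r;
      /* Stage 4 */
      y[0]=(int16_t)(t0+t7); y[1]=(int16_t)(t1+t6);
      y[2]=(int16_t)(t2+t5); y[3]=(int16_t)(t3+t4);
      y[4]=(int16_t)(t3-t4); y[5]=(int16_t)(t2-t5);
      y[6]=(int16_t)(t1-t6); y[7]=(int16_t)(t0-t7);
    }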
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_4_v6
+	DCD	12785 ; 31F1
+OC_C1S7_4_v6
+	DCD	64277 ; FB15
+OC_C6S2_4_v6
+	DCD	25080 ; 61F8
+OC_C2S6_4_v6
+	DCD	60547 ; EC83
+OC_C5S3_4_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_4_v6
+	DCD	54491 ; D4DB
+
+idct4_4core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16	; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	ENDP
+
+idct8_8core_v6 PROC
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_8_v6
+	DCD	12785 ; 31F1
+OC_C1S7_8_v6
+	DCD	64277 ; FB15
+OC_C6S2_8_v6
+	DCD	25080 ; 61F8
+OC_C2S6_8_v6
+	DCD	60547 ; EC83
+OC_C5S3_8_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_8_v6
+	DCD	54491 ; D4DB
+
+idct8_8core_down_v6 PROC
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_1_neon
+	EXPORT	oc_idct8x8_neon
+
+	ALIGN 16
+OC_IDCT_CONSTS_NEON
+	DCW	    8
+	DCW	64277 ; FB15 (C1S7)
+	DCW	60547 ; EC83 (C2S6)
+	DCW	54491 ; D4DB (C3S5)
+	DCW	46341 ; B505 (C4S4)
+	DCW	36410 ; 8E3A (C5S3)
+	DCW	25080 ; 61F8 (C6S2)
+	DCW	12785 ; 31F1 (C7S1)
+
+oc_idct8x8_1_neon PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]
+	MOV	PC, r14
+	ENDP
+
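oc_idct8x8_1_neon covers the DC-only case: when only x[0] is non-zero the whole 8x8 output is a constant, so the routine just duplicates _dc into a Q register and stores it across all 64 coefficients. A trivial scalar equivalent, for reference:

    /* Scalar equivalent of the DC-only path: with only x[0] non-zero the
       output block is constant, so just replicate the caller-supplied dc. */
    #include <stdint.h>
    static void idct8x8_dc_sketch(int16_t y[64], uint16_t dc){
      for(int i=0;i<64;i++) y[i]=(int16_t)dc;
    }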
+oc_idct8x8_neon PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	; Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2@128]
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3@128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	; 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	; Column transforms
+	BL	oc_idct8x8_stage123_neon
+	; We have to put the return address back in the LR, or the branch
+	;  predictor will not recognize the function return and mis-predict the
+	;  entire call stack.
+	MOV	r14, r12
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_stage123_neon PROC
+; Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
+; Stage 3
+	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
+	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_10_neon PROC
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3@128]
+	MOV	r2, r1
+	; Row transforms (input is pre-transposed)
+; Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2@128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2@64], r12
+	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2@64]
+	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
+	VADD.S16	D4, D4, D18	; D4 = t[7]
+	VADD.S16	D6, D6, D19	; D6 = t[6]
+	VADD.S16	D7, D7, D19	; D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	; D30= t[0]
+					; D31= t[3]
+; Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
+					; D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
+					; D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
+					;       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
+					;       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
+	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
+	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
+	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
+; Stage 4
+	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
+					; D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]'
+					; D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
+					; D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'+t[6]'
+					; D19= y[3]=t[3]'+t[4]''
+	; 8x4 transpose
+	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
+					; Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
+					; Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
+					; Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
+					; Q10= c7c6c5c4 c3c2c1c0
+	; Column transforms
+; Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	;  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	;  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	;  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	;  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
+					;           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
+					;           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
+	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
+	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
+	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
+	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
+; Stage 4
+	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
+	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
+	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
+	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
+	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
+	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1@64]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	ENDP
+ ]
+
+	END

+ 126 - 0
modules/theoraplayer/native/theora/lib/arm/armint.h

@@ -0,0 +1,126 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+#  if defined(__ARMEB__)
+#   error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+#  endif
+
+#  define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+#  define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+  ((oc_loop_filter_frag_rows_arm_func) \
+   (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+   (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+   (_bv), \
+   (_state)->frags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset, \
+   (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+   (_state)->frag_buf_offs, \
+   (_state)->fplanes[(_pli)].nhfrags)
+/*For everything else the default vtable macros are fine.*/
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#  if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+#   if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#    if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 677 - 0
modules/theoraplayer/native/theora/lib/arm/armloop.asm

@@ -0,0 +1,677 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.include "armopts-gnu.S"
+
+	.global	oc_loop_filter_frag_rows_arm
+
+@ Which bit this is depends on the order of packing within a bitfield.
+@ Hopefully that doesn't change among any of the relevant compilers.
+ .set OC_FRAG_CODED_FLAG,	1
+
+	@ Vanilla ARM v4 version
+	.type	loop_filter_h_arm, %function; loop_filter_h_arm: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp:
+	LDRB	r3, [r0, #-2]		@ r3 = _pix[0]
+	LDRB	r12,[r0, #1]		@ r12= _pix[3]
+	LDRB	r4, [r0, #-1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	.size loop_filter_h_arm, .-loop_filter_h_arm	@ ENDP
+
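For reference, the per-edge work done by loop_filter_h_arm written out in plain C (a sketch: bv is the caller's 256-entry lflim() table, which oc_loop_filter_frag_rows_arm biases by +127 so signed filter values can index it, and pix points just right of the edge, so the asm's _pix[0..3] are pix[-2..1] here):

    /* Scalar sketch of one call to loop_filter_h_arm (8 rows across a
       vertical edge).  bv must be indexable with negative R, i.e. it is the
       biased table pointer. */
    #include <stdint.h>
    static uint8_t clamp255(int v){ return (uint8_t)(v<0?0:v>255?255:v); }

    static void filter_h_sketch(uint8_t *pix,int ystride,const int8_t *bv){
      for(int y=0;y<8;y++){
        int r=(pix[-2]-pix[1]+3*(pix[0]-pix[-1])+4)>>3;  /* R, in [-127,128] */
        int f=bv[r];                                     /* f = lflim(R,L)   */
        pix[-1]=clamp255(pix[-1]+f);
        pix[ 0]=clamp255(pix[ 0]-f);
        pix+=ystride;
      }
    }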
+	.type	loop_filter_v_arm, %function; loop_filter_v_arm: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp:
+	LDRB	r3, [r0, -r1, LSL #1]	@ r3 = _pix[0]
+	LDRB	r12,[r0, r1]		@ r12= _pix[3]
+	LDRB	r4, [r0, -r1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	.size loop_filter_v_arm, .-loop_filter_v_arm	@ ENDP
+
+	.type	oc_loop_filter_frag_rows_arm, %function; oc_loop_filter_frag_rows_arm: @ PROC
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	@ _bv += 127
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_arm_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	.size oc_loop_filter_frag_rows_arm, .-oc_loop_filter_frag_rows_arm	@ ENDP
+
+  .if OC_ARM_ASM_MEDIA
+	.global	oc_loop_filter_init_v6
+	.global	oc_loop_filter_frag_rows_v6
+
+	.type	oc_loop_filter_init_v6, %function; oc_loop_filter_init_v6: @ PROC
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		@ r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		@ r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	@ r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	@ r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	.size oc_loop_filter_init_v6, .-oc_loop_filter_init_v6	@ ENDP
+
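oc_loop_filter_init_v6 stores 255-2*L replicated into four bytes; the filter cores then recover lflim() purely from saturating byte adds and subtracts, as the comments further down note. A scalar sketch of that identity (assuming 0 <= L <= 127 and |R| <= 255; illustrative only):

    /* Sketch of the UQADD8/UQSUB8 trick used by the v6 cores: with
       ll = 255-2*L (the per-byte value stored above), saturating byte
       arithmetic gives lflim(|R|,L) = min(|R|, max(2*L-|R|, 0)). */
    #include <stdint.h>
    static uint8_t uqadd8(unsigned a,unsigned b){unsigned s=a+b; return (uint8_t)(s>255?255:s);}
    static uint8_t uqsub8(unsigned a,unsigned b){return (uint8_t)(a>b?a-b:0);}

    static int lflim_abs(int absR,int L){
      uint8_t ll=(uint8_t)(255-2*L);              /* what the init routine stores */
      uint8_t t =uqadd8((unsigned)absR,ll);       /* 255 - max(2*L-|R|, 0)        */
      return uqsub8(uqadd8((unsigned)absR,t),t);  /* min(|R|, max(2*L-|R|, 0))    */
    }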
+@ We could use the same strategy as the v filter below, but that would require
+@  40 instructions to load the data and transpose it into columns and another
+@  32 to write out the results at the end, plus the 52 instructions to do the
+@  filtering itself.
+@ This is slightly less, and less code, even assuming we could have shared the
+@  52 instructions in the middle with the other function.
+@ It executes slightly fewer instructions than the ARMv6 approach David Conrad
+@  proposed for FFmpeg, but not by much:
+@  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+@ His is a lot less code, though, because it only does two rows at once instead
+@  of four.
+	.type	loop_filter_h_v6, %function; loop_filter_h_v6: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	.size loop_filter_h_v6, .-loop_filter_h_v6	@ ENDP
+
+	.type	loop_filter_h_core_v6, %function; loop_filter_h_core_v6: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ r12= 0x10003
+	@ Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		@ r4 = <p3|p2|p1|p0>
+	@ Single issue
+	LDR	r5,[r0, r1]!		@ r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		@ r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		@ r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	@ r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	@ r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		@ r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		@ r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		@ r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		@ r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	@ r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		@ r6 = <r0|r2>
+	UXTB16	r11,r11			@ r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		@ r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	@ r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		@ r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		@ r7 = <r3-r0|r1-r2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	@ r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	@ r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	@ r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	@ r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			@ r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	@ r6 = <-R_s|-R_q|-R_r|-R_p>
+	@ Single issue
+	@ There's no min, max or abs instruction.
+	@ SSUB8 and SEL will work for abs, and we can do all the rest with
+	@  unsigned saturated adds, which means the GE flags are still all
+	@  set when we're done computing lflim(abs(R_i),L).
+	@ This allows us to both add and subtract, and split the results by
+	@  the original sign of R_i.
+	SSUB8	r7, r10,r6
+	@ Single issue
+	SEL	r7, r7, r6		@ r7 = abs(R_i)
+	@ Single issue
+	UQADD8	r4, r7, r2		@ r4 = 255-max(2*L-abs(R_i),0)
+	@ Single issue
+	UQADD8	r7, r7, r4
+	@ Single issue
+	UQSUB8	r7, r7, r4		@ r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	@ Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		@ r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		@ r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		@ r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		@ r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		@ r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		@ r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		@ r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		@ r4 = q1
+	STRB	r4, [r0,#-1]
+	@ Single issue
+	STRB	r9, [r0,-r1]!
+	@ Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	.size loop_filter_h_core_v6, .-loop_filter_h_core_v6	@ ENDP
+
+@ This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+@  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+@ This works just as well, with the following procedure for computing the
+@  filter value, f:
+@   u = ~UHADD8(p1,~p2);
+@   v = UHADD8(~p1,p2);
+@   m = v-u;
+@   a = m^UHADD8(m^p0,m^~p3);
+@   f = UHADD8(UHADD8(a,u1),v1);
+@  where f = 127+R, with R in [-127,128] defined as in the spec.
+@ This is exactly the same amount of arithmetic as the version that uses PAVGB
+@  as the basic operator.
+@ It executes about 2/3 the number of instructions of David Conrad's approach,
+@  but requires more code, because it does all eight columns at once, instead
+@  of four at a time.
+	.type	loop_filter_v_v6, %function; loop_filter_v_v6: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		@ r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		@ r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		@ r9, r8 = <p6|p2>
+	MVN	r14,r6			@ r14= ~p1
+	LDRD	r10,[r0, r1]		@ r11,r10= <p7|p3>
+	@ Filter the first four columns.
+	MVN	r12,r8			@ r12= ~p2
+	UHADD8	r14,r14,r8		@ r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		@ r12= p1+~p2>>1
+	MVN	r10, r10		@ r10=~p3
+	MVN	r12,r12			@ r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		@ r14= m1=v1-u1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = m1^p0
+	EOR	r10,r10,r14		@ r10= m1^~p3
+	UHADD8	r4, r4, r10		@ r4 = (m1^p0)+(m1^~p3)>>1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		@ r14= v1=m1+u1
+	UHADD8	r4, r4, r12		@ r4 = a1+u1>>1
+	MVN	r12,r9			@ r12= ~p6
+	UHADD8	r4, r4, r14		@ r4 = f1=(a1+u1>>1)+v1>>1
+	@ Filter the second four columns.
+	MVN	r14,r7			@ r14= ~p5
+	UHADD8	r12,r12,r7		@ r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		@ r14= v2=~p5+p6>>1
+	MVN	r12,r12			@ r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			@ r11=~p7
+	SSUB8	r10,r14,r12		@ r10= m2=v2-u2
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = m2^p4
+	EOR	r11,r11,r10		@ r11= m2^~p7
+	UHADD8	r5, r5, r11		@ r5 = (m2^p4)+(m2^~p7)>>1
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	@ Single issue
+	UHADD8	r5, r5, r12		@ r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		@ r12 = {127}x4
+	UHADD8	r5, r5, r14		@ r5 = f2=(a2+u2>>1)+v2>>1
+	@ Now split f[i] by sign.
+	@ There's no min or max instruction.
+	@ We could use SSUB8 and SEL, but this is just as many instructions and
+	@  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		@ r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		@ r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		@ r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		@ r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		@ r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		@ r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		@ r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		@ r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		@ r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		@ r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		@ r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		@ r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		@ r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		@ r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		@ r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		@ [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		@ r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		@ [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	.size loop_filter_v_v6, .-loop_filter_v_v6	@ ENDP
+
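The comment block above loop_filter_v_v6 asserts a UHADD8-based identity for f = 127+R (u1/v1 there are the u and v of the first column group). A scalar brute-force checker for that claim, modelling UHADD8 as the truncating byte average and SSUB8/EOR as wrapping byte ops (illustrative; exhaustive over all four bytes, so it runs for a minute or two):

    /* Brute-force check of the identity claimed above: with per-byte ops,
       is uhadd8(uhadd8(a,u),v) == 127 + ((p0-p3+3*(p2-p1)+4)>>3) ? */
    #include <stdint.h>
    #include <stdio.h>
    static uint8_t uhadd8(uint8_t a,uint8_t b){ return (uint8_t)((a+b)>>1); }

    int main(void){
      long bad=0;
      for(int p0=0;p0<256;p0++)for(int p1=0;p1<256;p1++)
      for(int p2=0;p2<256;p2++)for(int p3=0;p3<256;p3++){
        uint8_t u=(uint8_t)~uhadd8((uint8_t)p1,(uint8_t)~p2);
        uint8_t v=uhadd8((uint8_t)~p1,(uint8_t)p2);
        uint8_t m=(uint8_t)(v-u);
        uint8_t a=(uint8_t)(m^uhadd8((uint8_t)(m^p0),(uint8_t)(m^~p3)));
        uint8_t f=uhadd8(uhadd8(a,u),v);
        int     r=(p0-p3+3*(p2-p1)+4)>>3;
        if(f!=(uint8_t)(127+r))bad++;
      }
      printf("mismatches: %ld\n",bad);
      return 0;
    }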
+	.type	oc_loop_filter_frag_rows_v6, %function; oc_loop_filter_frag_rows_v6: @ PROC
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	@ ll = *(int *)_bv
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_v6_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		@ if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	.size oc_loop_filter_frag_rows_v6, .-oc_loop_filter_frag_rows_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	oc_loop_filter_init_neon
+	.global	oc_loop_filter_frag_rows_neon
+
+	.type	oc_loop_filter_init_neon, %function; oc_loop_filter_init_neon: @ PROC
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  @ r1 = 2*L
+	VDUP.S16	Q15, r1		@ Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0,:128]
+	MOV	PC,r14
+	.size oc_loop_filter_init_neon, .-oc_loop_filter_init_neon	@ ENDP
+
+	.type	loop_filter_h_neon, %function; loop_filter_h_neon: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	@ Doing a 2-element structure load saves doing two VTRN's below, at the
+	@  cost of using two more slower single-lane loads vs. the faster
+	@  all-lane loads.
+	@ It's less code this way, though, and benches a hair faster, but it
+	@  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		@ D0 = ____________1100     2,1
+						@ D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		@ D4 = ____________5544     2,1
+						@ D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		@ D0 = ________99881100     3,1
+						@ D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		@ D4 = ________DDCC5544     3,1
+						@ D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		@ D0 = ____GGHH99881100     3,1
+						@ D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		@ D4 = ____KKLLDDCC5544     3,1
+						@ D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		@ D0 = PPOOGGHH99881100     3,1
+						@ D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		@ D4 = TTSSKKLLDDCC5544     3,1
+						@ D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	@ D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	@ D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	@ Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   @ Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we need to negate the negative
+	@ elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 @ Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 @ Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		@ D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		@ D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		@ D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	.size loop_filter_h_neon, .-loop_filter_h_neon	@ ENDP
+
+	.type	loop_filter_v_neon, %function; loop_filter_v_neon: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12,:64], r1		@ D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12,:64], r1		@ D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12,:64], r1		@ D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12,:64]			@ D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	@ Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	@ Stall x2
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   @ Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	@ Stall x2
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we need to negate the negative
+	@ elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	@ Stall x3
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	@ Stall x2
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 @ Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 @ Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		@ D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		@ D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12,:64], r1
+	VST1.64	{D4}, [r12,:64], r1
+	MOV	PC,r14
+	.size loop_filter_v_neon, .-loop_filter_v_neon	@ ENDP
+
+	.type	oc_loop_filter_frag_rows_neon, %function; oc_loop_filter_frag_rows_neon: @ PROC
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	@		  bail
+	VLD1.64	{D30,D31}, [r2,:128]	@ Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_neon_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	.size oc_loop_filter_frag_rows_neon, .-oc_loop_filter_frag_rows_neon	@ ENDP
+  .endif
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits
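
Aside: the clamp derivation repeated in the loop_filter_h_neon/loop_filter_v_neon comments above is easier to check in scalar form. Below is a minimal C sketch of the same identity (illustrative names, not code from the library); the brute-force main verifies that the clamped form and the sign-split form agree, which is what the chain of rewrites in the comments claims.

    #include <assert.h>
    #include <stdlib.h>   /* abs */

    /* Literal form of the first line of the comment:
       f is clamped to the range [MIN(-2L-f,0), MAX(2L-f,0)]. */
    static int lflim_clamp(int f, int L){
      int lo = -2*L - f; if(lo > 0) lo = 0;
      int hi =  2*L - f; if(hi < 0) hi = 0;
      return f < lo ? lo : f > hi ? hi : f;
    }

    /* The reduced form the vector code computes: split on the sign of f,
       take MIN(|f|, MAX(2L-|f|, 0)), then restore the sign. */
    static int lflim_split(int f, int L){
      int a = abs(f);
      int m = 2*L - a; if(m < 0) m = 0;
      if(a > m) a = m;
      return f < 0 ? -a : a;
    }

    int main(void){
      int f, L;
      for(L = 0; L <= 64; L++)for(f = -256; f <= 256; f++)
        assert(lflim_clamp(f, L) == lflim_split(f, L));
      return 0;
    }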

+ 676 - 0
modules/theoraplayer/native/theora/lib/arm/armloop.s

@@ -0,0 +1,676 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG	*	1
+
+	; Vanilla ARM v4 version
+loop_filter_h_arm PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+loop_filter_v_arm PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_arm PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_loop_filter_init_v6
+	EXPORT	oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6 PROC
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	ENDP
+
+; We could use the same strategy as the v filter below, but that would require
+;  40 instructions to load the data and transpose it into columns and another
+;  32 to write out the results at the end, plus the 52 instructions to do the
+;  filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+;  52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+;  proposed for FFmpeg, but not by much:
+;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+;  of four.
+loop_filter_h_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+loop_filter_h_core_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	;  unsigned saturated adds, which means the GE flags are still all
+	;  set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	;  the original sign of R_i.
+	SSUB8	r7, r10,r6
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	ENDP
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+;  filter value, f:
+;   u = ~UHADD8(p1,~p2);
+;   v = UHADD8(~p1,p2);
+;   m = v-u;
+;   a = m^UHADD8(m^p0,m^~p3);
+;   f = UHADD8(UHADD8(a,u1),v1);
+;  where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+;  as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+;  but requires more code, because it does all eight columns at once, instead
+;  of four at a time.
+loop_filter_v_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	;  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_v6 PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		; if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_loop_filter_init_neon
+	EXPORT	oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon PROC
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  ; r1 = 2*L
+	VDUP.S16	Q15, r1		; Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0@128]
+	MOV	PC,r14
+	ENDP
+
+loop_filter_h_neon PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of using two more slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		; D0 = ____________1100     2,1
+						; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		; D4 = ____________5544     2,1
+						; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		; D0 = ________99881100     3,1
+						; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		; D4 = ________DDCC5544     3,1
+						; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		; D0 = ____GGHH99881100     3,1
+						; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		; D4 = ____KKLLDDCC5544     3,1
+						; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		; D0 = PPOOGGHH99881100     3,1
+						; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		; D4 = TTSSKKLLDDCC5544     3,1
+						; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   ; Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		; D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	ENDP
+
+loop_filter_v_neon PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12@64], r1		; D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12@64], r1		; D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12@64], r1		; D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12@64]			; D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12@64], r1
+	VST1.64	{D4}, [r12@64], r1
+	MOV	PC,r14
+	ENDP
+
+oc_loop_filter_frag_rows_neon PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2@128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+	END
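
The strategy comment ahead of loop_filter_v_v6 above builds the whole filter out of UHADD8, the ARMv6 per-byte halving add, and points out that it computes (a+b)>>1 per lane where x86 PAVGB would compute (a+b+1)>>1. A scalar C model of that primitive (a sketch, not library code), using the carry-free identity a+b = (a^b) + 2*(a&b):

    #include <assert.h>
    #include <stdint.h>

    /* Per-byte (a+b)>>1 across all four lanes of a 32-bit word, with no
       carries leaking between lanes. */
    static uint32_t uhadd8(uint32_t a, uint32_t b){
      return (a & b) + (((a ^ b) >> 1) & 0x7F7F7F7FU);
    }

    int main(void){
      /* 0x05 FF 02 03 and 0x03 01 02 05 average lane-wise to 0x04 80 02 04. */
      assert(uhadd8(0x05FF0203U, 0x03010205U) == 0x04800204U);
      return 0;
    }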

+ 39 - 0
modules/theoraplayer/native/theora/lib/arm/armopts-gnu.s

@@ -0,0 +1,39 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+@********************************************************************
+
+@ Set the following to 1 if we have EDSP instructions
+@  (LDRD/STRD, etc., ARMv5E and later).
+ .set OC_ARM_ASM_EDSP,	1
+
+@ Set the following to 1 if we have ARMv6 media instructions.
+ .set OC_ARM_ASM_MEDIA,	1
+
+@ Set the following to 1 if we have NEON (some ARMv7)
+ .set OC_ARM_ASM_NEON,	1
+
+@ Set the following to 1 if LDR/STR can work on unaligned addresses
+@ This is assumed to be true for ARMv6 and later code
+ .set OC_ARM_CAN_UNALIGN,	1
+
+@ Large unaligned loads and stores are often configured to cause an exception.
+@ They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+@  boundary, so it's usually a bad idea to use them anyway if they can be
+@  avoided.
+
+@ Set the following to 1 if LDRD/STRD can work on unaligned addresses
+ .set OC_ARM_CAN_UNALIGN_LDRD,	0
+
+@ END:

+ 39 - 0
modules/theoraplayer/native/theora/lib/arm/armopts.s

@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	0
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	0
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	0
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
+
+	END

+ 39 - 0
modules/theoraplayer/native/theora/lib/arm/armopts.s.in

@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	@HAVE_ARM_ASM_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	@HAVE_ARM_ASM_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	@HAVE_ARM_ASM_NEON@
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
+
+	END

+ 219 - 0
modules/theoraplayer/native/theora/lib/arm/armstate.c

@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+#  endif
+# endif
+
+#endif
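
One detail of the DC-only path shared by the three oc_state_frag_recon_* variants above: because shifts bind more loosely than addition, _dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5 parses as (coeff*quant+15)>>5, i.e. the product divided by 32 and rounded to nearest, with exact halves toward minus infinity. A throwaway check (not library code; like the surrounding sources it assumes an arithmetic right shift of negative values):

    #include <assert.h>

    /* (x*q + 15) >> 5 == floor((x*q + 15)/32). */
    static int dc_round(int x, int q){
      return (x*q + 15) >> 5;
    }

    int main(void){
      assert(dc_round( 1, 17) ==  1);  /*  17/32 ~  0.53 ->  1 */
      assert(dc_round(-1, 17) == -1);  /* -17/32 ~ -0.53 -> -1 */
      assert(dc_round( 1, 16) ==  0);  /*  16/32 =  0.5  ->  0 (half rounds down) */
      return 0;
    }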

+ 236 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armbits.asm

@@ -0,0 +1,236 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@
+@ function:
+@   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+@
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global _oc_pack_read_arm
+	.global _oc_pack_read1_arm
+	.global _oc_huff_token_decode_arm
+
+	@ .type oc_pack_read1_arm, %function; oc_pack_read1_arm: @ PROC
+_oc_pack_read1_arm:
+	@ r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,#1          @ r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      @ r0 = window>>31
+	MOV r2,r2,LSL #1       @ r2 = window<<=1
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+	@ .size oc_pack_read1_arm, .-oc_pack_read1_arm	@ ENDP
+
+	@ .type oc_pack_read_arm, %function; oc_pack_read_arm: @ PROC
+_oc_pack_read_arm:
+	@ r0 = oc_pack_buf *_b
+	@ r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,r1          @ r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+
+@ We need to refill window.
+oc_pack_read1_refill:
+	MOV r1,#1
+oc_pack_read_refill:
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     @ r10 = stop
+	                       @ r11 = ptr
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	RSB r3,r3,r0           @ r3 = 32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
+	CMP r10,r11            @ ptr<stop => HI
+	CMPHI r3,#7            @   available<=24 => HI
+	LDRBHI r14,[r11],#1    @     r14 = *ptr++
+	SUBHI r3,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @     r2 = window|=r14<<32-available
+	CMPHI r10,r11          @     ptr<stop => HI
+	CMPHI r3,#7            @       available<=24 => HI
+	LDRBHI r14,[r11],#1    @         r14 = *ptr++
+	SUBHI r3,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @         r2 = window|=r14<<32-available
+	CMPHI r10,r11          @         ptr<stop => HI
+	CMPHI r3,#7            @           available<=24 => HI
+	LDRBHI r14,[r11],#1    @             r14 = *ptr++
+	SUBHI r3,#8            @             available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @             r2 = window|=r14<<32-available
+	CMPHI r10,r11          @             ptr<stop => HI
+	CMPHI r3,#7            @               available<=24 => HI
+	LDRBHI r14,[r11],#1    @                 r14 = *ptr++
+	SUBHI r3,#8            @                 available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          @ r3 = available-=_bits, available<_bits => LT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+@ Either we wanted to read more than 24 bits and didn't have enough room to
+@  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last:
+	CMP r11,r10            @ ptr<stop => LO
+@ If we didn't hit the end of the packet, then pull enough of the next byte to
+@  to fill up the window.
+	LDRBLO r14,[r11]       @ (LO) r14 = *ptr
+@ Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           @ (HS) r14 = 1
+	ADDLO r10,r3,r1        @ (LO) r10 = available
+	STRHS r14,[r12,#8]     @ (HS) eof = 1
+	ANDLO r10,r10,#7       @ (LO) r10 = available&7
+	MOVHS r3,#1<<30        @ (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   @ (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+	@ .size oc_pack_read_arm, .-oc_pack_read_arm	@ ENDP
+
+
+
+	@ .type oc_huff_token_decode_arm, %function; oc_huff_token_decode_arm: @ PROC
+_oc_huff_token_decode_arm:
+	@ r0 = oc_pack_buf       *_b
+	@ r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         @ r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       @ r2 = stop
+	@ Stall...             ; r3 = ptr
+	@ Stall...             ; r4 = window
+	                       @ r5 = available
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  @ r14 = _tree+bits
+	LDRSH r12,[r14,#2]     @ r12 = node=_tree[1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+@ The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue:
+	ADD r12,r1,r12,LSL #1  @ r12 = _tree+node
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r5,r10          @ r5 = available-=n
+	LDRSH r10,[r12],#2     @ r10 = n=_tree[node]
+	@ Stall...             ; r12 = _tree+node+1
+	@ Stall...
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0:
+	ADD r12,r1,#2          @ r12 = _tree+1
+oc_huff_token_decode_refill:
+@ We can't possibly need more than 15 bits, so available must be <= 15.
+@ Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              @ ptr<stop => HI
+	LDRBHI r14,[r3],#1     @   r14 = *ptr++
+	RSBHI r5,r5,#24        @ (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        @ (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    @   r4 = window|=r14<<32-available
+	CMPHI r2,r3            @   ptr<stop => HI
+	LDRBHI r14,[r3],#1     @     r14 = *ptr++
+	SUBHI r5,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @     r4 = window|=r14<<32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
+	CMPHI r2,r3            @     ptr<stop => HI
+	CMPHI r5,#7            @       available<=24 => HI
+	LDRBHI r14,[r3],#1     @         r14 = *ptr++
+	SUBHI r5,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @         r4 = window|=r14<<32-available
+	CMP r2,r3              @ ptr<stop => HI
+	MOVLS r5,#-1<<30       @ (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            @ (HI) available<=24 => HI
+	LDRBHI r14,[r3],#1     @ (HI)   r14 = *ptr++
+	SUBHI r5,#8            @ (HI)   available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @ (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          @ r5 = available
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node0x255
+	AND r0,r14,#255        @ r0 = node&255
+	@ .size oc_huff_token_decode_arm, .-oc_huff_token_decode_arm	@ ENDP
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif
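
The comments in _oc_pack_read_arm and _oc_pack_read1_arm above describe a left-aligned 32-bit bit window: bits are consumed from the top, and the window is refilled a byte at a time while fewer than 25 bits remain and the packet still has data; _oc_huff_token_decode_arm walks its tree against the same window. A compact C sketch of that scheme with illustrative names (it deliberately omits the end-of-packet / OC_LOTS_OF_BITS handling the assembly performs):

    #include <stdint.h>

    typedef struct{
      const unsigned char *stop;    /* one past the last byte of the packet */
      const unsigned char *ptr;     /* next unread byte                     */
      uint32_t             window;  /* upcoming bits, left-aligned          */
      int                  available;
    }bit_window;

    static void bw_refill(bit_window *b){
      while(b->ptr < b->stop && b->available <= 24){
        b->available += 8;
        b->window |= (uint32_t)*b->ptr++ << (32 - b->available);
      }
    }

    /* Read 1 <= n <= 24 bits from the top of the window. */
    static uint32_t bw_read(bit_window *b, int n){
      uint32_t v;
      if(b->available < n) bw_refill(b);
      v = b->window >> (32 - n);
      b->window <<= n;
      b->available -= n;
      return v;
    }

    int main(void){
      static const unsigned char pkt[] = {0xA5, 0x3C};
      bit_window b = {pkt + sizeof pkt, pkt, 0, 0};
      return bw_read(&b, 4) == 0xA && bw_read(&b, 8) == 0x53 ? 0 : 1;
    }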

+ 32 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armbits.h

@@ -0,0 +1,32 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_pack_read oc_pack_read_arm
+#  define oc_pack_read1 oc_pack_read1_arm
+#  define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
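
Since the header above remaps the generic names with #define when OC_ARM_ASM is set, the switch to the ARM bitpack routines happens in the preprocessor rather than through a function-pointer table. A hypothetical caller (assuming the libtheora tree on the include path; not from the library):

    #include "armbits.h"

    /* With OC_ARM_ASM defined, this compiles as a call to oc_pack_read1_arm(). */
    static int read_coded_flag(oc_pack_buf *_b){
      return oc_pack_read1(_b);
    }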

+ 127 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.c

@@ -0,0 +1,127 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+  last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_MEDIA)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  flags=0;
+  /*MSVC has no inline __asm support for ARM, but it does let you __emit
+     instructions via their assembled hex code.
+    All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OC_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OC_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OC_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t  flags;
+  FILE         *fin;
+  flags=0;
+  /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+     Android.
+    This also means that detection will fail in Scratchbox.*/
+  fin=fopen("/proc/cpuinfo","r");
+  if(fin!=NULL){
+    /*512 should be enough for anybody (it's even enough for all the flags that
+       x86 has accumulated... so far).*/
+    char buf[512];
+    while(fgets(buf,511,fin)!=NULL){
+      if(memcmp(buf,"Features",8)==0){
+        char *p;
+        p=strstr(buf," edsp");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+        p=strstr(buf," neon");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+      }
+      if(memcmp(buf,"CPU architecture:",17)==0){
+        int version;
+        version=atoi(buf+17);
+        if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+      }
+    }
+    fclose(fin);
+  }
+  return flags;
+}
+
+#elif defined(_IOS)
+
+ogg_uint32_t oc_cpu_flags_get(void){
+	ogg_uint32_t flags;
+	flags=0;
+	flags|=OC_CPU_ARM_EDSP;
+	flags|=OC_CPU_ARM_MEDIA;
+	flags|=OC_CPU_ARM_NEON;
+	return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+   accessible in privileged modes only, so we can't have a general user-space
+   detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform.  Reconfigure with --disable-asm (or send patches)."
+#endif
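
The " edsp" and " neon" lookups in the Linux path above deliberately include a leading space and then require the match to end in a space or newline, so only whole words in the Features line count. A small standalone demo of that check (the sample cpuinfo line is made up for illustration):

    #include <stdio.h>
    #include <string.h>

    int main(void){
      /*A made-up /proc/cpuinfo "Features" line, only to exercise the check.*/
      static const char buf[]=
       "Features\t: swp half thumb fastmult vfp edsp neon vfpv3\n";
      const char *p;
      p=strstr(buf," neon");
      if(p!=NULL&&(p[5]==' '||p[5]=='\n'))printf("neon reported\n");
      p=strstr(buf," edsp");
      if(p!=NULL&&(p[5]==' '||p[5]=='\n'))printf("edsp reported\n");
      return 0;
    }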

+ 29 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.h

@@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA    (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP     (1<<7)
+#define OC_CPU_ARM_NEON     (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif

+ 57 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armenc.c

@@ -0,0 +1,57 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add EDSP functions here.*/
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add Media functions here.*/
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_neon;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_neon;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_neon;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+    _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+#    endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+  }
+#   endif
+#  endif
+# endif
+}
+#endif
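
The init routine above always starts from the portable C dispatch table and then, for each capability that both the build flags (OC_ARM_ASM_*) and the running CPU report, overwrites individual slots with accelerated versions. A stripped-down sketch of that pattern with a hypothetical one-entry table (the real oc_enc_ctx and opt_vtable layouts are not reproduced here):

    #include "armcpu.h"

    /*Hypothetical stand-ins, only to show the flag-gated vtable fill.*/
    typedef struct{
      void (*quantize)(short *_qdct,const short *_dct);
    }demo_vtable;

    static void demo_quantize_c(short *_qdct,const short *_dct){}
    static void demo_quantize_neon(short *_qdct,const short *_dct){}

    static void demo_accel_init(demo_vtable *_vt){
      ogg_uint32_t cpu_flags;
      cpu_flags=oc_cpu_flags_get();
      _vt->quantize=demo_quantize_c;        /*portable code is the baseline*/
      if(cpu_flags&OC_CPU_ARM_NEON)_vt->quantize=demo_quantize_neon;
    }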

+ 51 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armenc.h

@@ -0,0 +1,51 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armenc_H)
+# define _arm_armenc_H (1)
+# include "armint.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_arm
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+
+# include "../encint.h"
+
+# if defined(OC_ARM_ASM)
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc);
+
+#  if defined(OC_ARM_ASM_EDSP)
+#   if defined(OC_ARM_ASM_MEDIA)
+#    if defined(OC_ARM_ASM_NEON)
+unsigned oc_enc_frag_satd_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_neon(int *_dc,
+ const unsigned char *_src,int _ystride);
+
+void oc_enc_enquant_table_init_neon(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_neon(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_neon(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 668 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armfrag.asm

@@ -0,0 +1,668 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+@ Vanilla ARM v4 versions
+	.global	_oc_frag_copy_list_arm
+	.global	_oc_frag_recon_intra_arm
+	.global	_oc_frag_recon_inter_arm
+	.global	_oc_frag_recon_inter2_arm
+
+	@ .type oc_frag_copy_list_arm, %function; oc_frag_copy_list_arm: @ PROC
+_oc_frag_copy_list_arm:
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		@ r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp:
+	LDR	r11,[r14,r4,LSL #2]	@ r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	ADD	r4, r1, r11		@ r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		@ r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		@ r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end:
+	LDMFD	r13!,{r4-r6,r11,PC}
+_oc_frag_recon_intra_arm:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm:
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	@ .size oc_frag_copy_list_arm, .-oc_frag_copy_list_arm	@ ENDP
+
+	@ .type oc_frag_recon_inter_arm, %function; oc_frag_recon_inter_arm: @ PROC
+_oc_frag_recon_inter_arm:
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src
+	@ r2 =       int            ystride
+	@ r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm:
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	@ .size oc_frag_recon_inter_arm, .-oc_frag_recon_inter_arm	@ ENDP
+
+	@ .type oc_frag_recon_inter2_arm, %function; oc_frag_recon_inter2_arm: @ PROC
+_oc_frag_recon_inter2_arm:
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src1
+	@ r2 = const unsigned char *src2
+	@ r3 =       int            ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm:
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	@ .size oc_frag_recon_inter2_arm, .-oc_frag_recon_inter2_arm	@ ENDP
+
+  .if OC_ARM_ASM_EDSP
+	.global	_oc_frag_copy_list_edsp
+
+	@ .type oc_frag_copy_list_edsp, %function; oc_frag_copy_list_edsp: @ PROC
+_oc_frag_copy_list_edsp:
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		@ r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp:
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	@ r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	LDRD	r6, r7, [r4, r5]!		@ r4 = _src_frame+frag_buf_off
+	LDRD	r8, r9, [r4, r2]!
+	@ Stall
+	STRD	r6, r7, [r5, r0]!		@ r5 = _dst_frame+frag_buf_off
+	STRD	r8, r9, [r5, r2]!
+	@ Stall
+	LDRD	r6, r7, [r4, r2]!	@ On Xscale at least, doing 3 consecutive
+	LDRD	r8, r9, [r4, r2]!	@ loads causes a stall, but that's no worse
+	LDRD	r10,r11,[r4, r2]!	@ than us only doing 2, and having to do
+				@ another pair of LDRD/STRD later on.
+	@ Stall
+	STRD	r6, r7, [r5, r2]!
+	STRD	r8, r9, [r5, r2]!
+	STRD	r10,r11,[r5, r2]!
+	LDRD	r6, r7, [r4, r2]!
+	LDRD	r8, r9, [r4, r2]!
+	LDRD	r10,r11,[r4, r2]!
+	STRD	r6, r7, [r5, r2]!
+	STRD	r8, r9, [r5, r2]!
+	STRD	r10,r11,[r5, r2]!
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end:
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_frag_copy_list_edsp, .-oc_frag_copy_list_edsp	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_MEDIA
+	.global	_oc_frag_recon_intra_v6
+	.global	_oc_frag_recon_inter_v6
+	.global	_oc_frag_recon_inter2_v6
+
+	@ .type oc_frag_recon_intra_v6, %function; oc_frag_recon_intra_v6: @ PROC
+_oc_frag_recon_intra_v6:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	MOV	r6, #0x0080
+	MOVT	r6, #0x0080
+ofrintra_v6_lp:
+	LDRD	r2, r3, [r12],#8	@ r2 = 11110000 r3 = 33332222
+	LDRD	r4, r5, [r12],#8	@ r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		@ r2 = __11__00
+	USAT16	r3, #8, r3		@ r3 = __33__22
+	USAT16	r4, #8, r4		@ r4 = __55__44
+	USAT16	r5, #8, r5		@ r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	@ r2 = __111100
+	ORR	r3, r3, r3, LSR #8	@ r3 = __333322
+	ORR	r4, r4, r4, LSR #8	@ r4 = __555544
+	ORR	r5, r5, r5, LSR #8	@ r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     @ r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     @ r3 = 77665544
+	STRD	r2, r3, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	@ .size oc_frag_recon_intra_v6, .-oc_frag_recon_intra_v6	@ ENDP
+
+	@ .type oc_frag_recon_inter_v6, %function; oc_frag_recon_inter_v6: @ PROC
+_oc_frag_recon_inter_v6:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp:
+	LDRD	r6, r7, [r3], #8		@ r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+  .if OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, r5, [r1], r2	@ Unaligned ; r4 = 33221100 r5 = 77665544
+  .else
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+  .endif
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r6,r4			@ r6 = __22__00
+	UXTB16	r4,r4, ROR #8		@ r4 = __33__11
+	QADD16	r12,r12,r6		@ r12= xx22xx00
+	QADD16	r4, r7, r4		@ r4 = xx33xx11
+	LDRD	r6, r7, [r3], #8		@ r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		@ r4 = __33__11
+	USAT16	r12,#8,r12		@ r12= __22__00
+	ORR	r4, r12,r4, LSL #8	@ r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 77775555
+	UXTB16	r6,r5			@ r6 = __66__44
+	UXTB16	r5,r5, ROR #8		@ r5 = __77__55
+	QADD16	r12,r12,r6		@ r12= xx66xx44
+	QADD16	r5, r7, r5		@ r5 = xx77xx55
+	USAT16	r12,#8, r12		@ r12= __66__44
+	USAT16	r5, #8, r5		@ r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	@ r5 = 77665544
+	STRD	r4, r5, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	@ .size oc_frag_recon_inter_v6, .-oc_frag_recon_inter_v6	@ ENDP
+
+	@ .type oc_frag_recon_inter2_v6, %function; oc_frag_recon_inter2_v6: @ PROC
+_oc_frag_recon_inter2_v6:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp:
+	LDRD	r6, r7, [r12,#8]	@ r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	@ Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	@ Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	@ r9 = 77775555
+	UHADD8	r4, r4, r5	@ r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			@ r5 = __66__44
+	UXTB16	r4, r4, ROR #8		@ r4 = __77__55
+	QADD16	r8, r8, r5		@ r8 = xx66xx44
+	QADD16	r9, r9, r4		@ r9 = xx77xx55
+	LDRD	r6, r7, [r12],#16	@ r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		@ r8 = __66__44
+	LDR	r4, [r1], r3	@ Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		@ r9 = __77__55
+	LDR	r5, [r2], r3	@ Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	@ r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 22220000
+	UHADD8	r4, r4, r5	@ r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r5, r4			@ r5 = __22__00
+	UXTB16	r4, r4, ROR #8		@ r4 = __33__11
+	QADD16	r8, r8, r5		@ r8 = xx22xx00
+	QADD16	r7, r7, r4		@ r7 = xx33xx11
+	USAT16	r8, #8, r8		@ r8 = __22__00
+	USAT16	r7, #8, r7		@ r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	@ r8 = 33221100
+	STRD	r8, r9, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	@ .size oc_frag_recon_inter2_v6, .-oc_frag_recon_inter2_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	_oc_frag_copy_list_neon
+	.global	_oc_frag_recon_intra_neon
+	.global	_oc_frag_recon_inter_neon
+	.global	_oc_frag_recon_inter2_neon
+
+	@ .type oc_frag_copy_list_neon, %function; oc_frag_copy_list_neon: @ PROC
+_oc_frag_copy_list_neon:
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		@ r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	@ Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	@ Stall (on XScale)
+	MOV	r7, r6			@ Guarantee PLD points somewhere valid.
+ofcl_neon_lp:
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4,:64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4,:64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4,:64], r2
+	LDRGT	r6, [r3,#4]!		@ r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4,:64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4,:64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4,:64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4,:64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4,:64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5,:64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5,:64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5,:64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5,:64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end:
+	LDMFD	r13!,{r4-r7,PC}
+	@ .size oc_frag_copy_list_neon, .-oc_frag_copy_list_neon	@ ENDP
+
+	@ .type oc_frag_recon_intra_neon, %function; oc_frag_recon_intra_neon: @ PROC
+_oc_frag_recon_intra_neon:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	VMOV.I16	Q0, #128
+	VLDMIA	r2,  {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	@ D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	@ D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	@ D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0,:64], r1
+	VQMOVUN.S16	D19,Q11	@ D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0,:64], r1
+	VQMOVUN.S16	D20,Q12	@ D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0,:64], r1
+	VQMOVUN.S16	D21,Q13	@ D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0,:64], r1
+	VQMOVUN.S16	D22,Q14	@ D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0,:64], r1
+	VQMOVUN.S16	D23,Q15	@ D23= !!,:@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0,:64], r1
+	VST1.64	{D22},[r0,:64], r1
+	VST1.64	{D23},[r0,:64], r1
+	MOV	PC,R14
+	@ .size oc_frag_recon_intra_neon, .-oc_frag_recon_intra_neon	@ ENDP
+
+	@ .type oc_frag_recon_inter_neon, %function; oc_frag_recon_inter_neon: @ PROC
+_oc_frag_recon_inter_neon:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	@ etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r2
+	VST1.64	{D22},[r0,:64], r2
+	VST1.64	{D23},[r0,:64], r2
+	MOV	PC,R14
+	@ .size oc_frag_recon_inter_neon, .-oc_frag_recon_inter_neon	@ ENDP
+
+	@ .type oc_frag_recon_inter2_neon, %function; oc_frag_recon_inter2_neon: @ PROC
+_oc_frag_recon_inter2_neon:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	@ Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		@ etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r3
+	VST1.64	{D22},[r0,:64], r3
+	VST1.64	{D23},[r0,:64], r3
+	MOV	PC,R14
+	@ .size oc_frag_recon_inter2_neon, .-oc_frag_recon_inter2_neon	@ ENDP
+  .endif
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif
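
Every reconstruction routine in this file applies the same per-pixel rule to an 8x8 fragment: take the 16-bit residual, add either a 128 bias (intra), a predictor byte (inter), or the truncating average of two predictor bytes (inter2), and saturate the result to an unsigned byte. That is what the ADDS/CMPGT/EORLT sequences and the NEON VQADD/VQMOVUN pairs compute. A rough C restatement of the intra case, a sketch only, with short standing in for ogg_int16_t:

    /*Sketch of oc_frag_recon_intra_*: residual plus a 128 bias, clamped to
       [0,255], written out one row at a time with the given stride.*/
    static void frag_recon_intra_sketch(unsigned char *_dst,int _ystride,
     const short _residue[64]){
      int i;
      int j;
      for(i=0;i<8;i++){
        for(j=0;j<8;j++){
          int v;
          v=_residue[i*8+j]+128;
          _dst[j]=(unsigned char)(v<0?0:v>255?255:v);
        }
        _dst+=_ystride;
      }
    }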

+ 1886 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armidct.asm

@@ -0,0 +1,1886 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global	_oc_idct8x8_1_arm
+	.global	_oc_idct8x8_arm
+
+	@ .type oc_idct8x8_1_arm, %function; oc_idct8x8_1_arm: @ PROC
+_oc_idct8x8_1_arm:
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	@ .size oc_idct8x8_1_arm, .-oc_idct8x8_1_arm	@ ENDP
+
+	@ .type oc_idct8x8_arm, %function; oc_idct8x8_arm: @ PROC
+_oc_idct8x8_arm:
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	SUB	r2, r1, #8*16
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_arm, .-oc_idct8x8_arm	@ ENDP
+
+	@ .type oc_idct8x8_10_arm, %function; oc_idct8x8_10_arm: @ PROC
+oc_idct8x8_10_arm:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+oc_idct8x8_10_arm_cols:
+@ Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_10_arm, .-oc_idct8x8_10_arm	@ ENDP
+
+	@ .type oc_idct8x8_6_arm, %function; oc_idct8x8_6_arm: @ PROC
+oc_idct8x8_6_arm:
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	@ .size oc_idct8x8_6_arm, .-oc_idct8x8_6_arm	@ ENDP
+
+	@ .type oc_idct8x8_3_arm, %function; oc_idct8x8_3_arm: @ PROC
+oc_idct8x8_3_arm:
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	@ .size oc_idct8x8_3_arm, .-oc_idct8x8_3_arm	@ ENDP
+
+	@ .type idct1core_arm, %function; idct1core_arm: @ PROC
+idct1core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	@ .size idct1core_arm, .-idct1core_arm	@ ENDP
+
+	@ .type idct2core_arm, %function; idct2core_arm: @ PROC
+idct2core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		@ y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	@ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	@ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	@ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	@ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct2core_arm, .-idct2core_arm	@ ENDP
+
+	@ .type idct2core_down_arm, %function; idct2core_down_arm: @ PROC
+idct2core_down_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	@r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	@r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	@r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	@r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		@ y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct2core_down_arm, .-idct2core_down_arm	@ ENDP
+
+	@ .type idct3core_arm, %function; idct3core_arm: @ PROC
+idct3core_arm:
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2] = t[0]-t[2]
+					@ r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7]
+	ADD	r5, r10,r5		@ r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		@ r12= t[2]+t2[5]
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	@ r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	@ r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	@ r6 = t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	@ r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	@ .size idct3core_arm, .-idct3core_arm	@ ENDP
+
+	@ .type idct3core_down_arm, %function; idct3core_down_arm: @ PROC
+idct3core_down_arm:
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2]+8 = t[0]-t[2]+8
+					@ r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		@ r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		@ r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	@ r11= t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	@ r5 = t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	@ r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	@ r4 = t2[3] - t[4]  + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	@ .size idct3core_down_arm, .-idct3core_down_arm	@ ENDP
+
+	@ .type idct4core_arm, %function; idct4core_arm: @ PROC
+idct4core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		@ r11= t[0]+t2[7]
+	ADD	r6, r4, r6		@ r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		@ r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		@ r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	@ r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	@ r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	@ r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	@ r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	@ .size idct4core_arm, .-idct4core_arm	@ ENDP
+
+	@ .type idct4core_down_arm, %function; idct4core_down_arm: @ PROC
+idct4core_down_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		@ r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		@ r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		@ r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		@ r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	@ r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	@ r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	@ r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	@ r7 = t[3]-t2[4]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	@ .size idct4core_down_arm, .-idct4core_down_arm	@ ENDP
+
+	@ .type idct8core_arm, %function; idct8core_arm: @ PROC
+idct8core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ Stage 2
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3]
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2]
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7]
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6]
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5]
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	@ .size idct8core_arm, .-idct8core_arm	@ ENDP
+
+	@ .type idct8core_down_arm, %function; idct8core_down_arm: @ PROC
+idct8core_down_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+	@ Stage 2
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	ADD	r2, r2, #8<<16		@ r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		@ r6 = t[1]+8<<16
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3] + 8
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2] + 8
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4] + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	@ .size idct8core_down_arm, .-idct8core_down_arm	@ ENDP
+
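
The truncation that the TODOs and the "spec compliance" comments above keep referring to amounts, in C terms, to casting a butterfly sum back to 16 bits before it is scaled by one of the OC_C* constants, which is exactly what the LSL #16/ASR #16 pair does for x[0]+/-x[4]. A tiny standalone illustration of why the order matters; OC_C4S4 = 0xB505 = 46341 is taken from idct1core_arm above, the inputs are made up so the difference overflows 16 bits, and >>16 on a negative value assumes an arithmetic shift, as ASR performs:

    #include <stdio.h>

    #define OC_C4S4 (46341)

    int main(void){
      int t7=20000;
      int t6=-20000;                          /*t7-t6 = 40000, > 16 bits*/
      int after =OC_C4S4*(t7-t6)>>16;         /*32-bit intermediate (the TODO)*/
      int before=OC_C4S4*(short)(t7-t6)>>16;  /*truncated first, per the spec*/
      printf("truncate after: %d, truncate before: %d\n",after,before);
      return 0;
    }
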
+  .if OC_ARM_ASM_MEDIA
+	.global	_oc_idct8x8_1_v6
+	.global	_oc_idct8x8_v6
+
+	@ .type oc_idct8x8_1_v6, %function; oc_idct8x8_1_v6: @ PROC
+_oc_idct8x8_1_v6:
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	MOV	PC, r14
+	@ .size oc_idct8x8_1_v6, .-oc_idct8x8_1_v6	@ ENDP
+
+	@ .type oc_idct8x8_v6, %function; oc_idct8x8_v6: @ PROC
+_oc_idct8x8_v6:
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	@CMP	r2, #6
+	@BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, r5, [r1,#-8*16]!
+	STRD	r4, r5, [r1,#8]
+	STRD	r4, r5, [r1,#16]
+	STRD	r4, r5, [r1,#24]
+	STRD	r4, r5, [r1,#32]
+	STRD	r4, r5, [r1,#40]
+	STRD	r4, r5, [r1,#48]
+	STRD	r4, r5, [r1,#56]
+	STRD	r4, r5, [r1,#64]
+	STRD	r4, r5, [r1,#72]
+	STRD	r4, r5, [r1,#80]
+	STRD	r4, r5, [r1,#88]
+	STRD	r4, r5, [r1,#96]
+	STRD	r4, r5, [r1,#104]
+	STRD	r4, r5, [r1,#112]
+	STRD	r4, r5, [r1,#120]
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_v6, .-oc_idct8x8_v6	@ ENDP
+
+	@ .type oc_idct8x8_10_v6, %function; oc_idct8x8_10_v6: @ PROC
+oc_idct8x8_10_v6:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	@ Align the stack.
+	ADD	r0, r0, r2	@ Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, r5, [r1,#-4*16]!
+	STRD	r4, r5, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	@ Align the stack.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_10_v6, .-oc_idct8x8_10_v6	@ ENDP
+
+	@ .type oc_idct8x8_3_v6, %function; oc_idct8x8_3_v6: @ PROC
+oc_idct8x8_3_v6:
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2_1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r8		@ Write to the final destination.
+@ Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	@ .size oc_idct8x8_3_v6, .-oc_idct8x8_3_v6	@ ENDP
+
+	@ .type idct2_1core_v6, %function; idct2_1core_v6: @ PROC
+idct2_1core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		@ r6 = x[1,0]
+	SMULWB	r12,r3, r2		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, r5, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		@ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		@ r7 = <0|t[0,7]>
+@ Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	@ r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		@ r4 = <0|t[0,4]>
+	SADDSUBX	r5, r5, r5		@ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+@ Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	@ r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		@ r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		@ r3 = t[0]+t[7]
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		@ r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		@ r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		@ r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct2_1core_v6, .-idct2_1core_v6	@ ENDP
+  .endif
+
+	.balign 8
+OC_C7S1:
+	.word	12785 @ 31F1
+OC_C1S7:
+	.word	64277 @ FB15
+OC_C6S2:
+	.word	25080 @ 61F8
+OC_C2S6:
+	.word	60547 @ EC83
+OC_C5S3:
+	.word	36410 @ 8E3A
+OC_C3S5:
+	.word	54491 @ D4DB
+OC_C4S4:
+	.word	46341 @ B505
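+@ (These appear to be the usual Theora IDCT cosine constants, i.e. roughly
+@  round(65536*cos(k*pi/16)) for k=1..7, with OC_C4S4 = round(65536/sqrt(2)).)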
+
+  .if OC_ARM_ASM_MEDIA
+	@ .type idct2_2core_down_v6, %function; idct2_2core_down_v6: @ PROC
+idct2_2core_down_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			@ r7  = 8
+	LDR	r6, [r1], #16		@ r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, r5, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		@ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		@ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	@ r7 = <t[0,7]|t[0,7]>
+@ Stage 2:
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	@ r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		@ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		@ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	@ r2 = <t[1,5]|t[0,5]>
+@ Stage 3:
+	SSUB16	r5, r6, r2		@ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		@ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+@ Stage 4:
+	SADD16	r2, r12,r7		@ r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		@ r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		@ r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		@ r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	@ r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		@ y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	@ r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		@ y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	@ r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		@ y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	@ r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	@ .size idct2_2core_down_v6, .-idct2_2core_down_v6	@ ENDP
+
+@ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+@  pay for increased branch mis-prediction to get here, but in practice it
+@  doesn't seem to slow anything down to take it out, and it's less code this
+@  way.
+  .if 0
+	@ .type oc_idct8x8_6_v6, %function; oc_idct8x8_6_v6: @ PROC
+_oc_idct8x8_6_v6:
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	@ Align the stack.
+	ADD	r0, r0, r13	@ Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, r5, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	@ Align the stack.
+	MOV	r0, r8		@ Write to the final destination.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	@ .size oc_idct8x8_6_v6, .-oc_idct8x8_6_v6	@ ENDP
+
+	@ .type idct1core_v6, %function; idct1core_v6: @ PROC
+_idct1core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	@ Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	@ .size idct1core_v6, .-idct1core_v6	@ ENDP
+
+	@ .type idct3_2core_v6, %function; idct3_2core_v6: @ PROC
+_idct3_2core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r4, r5, [r1], #16		@ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,r11,OC_C6S2_3_v6	@ r10= OC_C6S2; r11= OC_C2S6
+	@ Stall
+	SMULWB	r3, r11,r5		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		@ r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, r7, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		@ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		@ r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		@ r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		@ r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		@ r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_3core_stage3_v6
+	@ .size idct3_2core_v6, .-idct3_2core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_3_v6:
+	.word	12785 @ 31F1
+OC_C1S7_3_v6:
+	.word	64277 @ FB15
+OC_C6S2_3_v6:
+	.word	25080 @ 61F8
+OC_C2S6_3_v6:
+	.word	60547 @ EC83
+
+	@ .type idct3_3core_down_v6, %function; idct3_3core_down_v6: @ PROC
+_idct3_3core_down_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,r11,[r1], #16		@ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, r7, OC_C6S2_3_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		@ r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+@ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	@ r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	@ r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, r7, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_4core_down_stage3_v6
+	@ .size idct3_3core_down_v6, .-idct3_3core_down_v6	@ ENDP
+  .endif
+
+	@ .type idct4_3core_v6, %function; idct4_3core_v6: @ PROC
+idct4_3core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, r3, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, r5, [r1], #16		@ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		@ r9 = <0|t[0,6]>
+	LDRD	r6, r7, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		@ r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		@ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, r7, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_3core_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]=t[0]+t[3]
+	SSUB16	r3, r12,r3		@ r3 = t[3]=t[0]-t[3]
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]
+	STR	r12,[r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]
+	STR	r12,[r0, #12]		@ y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]
+	STR	r12,[r0, #28]		@ y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]
+	STR	r12,[r0, #44]		@ y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct4_3core_v6, .-idct4_3core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_4_v6:
+	.word	12785 @ 31F1
+OC_C1S7_4_v6:
+	.word	64277 @ FB15
+OC_C6S2_4_v6:
+	.word	25080 @ 61F8
+OC_C2S6_4_v6:
+	.word	60547 @ EC83
+OC_C5S3_4_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_4_v6:
+	.word	54491 @ D4DB
+
+	@ .type idct4_4core_down_v6, %function; idct4_4core_down_v6: @ PROC
+idct4_4core_down_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, r3, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, r5, [r1], #16	@ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, r7, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+@ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	@ r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	@ r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, r7, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_4core_down_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	@ .size idct4_4core_down_v6, .-idct4_4core_down_v6	@ ENDP
+
+	@ .type idct8_8core_v6, %function; idct8_8core_v6: @ PROC
+idct8_8core_v6:
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,r11,OC_C5S3_4_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,r11,OC_C6S2_4_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,r11,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		@ r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		@ r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		@ r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		@ r8 = t[1,1]=OC_C4S4*r4T>>16
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	@ .size idct8_8core_v6, .-idct8_8core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_8_v6:
+	.word	12785 @ 31F1
+OC_C1S7_8_v6:
+	.word	64277 @ FB15
+OC_C6S2_8_v6:
+	.word	25080 @ 61F8
+OC_C2S6_8_v6:
+	.word	60547 @ EC83
+OC_C5S3_8_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_8_v6:
+	.word	54491 @ D4DB
+
+	@ .type idct8_8core_down_v6, %function; idct8_8core_down_v6: @ PROC
+idct8_8core_down_v6:
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,r11,OC_C5S3_8_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,r11,OC_C6S2_8_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,r11,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		@ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		@ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		@ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		@ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		@ r3 = t[3]+8=t[0]-t[3]+8
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		@ y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		@ y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		@ y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	@ r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		@ y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	@ r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		@ y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	@ r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		@ y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	@ r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	@ .size idct8_8core_down_v6, .-idct8_8core_down_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	_oc_idct8x8_1_neon
+	.global	_oc_idct8x8_neon
+
+	.balign 16
+OC_IDCT_CONSTS_NEON:
+	.short	    8
+	.short	64277 @ FB15 (C1S7)
+	.short	60547 @ EC83 (C2S6)
+	.short	54491 @ D4DB (C3S5)
+	.short	46341 @ B505 (C4S4)
+	.short	36410 @ 8E3A (C5S3)
+	.short	25080 @ 61F8 (C6S2)
+	.short	12785 @ 31F1 (C7S1)
+
+	@ .type oc_idct8x8_1_neon, %function; oc_idct8x8_1_neon: @ PROC
+_oc_idct8x8_1_neon:
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]
+	MOV	PC, r14
+	@ .size oc_idct8x8_1_neon, .-oc_idct8x8_1_neon	@ ENDP
+
+	@ .type oc_idct8x8_neon, %function; oc_idct8x8_neon: @ PROC
+_oc_idct8x8_neon:
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon:
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	@ Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2,:128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2,:128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2,:128]!
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2,:128]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3,:128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	@ 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	@ Column transforms
+	BL	oc_idct8x8_stage123_neon
+	@ We have to put the return address back in the LR, or the branch
+	@  predictor will not recognize the function return and mis-predict the
+	@  entire call stack.
+	MOV	r14, r12
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	@ .size oc_idct8x8_neon, .-oc_idct8x8_neon	@ ENDP
+
+	@ .type oc_idct8x8_stage123_neon, %function; oc_idct8x8_stage123_neon: @ PROC
+oc_idct8x8_stage123_neon:
+@ Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	@ Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	@ Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	@ Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	@ Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	@ Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	@ Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	@ Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	@ Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	@ Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	@ Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	@ Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	@ Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	@ Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	@ Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	@ Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	@ Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	@ Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	@ Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	@ Q4 = t[4]=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	@ Q7 = t[7]=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	@ Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	@ Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	@ Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	@ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	@ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
+@ Stage 3
+	VSUB.S16	Q11,Q8, Q3	@ Q11 = t[3]=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	@ Q8  = t[0]=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	@ Q9  = t[1]=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	@ Q3  = t[6]=t[6]+t[5]
+	VSUB.S16	Q10,Q1, Q2	@ Q10 = t[2]=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	@ Q5  = t[5]=t[6]-t[5]
+	MOV	PC, r14
+	@ .size oc_idct8x8_stage123_neon, .-oc_idct8x8_stage123_neon	@ ENDP
+
+	@ .type oc_idct8x8_10_neon, %function; oc_idct8x8_10_neon: @ PROC
+oc_idct8x8_10_neon:
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3,:128]
+	MOV	r2, r1
+	@ Row transforms (input is pre-transposed)
+@ Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2,:128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	@ Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2,:64], r12
+	VMULL.S16	Q2, D18,D0[1]	@ Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2,:64]
+	VMULL.S16	Q14,D17,D0[2]	@ Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	@ Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	@ Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	@ Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	@ Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	@ D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	@ D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	@ D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	@ D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	@ D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	@ D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	@ D2 = t[2]
+	VADD.S16	D4, D4, D18	@ D4 = t[7]
+	VADD.S16	D6, D6, D19	@ D6 = t[6]
+	VADD.S16	D7, D7, D19	@ D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	@ D30= t[0]
+					@ D31= t[3]
+@ Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	@ D24= t[7]-t[6]
+					@ D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	@ D26= t[7]=t[7]+t[6]
+					@ D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	@ Q11= OC_C4S4*(t[7]-t[6])
+					@       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	@ Q14= OC_C4S4*(t[4]-t[5])
+					@       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	@ D16= t[0]=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	@ D17= t[2]=t[0]-t[2]
+	VADD.S16	D18,D30,D2	@ D18= t[1]=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	@ D22= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	@ D23= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	@ D19= t[3]=t[0]-t[3]
+	VADD.S16	D22,D22,D24	@ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	@ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	@ D27= t[5]=t[6]-t[5]
+	VADD.S16	D24,D22,D23	@ D24= t[6]=t[6]+t[5]
+@ Stage 4
+	VSUB.S16	Q11,Q8, Q13	@ D22= y[7]=t[0]-t[7]
+					@ D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	@ D20= y[6]=t[1]-t[6]
+					@ D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	@ D16= y[0]=t[0]+t[7]
+					@ D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	@ D18= y[1]=t[1]+t[6]
+					@ D19= y[3]=t[3]'+t[4]''
+	@ 8x4 transpose
+	VTRN.16		Q10,Q11		@ Q10= c5c4a5a4 c7c6a7a6
+					@ Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		@ Q8 = c3c2a3a2 c1c0a1a0
+					@ Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		@ Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		@ Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		@ Q9 = b7b6b5b4 b3b2b1b0
+					@ Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		@ Q8 = a7a6a5a4 a3a2a1a0
+					@ Q10= c7c6c5c4 c3c2c1c0
+	@ Column transforms
+@ Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	@ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	@ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	@  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	@ Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	@ Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	@ Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	@ Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	@ Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	@ Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	@  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	@  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	@ Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	@ Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	@  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	@ Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	@ Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	@ Q15= t[7]=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	@ Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	@ Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	@ Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	@ Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q10= OC_C4S4*(t[4]-t[5])
+					@           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	@ Q12= t[4]=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	@ Q13:Q14= OC_C4S4*(t[7]-t[6])
+					@           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	@ Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	@ Q11= t[0]=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	@ Q3 = t[3]=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	@ Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	@ Q10=t[5]
+	VADD.S16	Q14,Q14,Q9	@ Q14=t[6]
+	VSUB.S16	Q13,Q14,Q10	@ Q13=t[5]=t[6]-t[5]
+	VADD.S16	Q14,Q14,Q10	@ Q14=t[6]=t[6]+t[5]
+	VADD.S16	Q10,Q1, Q2	@ Q10= t[1]=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	@ Q2 = t[2]=t[0]-t[2]
+@ Stage 4
+	VADD.S16	Q8, Q11,Q15	@ Q8  = y[0]=t[0]+t[7]
+	VADD.S16	Q9, Q10,Q14	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q15,Q11,Q15	@ Q15 = y[7]=t[0]-t[7]
+	VSUB.S16	Q14,Q10,Q14	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q10,Q2, Q13	@ Q10 = y[2]=t[2]+t[5]
+	VADD.S16	Q11,Q3, Q12	@ Q11 = y[3]=t[3]+t[4]
+	VSUB.S16	Q12,Q3, Q12	@ Q12 = y[4]=t[3]-t[4]
+	VSUB.S16	Q13,Q2, Q13	@ Q13 = y[5]=t[2]-t[5]
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1,:64]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	@ .size oc_idct8x8_10_neon, .-oc_idct8x8_10_neon	@ ENDP
+  .endif
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif

+ 126 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armint.h

@@ -0,0 +1,126 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+#  if defined(__ARMEB__)
+#   error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+#  endif
+
+#  define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+#  define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+  ((oc_loop_filter_frag_rows_arm_func) \
+   (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+   (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+   (_bv), \
+   (_state)->frags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset, \
+   (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+   (_state)->frag_buf_offs, \
+   (_state)->fplanes[(_pli)].nhfrags)
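+/*E.g. (illustrative only): a call site written as
+   oc_state_loop_filter_frag_rows(_state,bv,refi,pli,fragy0,fragy_end);
+  therefore jumps straight into the asm routine, with the row range already
+  converted to fragment indices by the macro above.*/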
+/*For everything else the default vtable macros are fine.*/
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#  if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+#   if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#    if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 691 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armloop.asm

@@ -0,0 +1,691 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global	_oc_loop_filter_frag_rows_arm
+
+@ Which bit this is depends on the order of packing within a bitfield.
+@ Hopefully that doesn't change among any of the relevant compilers.
+ .set OC_FRAG_CODED_FLAG,	1
+
+	@ Vanilla ARM v4 version
+	@ .type loop_filter_h_arm, %function; loop_filter_h_arm: @ PROC
+loop_filter_h_arm:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp:
+	LDRB	r3, [r0, #-2]		@ r3 = _pix[0]
+	LDRB	r12,[r0, #1]		@ r12= _pix[3]
+	LDRB	r4, [r0, #-1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	@ @ .size loop_filter_h_arm, .-loop_filter_h_arm	@ ENDP
+
+	@ .type loop_filter_v_arm, %function; loop_filter_v_arm: @ PROC
+loop_filter_v_arm:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp:
+	LDRB	r3, [r0, -r1, LSL #1]	@ r3 = _pix[0]
+	LDRB	r12,[r0, r1]		@ r12= _pix[3]
+	LDRB	r4, [r0, -r1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	@ @ .size loop_filter_v_arm, .-loop_filter_v_arm	@ ENDP
+
+	@ .type oc_loop_filter_frag_rows_arm, %function; oc_loop_filter_frag_rows_arm: @ PROC
+_oc_loop_filter_frag_rows_arm:
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	@ _bv += 127
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_arm_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	@ @ .size oc_loop_filter_frag_rows_arm, .-oc_loop_filter_frag_rows_arm	@ ENDP
+
+  .if OC_ARM_ASM_MEDIA
+	.global	_oc_loop_filter_init_v6
+	.global	_oc_loop_filter_frag_rows_v6
+
+	@ .type oc_loop_filter_init_v6, %function; oc_loop_filter_init_v6: @ PROC
+_oc_loop_filter_init_v6:
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		@ r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		@ r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	@ r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	@ r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	@ @ .size oc_loop_filter_init_v6, .-oc_loop_filter_init_v6	@ ENDP
+
+@ We could use the same strategy as the v filter below, but that would require
+@  40 instructions to load the data and transpose it into columns and another
+@  32 to write out the results at the end, plus the 52 instructions to do the
+@  filtering itself.
+@ This is slightly less, and less code, even assuming we could have shared the
+@  52 instructions in the middle with the other function.
+@ It executes slightly fewer instructions than the ARMv6 approach David Conrad
+@  proposed for FFmpeg, but not by much:
+@  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+@ His is a lot less code, though, because it only does two rows at once instead
+@  of four.
+	@ .type loop_filter_h_v6, %function; loop_filter_h_v6: @ PROC
+loop_filter_h_v6:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	MOV	r12, 0x0003
+	MOVT	r12, 0x1
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	@ @ .size loop_filter_h_v6, .-loop_filter_h_v6	@ ENDP
+
+	@ .type loop_filter_h_core_v6, %function; loop_filter_h_core_v6: @ PROC
+loop_filter_h_core_v6:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ r12= 0x10003
+	@ Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		@ r4 = <p3|p2|p1|p0>
+	@ Single issue
+	LDR	r5,[r0, r1]!		@ r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		@ r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		@ r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	@ r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	@ r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		@ r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		@ r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		@ r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		@ r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	@ r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		@ r6 = <r0|r2>
+	UXTB16	r11,r11			@ r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		@ r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	@ r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		@ r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		@ r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	@ r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	@ r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	@ r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	@ r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			@ r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	@ r6 = <-R_s|-R_q|-R_r|-R_p>
+	@ Single issue
+	@ There's no min, max or abs instruction.
+	@ SSUB8 and SEL will work for abs, and we can do all the rest with
+	@  unsigned saturated adds, which means the GE flags are still all
+	@  set when we're done computing lflim(abs(R_i),L).
+	@ This allows us to both add and subtract, and split the results by
+	@  the original sign of R_i.
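+@ (For reference, the limiter computed here reduces to
+@   lflim(R,L) = sign(R)*min(abs(R),max(2*L-abs(R),0)),
+@  i.e. R itself for abs(R)<=L, tapering as 2*L-abs(R), and 0 once abs(R)>=2*L.)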
+	SSUB8	r7, r10,r6
+	@ Single issue
+	SEL	r7, r7, r6		@ r7 = abs(R_i)
+	@ Single issue
+	UQADD8	r4, r7, r2		@ r4 = 255-max(2*L-abs(R_i),0)
+	@ Single issue
+	UQADD8	r7, r7, r4
+	@ Single issue
+	UQSUB8	r7, r7, r4		@ r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	@ Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		@ r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		@ r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		@ r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		@ r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		@ r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		@ r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		@ r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		@ r4 = q1
+	STRB	r4, [r0,#-1]
+	@ Single issue
+	STRB	r9, [r0,-r1]!
+	@ Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	@ @ .size loop_filter_h_core_v6, .-loop_filter_h_core_v6	@ ENDP
+
+@ This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+@  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+@ This works just as well, with the following procedure for computing the
+@  filter value, f:
+@   u = ~UHADD8(p1,~p2);
+@   v = UHADD8(~p1,p2);
+@   m = v-u;
+@   a = m^UHADD8(m^p0,m^~p3);
+@   f = UHADD8(UHADD8(a,u),v);
+@  where f = 127+R, with R in [-127,128] defined as in the spec.
+@ This is exactly the same amount of arithmetic as the version that uses PAVGB
+@  as the basic operator.
+@ It executes about 2/3 the number of instructions of David Conrad's approach,
+@  but requires more code, because it does all eight columns at once, instead
+@  of four at a time.
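The recipe above can be spot-checked on a single byte lane in plain C. The sketch below is illustrative only: havg() stands in for one lane of UHADD8 (a truncating average), cpl() for a byte-wide complement, and the reference value 127+R uses the formula quoted above; function names and the random sampling are assumptions of this sketch, not part of the library.

#include <stdio.h>
#include <stdlib.h>

/* One byte lane of UHADD8: truncating average (a+b)>>1. */
static unsigned havg(unsigned a,unsigned b){return (a+b)>>1;}
/* One byte lane of a bitwise complement. */
static unsigned cpl(unsigned a){return a^0xFF;}

/* The branch-free recipe from the comment above, on one lane. */
static unsigned filter_avg(unsigned p0,unsigned p1,unsigned p2,unsigned p3){
  unsigned u,v,m,a;
  u=cpl(havg(p1,cpl(p2)));
  v=havg(cpl(p1),p2);
  m=(v-u)&0xFF;
  a=m^havg(m^p0,m^cpl(p3));
  return havg(havg(a,u),v);
}

/* 127+R, with R as defined in the spec (arithmetic right shift assumed). */
static int filter_ref(int p0,int p1,int p2,int p3){
  return 127+((p0-p3+3*(p2-p1)+4)>>3);
}

int main(void){
  int i;
  srand(42);
  for(i=0;i<(1<<24);i++){
    unsigned p0=rand()&0xFF,p1=rand()&0xFF,p2=rand()&0xFF,p3=rand()&0xFF;
    if((int)filter_avg(p0,p1,p2,p3)!=filter_ref(p0,p1,p2,p3)){
      printf("mismatch: %u %u %u %u\n",p0,p1,p2,p3);
      return 1;
    }
  }
  printf("recipe matched 127+R on all sampled pixel quadruples\n");
  return 0;
}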
+	@ .type loop_filter_v_v6, %function; loop_filter_v_v6: @ PROC
+loop_filter_v_v6:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, r7, [r0, -r1]!		@ r7, r6 = <p5|p1>
+	LDRD	r4, r5, [r0, -r1]		@ r5, r4 = <p4|p0>
+	LDRD	r8, r9, [r0, r1]!		@ r9, r8 = <p6|p2>
+	MVN	r14,r6			@ r14= ~p1
+	LDRD	r10,r11,[r0, r1]		@ r11,r10= <p7|p3>
+	@ Filter the first four columns.
+	MVN	r12,r8			@ r12= ~p2
+	UHADD8	r14,r14,r8		@ r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		@ r12= p1+~p2>>1
+	MVN	r10, r10		@ r10=~p3
+	MVN	r12,r12			@ r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		@ r14= m1=v1-u1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = m1^p0
+	EOR	r10,r10,r14		@ r10= m1^~p3
+	UHADD8	r4, r4, r10		@ r4 = (m1^p0)+(m1^~p3)>>1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		@ r14= v1=m1+u1
+	UHADD8	r4, r4, r12		@ r4 = a1+u1>>1
+	MVN	r12,r9			@ r12= ~p6
+	UHADD8	r4, r4, r14		@ r4 = f1=(a1+u1>>1)+v1>>1
+	@ Filter the second four columns.
+	MVN	r14,r7			@ r14= ~p5
+	UHADD8	r12,r12,r7		@ r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		@ r14= v2=~p5+p6>>1
+	MVN	r12,r12			@ r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			@ r11=~p7
+	SSUB8	r10,r14,r12		@ r10= m2=v2-u2
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = m2^p4
+	EOR	r11,r11,r10		@ r11= m2^~p7
+	UHADD8	r5, r5, r11		@ r5 = (m2^p4)+(m2^~p7)>>1
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	@ Single issue
+	UHADD8	r5, r5, r12		@ r5 = a2+u2>>1
+	MOV	r12, #0x7F7F		@ r12 = {127}x4
+	MOVT	r12, #0x7F7F		@ r12 = {127}x4
+	UHADD8	r5, r5, r14		@ r5 = f2=(a2+u2>>1)+v2>>1
+	@ Now split f[i] by sign.
+	@ There's no min or max instruction.
+	@ We could use SSUB8 and SEL, but this is just as many instructions and
+	@  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		@ r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		@ r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		@ r11= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r14,r4, r2		@ r14= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		@ r10= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r4, r4, r14		@ r4 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r11,r5, r12		@ r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		@ r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		@ r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		@ r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		@ r10= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r14,r5, r2		@ r14= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		@ r11= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r5, r5, r14		@ r5 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		@ r7 = p5+lflim(R_i,L)
+	STRD	r6, r7, [r0, -r1]		@ [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		@ r9 = p6-lflim(R_i,L)
+	STRD	r8, r9, [r0]		@ [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	@ @ .size loop_filter_v_v6, .-loop_filter_v_v6	@ ENDP
+
+	@ .type oc_loop_filter_frag_rows_v6, %function; oc_loop_filter_frag_rows_v6: @ PROC
+_oc_loop_filter_frag_rows_v6:
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	@ ll = *(int *)_bv
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_v6_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		@ if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	@ @ .size oc_loop_filter_frag_rows_v6, .-oc_loop_filter_frag_rows_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	_oc_loop_filter_init_neon
+	.global	_oc_loop_filter_frag_rows_neon
+
+	@ .type oc_loop_filter_init_neon, %function; oc_loop_filter_init_neon: @ PROC
+_oc_loop_filter_init_neon:
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  @ r1 = 2*L
+	VDUP.S16	Q15, r1		@ Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0,:128]
+	MOV	PC,r14
+	@ @ .size oc_loop_filter_init_neon, .-oc_loop_filter_init_neon	@ ENDP
+
+	@ .type loop_filter_h_neon, %function; loop_filter_h_neon: @ PROC
+loop_filter_h_neon:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	@ Doing a 2-element structure load saves doing two VTRN's below, at the
+	@  cost of using two more slower single-lane loads vs. the faster
+	@  all-lane loads.
+	@ It's less code this way, though, and benches a hair faster, but it
+	@  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		@ D0 = ____________1100     2,1
+						@ D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		@ D4 = ____________5544     2,1
+						@ D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		@ D0 = ________99881100     3,1
+						@ D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		@ D4 = ________DDCC5544     3,1
+						@ D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		@ D0 = ____GGHH99881100     3,1
+						@ D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		@ D4 = ____KKLLDDCC5544     3,1
+						@ D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		@ D0 = PPOOGGHH99881100     3,1
+						@ D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		@ D4 = TTSSKKLLDDCC5544     3,1
+						@ D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	@ D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	@ D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	@ Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   @ Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we negate the negative elements
+	@ and then add them, using -x = ~(x-1): add the sign mask in Q0
+	@ (0 or -1), then XOR with it.
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 @ Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 @ Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		@ D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		@ D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		@ D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	@ @ .size loop_filter_h_neon, .-loop_filter_h_neon	@ ENDP
+
+	@ .type loop_filter_v_neon, %function; loop_filter_v_neon: @ PROC
+loop_filter_v_neon:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12,:64], r1		@ D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12,:64], r1		@ D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12,:64], r1		@ D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12,:64]			@ D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	@ Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	@ Stall x2
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   @ Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	@ Stall x2
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we negate the negative elements
+	@ and then add them, using -x = ~(x-1): add the sign mask in Q0
+	@ (0 or -1), then XOR with it.
+	@ Stall x3
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	@ Stall x2
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 @ Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 @ Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		@ D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		@ D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12,:64], r1
+	VST1.64	{D4}, [r12,:64], r1
+	MOV	PC,r14
+	@ @ .size loop_filter_v_neon, .-loop_filter_v_neon	@ ENDP
+
+	@ .type oc_loop_filter_frag_rows_neon, %function; oc_loop_filter_frag_rows_neon: @ PROC
+_oc_loop_filter_frag_rows_neon:
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	@		  bail
+	VLD1.64	{D30,D31}, [r2,:128]	@ Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_neon_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	@ @ .size oc_loop_filter_frag_rows_neon, .-oc_loop_filter_frag_rows_neon	@ ENDP
+  .endif
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif
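Per the formulas spelled out in the comments of the routines above, the filter reduces to: compute R across the edge, clamp it with lflim(R,L), then adjust the two pixels nearest the edge. A plain scalar rendering of that sequence is sketched here; the names lflim, clamp255 and filter_span are illustrative, saturation to [0,255] stands in for the UQADD8/VQMOVUN steps, and an arithmetic right shift is assumed.

#include <stdio.h>
#include <stdlib.h>

/* lflim(R,L), as in the comments above: sign(R)*MIN(|R|,MAX(2*L-|R|,0)). */
static int lflim(int r,int two_l){
  int a=abs(r);
  int lim=two_l-a;
  if(lim<0)lim=0;          /* MAX(2*L-|R|,0) */
  if(a>lim)a=lim;          /* MIN(|R|,...) */
  return r<0?-a:a;         /* restore the sign of R */
}

static unsigned char clamp255(int x){return x<0?0:x>255?255:(unsigned char)x;}

/* Filter one 4-pixel span p[0..3]; the edge lies between p[1] and p[2]. */
static void filter_span(unsigned char *p,int two_l){
  int r=(p[0]-p[3]+3*(p[2]-p[1])+4)>>3;   /* arithmetic shift assumed */
  int f=lflim(r,two_l);
  p[1]=clamp255(p[1]+f);
  p[2]=clamp255(p[2]-f);
}

int main(void){
  unsigned char p[4]={40,60,120,130};     /* arbitrary sample values */
  filter_span(p,2*10);                    /* L=10, so 2*L=20 */
  printf("filtered: %u %u %u %u\n",p[0],p[1],p[2],p[3]);
  return 0;
}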

+ 219 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armstate.c

@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+#  endif
+# endif
+
+#endif

+ 114 - 0
modules/theoraplayer/native/theora/lib/bitpack.c

@@ -0,0 +1,114 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitpack.c 17410 2010-09-21 21:53:48Z tterribe $
+
+ ********************************************************************/
+#include <string.h>
+#include <stdlib.h>
+#include "bitpack.h"
+
+/*We're 'MSb' endian; if we write a word but read individual bits,
+   then we'll read the MSb first.*/
+
+void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes){
+  memset(_b,0,sizeof(*_b));
+  _b->ptr=_buf;
+  _b->stop=_buf+_bytes;
+}
+
+static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  unsigned             shift;
+  stop=_b->stop;
+  ptr=_b->ptr;
+  window=_b->window;
+  available=_b->bits;
+  shift=OC_PB_WINDOW_SIZE-available;
+  while(7<shift&&ptr<stop){
+    shift-=8;
+    window|=(oc_pb_window)*ptr++<<shift;
+  }
+  _b->ptr=ptr;
+  available=OC_PB_WINDOW_SIZE-shift;
+  if(_bits>available){
+    if(ptr>=stop){
+      _b->eof=1;
+      available=OC_LOTS_OF_BITS;
+    }
+    else window|=*ptr>>(available&7);
+  }
+  _b->bits=available;
+  return window;
+}
+
+int oc_pack_look1(oc_pack_buf *_b){
+  oc_pb_window window;
+  int          available;
+  window=_b->window;
+  available=_b->bits;
+  if(available<1)_b->window=window=oc_pack_refill(_b,1);
+  return window>>OC_PB_WINDOW_SIZE-1;
+}
+
+void oc_pack_adv1(oc_pack_buf *_b){
+  _b->window<<=1;
+  _b->bits--;
+}
+
+/*Here we assume that 0<=_bits&&_bits<=32.*/
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
+  oc_pb_window window;
+  int          available;
+  long         result;
+  window=_b->window;
+  available=_b->bits;
+  if(_bits==0)return 0;
+  if(available<_bits){
+    window=oc_pack_refill(_b,_bits);
+    available=_b->bits;
+  }
+  result=window>>OC_PB_WINDOW_SIZE-_bits;
+  available-=_bits;
+  window<<=1;
+  window<<=_bits-1;
+  _b->window=window;
+  _b->bits=available;
+  return result;
+}
+
+int oc_pack_read1_c(oc_pack_buf *_b){
+  oc_pb_window window;
+  int          available;
+  int          result;
+  window=_b->window;
+  available=_b->bits;
+  if(available<1){
+    window=oc_pack_refill(_b,1);
+    available=_b->bits;
+  }
+  result=window>>OC_PB_WINDOW_SIZE-1;
+  available--;
+  window<<=1;
+  _b->window=window;
+  _b->bits=available;
+  return result;
+}
+
+long oc_pack_bytes_left(oc_pack_buf *_b){
+  if(_b->eof)return -1;
+  return _b->stop-_b->ptr+(_b->bits>>3);
+}
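The unpacker above reads MSb-first: whole bytes are shifted into the top of a machine-word window and each field is taken from the top of that window. A hedged usage sketch follows, assuming it is compiled together with this bitpack.c and the bitpack.h declared below; the buffer contents are arbitrary.

#include <stdio.h>
#include "bitpack.h"

int main(void){
  unsigned char buf[2]={0xA5,0x3C};     /* bits, MSb first: 10100101 00111100 */
  oc_pack_buf b;
  oc_pack_readinit(&b,buf,sizeof(buf));
  printf("%ld\n",oc_pack_read(&b,3));   /* 101 -> 5 */
  printf("%d\n",oc_pack_look1(&b));     /* peek the next bit: 0 */
  oc_pack_adv1(&b);                     /* ...then consume it */
  printf("%ld\n",oc_pack_read(&b,8));   /* 01010011 -> 83 */
  printf("%ld\n",oc_pack_bytes_left(&b));
  return 0;
}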

+ 76 - 0
modules/theoraplayer/native/theora/lib/bitpack.h

@@ -0,0 +1,76 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $
+
+ ********************************************************************/
+#if !defined(_bitpack_H)
+# define _bitpack_H (1)
+# include <stddef.h>
+# include <limits.h>
+# include "internal.h"
+
+
+
+typedef size_t             oc_pb_window;
+typedef struct oc_pack_buf oc_pack_buf;
+
+
+
+/*Custom bitpacker implementations.*/
+# if defined(OC_ARM_ASM)
+#  include "arm/armbits.h"
+# endif
+
+# if !defined(oc_pack_read)
+#  define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+#  define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+#  define oc_huff_token_decode oc_huff_token_decode_c
+# endif
+
+# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+# define OC_LOTS_OF_BITS (0x40000000)
+
+
+
+struct oc_pack_buf{
+  const unsigned char *stop;
+  const unsigned char *ptr;
+  oc_pb_window         window;
+  int                  bits;
+  int                  eof;
+};
+
+void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
+int oc_pack_look1(oc_pack_buf *_b);
+void oc_pack_adv1(oc_pack_buf *_b);
+/*Here we assume 0<=_bits&&_bits<=32.*/
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
+/* returns -1 for read beyond EOF, or the number of whole bytes available */
+long oc_pack_bytes_left(oc_pack_buf *_b);
+
+/*These two functions are implemented locally in huffdec.c*/
+/*Read in bits without advancing the bitptr.
+  Here we assume 0<=_bits&&_bits<=32.*/
+/*static int oc_pack_look(oc_pack_buf *_b,int _bits);*/
+/*static void oc_pack_adv(oc_pack_buf *_b,int _bits);*/
+
+#endif

+ 153 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xdec.c

@@ -0,0 +1,153 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#include "c64xdec.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_dec_accel_init_c64x(oc_dec_ctx *_dec){
+# if defined(OC_DEC_USE_VTABLE)
+  _dec->opt_vtable.dc_unpredict_mcu_plane=oc_dec_dc_unpredict_mcu_plane_c64x;
+# endif
+}
+
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+   rows).
+  As a side effect, the number of coded and uncoded fragments in this plane of
+   the MCU is also computed.*/
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frags;
+  int                     *pred_last;
+  ptrdiff_t                ncoded_fragis;
+  ptrdiff_t                fragi;
+  int                      fragx;
+  int                      fragy;
+  int                      fragy0;
+  int                      fragy_end;
+  int                      nhfrags;
+  /*Compute the first and last fragment row of the current MCU for this
+     plane.*/
+  fplane=_dec->state.fplanes+_pli;
+  fragy0=_pipe->fragy0[_pli];
+  fragy_end=_pipe->fragy_end[_pli];
+  nhfrags=fplane->nhfrags;
+  pred_last=_pipe->pred_last[_pli];
+  frags=_dec->state.frags;
+  ncoded_fragis=0;
+  fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+  for(fragy=fragy0;fragy<fragy_end;fragy++){
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int coded;
+        int refi;
+        /*The TI compiler refuses to pipeline this if we put it in an if(coded)
+           block.
+          We can do the loads unconditionally, which helps move them earlier.
+          We do the store unconditionally too, because if we use a conditional
+           store, the compiler propagates the condition back to the operations
+           the store depended on, presumably to reduce cache pressure by
+           eliminating dead loads.
+          However, these loads are "free" in the cache sense, since reading the
+           coded flag brings in all four bytes anyway, and starting the loads
+           before we know the coded flag saves 6 cycles.*/
+        refi=frags[fragi].refi;
+        coded=frags[fragi].coded;
+        frags[fragi].dc=pred_last[refi]+=frags[fragi].dc&-coded;
+        ncoded_fragis+=coded;
+      }
+    }
+    else{
+      oc_fragment *u_frags;
+      int          l_ref;
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].refi;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        int refi;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else ur_ref=u_frags[fragi+1].refi;
+        refi=frags[fragi].refi;
+        if(frags[fragi].coded){
+          static const int OC_PRED_SCALE[16][2]={
+            {0x00000000,0x00000000},
+            {0x00000000,0x00000080},
+            {0x00800000,0x00000000},
+            {0x00000000,0x00000080},
+            {0x00000080,0x00000000},
+            {0x00000040,0x00000040},
+            {0x00000080,0x00000000},
+            {0xFF980074,0x00000074},
+            {0x00000000,0x00800000},
+            {0x00000000,0x0035004B},
+            {0x00400000,0x00400000},
+            {0x00000000,0x0035004B},
+            {0x00000080,0x00000000},
+            {0x00000000,0x0035004B},
+            {0x00180050,0x00180000},
+            {0xFF980074,0x00000074},
+          };
+          ogg_int16_t p0;
+          ogg_int16_t p1;
+          ogg_int16_t p2;
+          ogg_int16_t p3;
+          int         pred;
+          int         pflags;
+          /*29 cycles.*/
+          /*HACK: This p0 reference could potentially be out of bounds, but
+             because we know what allocator Leonora is using, we know it can't
+             segfault.*/
+          p0=u_frags[fragi-1].dc;
+          p1=u_frags[fragi].dc;
+          p2=u_frags[fragi+1].dc;
+          p3=frags[fragi-1].dc;
+          pflags=_cmpeq4(_packl4(_pack2(ur_ref,u_ref),_pack2(ul_ref,l_ref)),
+           _packl4(_pack2(refi,refi),_pack2(refi,refi)));
+          if(pflags==0)pred=pred_last[refi];
+          else{
+            pred=(_dotp2(_pack2(p0,p1),OC_PRED_SCALE[pflags][0])
+             +_dotp2(_pack2(p2,p3),OC_PRED_SCALE[pflags][1]))/128;
+            if((pflags&7)==7){
+              if(abs(pred-p1)>128)pred=p1;
+              else if(abs(pred-p3)>128)pred=p3;
+              else if(abs(pred-p0)>128)pred=p0;
+            }
+          }
+          pred_last[refi]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=refi;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  /*Also save the number of uncoded fragments so we know how many to copy.*/
+  _pipe->nuncoded_fragis[_pli]=
+   (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+#endif
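The first-row loop above leans on a branch-free update: with coded in {0,1}, the expression dc&-coded passes dc through when the fragment is coded and contributes zero otherwise, so the running predictor only advances for coded fragments while the loads and the store stay unconditional. A hedged standalone illustration of that idiom, with made-up values:

#include <stdio.h>

int main(void){
  int pred_last=42;            /* running DC predictor for one reference frame */
  int dc[4]={7,-3,5,9};        /* decoded DC residuals */
  int coded[4]={1,0,1,1};      /* 1 = fragment is coded */
  int i;
  for(i=0;i<4;i++){
    /* coded==1: -coded is all ones, so the residual passes through;
       coded==0: the mask is zero and pred_last is left untouched. */
    pred_last+=dc[i]&-coded[i];
    dc[i]=pred_last;           /* unconditional store, as in the loop above */
    printf("frag %d: dc=%d pred_last=%d\n",i,dc[i],pred_last);
  }
  return 0;
}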

+ 33 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xdec.h

@@ -0,0 +1,33 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_c64x_c64xdec_H)
+# define _c64x_c64xdec_H (1)
+# include "c64xint.h"
+
+# if defined(OC_C64X_ASM)
+#  define oc_dec_accel_init oc_dec_accel_init_c64x
+#  define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c64x
+# endif
+
+# include "../decint.h"
+
+void oc_dec_accel_init_c64x(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
+#endif

+ 447 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xfrag.c

@@ -0,0 +1,447 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+
+
+
+/*14 cycles.*/
+void oc_frag_copy_c64x(unsigned char *restrict _dst,
+ const unsigned char *restrict _src,int _ystride){
+  unsigned char *restrict       d2;
+  const unsigned char *restrict s2;
+  d2=_dst+_ystride;
+  s2=_src+_ystride;
+#define OC_ITER() \
+  do{ \
+    _amem8(_dst)=_mem8(_src); \
+    _dst+=2*_ystride; \
+    _src+=2*_ystride; \
+    _amem8(d2)=_mem8(s2); \
+    d2+=2*_ystride; \
+    s2+=2*_ystride; \
+  } \
+  while(0)
+  OC_ITER();
+  OC_ITER();
+  OC_ITER();
+  OC_ITER();
+#undef OC_ITER
+}
+
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  /*9 cycles per iteration.*/
+  for(fragii=0;fragii<_nfragis;fragii++){
+    const unsigned char *restrict src;
+    const unsigned char *restrict s2;
+    unsigned char       *restrict dst;
+    unsigned char       *restrict d2;
+    ptrdiff_t                     frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    dst=_dst_frame+frag_buf_off;
+    src=_src_frame+frag_buf_off;
+    d2=dst+_ystride;
+    s2=src+_ystride;
+#define OC_ITER() \
+  do{ \
+    _amem8(dst)=_amem8_const(src); \
+    dst+=2*_ystride; \
+    src+=2*_ystride; \
+    _amem8(d2)=_amem8_const(s2); \
+    d2+=2*_ystride; \
+    s2+=2*_ystride; \
+  } \
+  while(0)
+    OC_ITER();
+    OC_ITER();
+    OC_ITER();
+    OC_ITER();
+#undef OC_ITER
+  }
+}
+
+/*34 cycles.*/
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t _residue[64]){
+  int i;
+  for(i=0;i<8;i++){
+    long long ll;
+    int       x1;
+    int       y1;
+    int       x2;
+    int       y2;
+    ll=_amem8_const(_residue+i*8+0);
+    x1=_sadd2(_loll(ll),0x00800080);
+    y1=_sadd2(_hill(ll),0x00800080);
+    ll=_amem8_const(_residue+i*8+4);
+    x2=_sadd2(_loll(ll),0x00800080);
+    y2=_sadd2(_hill(ll),0x00800080);
+    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+    _dst+=_ystride;
+  }
+}
+
+/*41 cycles.*/
+void oc_frag_recon_inter_c64x(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t _residue[64]){
+  int i;
+  for(i=0;i<8;i++){
+    long long ll;
+    int       x1;
+    int       y1;
+    int       z1;
+    int       x2;
+    int       y2;
+    int       z2;
+    ll=_mem8_const(_src);
+    z1=_loll(ll);
+    z2=_hill(ll);
+    ll=_amem8_const(_residue+i*8+0);
+    x1=_sadd2(_unpklu4(z1),_loll(ll));
+    y1=_sadd2(_unpkhu4(z1),_hill(ll));
+    ll=_amem8_const(_residue+i*8+4);
+    x2=_sadd2(_unpklu4(z2),_loll(ll));
+    y2=_sadd2(_unpkhu4(z2),_hill(ll));
+    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+    _dst+=_ystride;
+    _src+=_ystride;
+  }
+}
+
+/*56 cycles.*/
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t _residue[64]){
+  int i;
+  for(i=0;i<8;i++){
+    long long ll;
+    int       a;
+    int       b;
+    int       c;
+    int       d;
+    int       x1;
+    int       y1;
+    int       z1;
+    int       x2;
+    int       y2;
+    int       z2;
+    ll=_mem8_const(_src1);
+    a=_loll(ll);
+    b=_hill(ll);
+    ll=_mem8_const(_src2);
+    c=_loll(ll);
+    d=_hill(ll);
+    ll=_amem8_const(_residue+i*8+0);
+    z1=~_avgu4(~a,~c);
+    x1=_sadd2(_unpklu4(z1),_loll(ll));
+    y1=_sadd2(_unpkhu4(z1),_hill(ll));
+    ll=_amem8_const(_residue+i*8+4);
+    z2=~_avgu4(~b,~d);
+    x2=_sadd2(_unpklu4(z2),_loll(ll));
+    y2=_sadd2(_unpkhu4(z2),_hill(ll));
+    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}
+
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    int         p;
+    long long   ll;
+    int         ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5;
+    ll=_itoll(_pack2(p,p),_pack2(p,p));
+    for(ci=0;ci<64;ci+=4)_amem8(_dct_coeffs+64+ci)=ll;
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_c64x(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
+       ystride,_dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+/*46 cycles.*/
+static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
+  int p0;
+  int p1;
+  int p2;
+  int p3;
+  int p4;
+  int p5;
+  int p6;
+  int p7;
+  int y;
+  _pix-=2;
+  /*Do all the loads now to avoid the compiler's inability to prove they're not
+     dependent on the stores later.*/
+  p0=_mem4(_pix+_ystride*0);
+  p1=_mem4(_pix+_ystride*1);
+  p2=_mem4(_pix+_ystride*2);
+  p3=_mem4(_pix+_ystride*3);
+  p4=_mem4(_pix+_ystride*4);
+  p5=_mem4(_pix+_ystride*5);
+  p6=_mem4(_pix+_ystride*6);
+  p7=_mem4(_pix+_ystride*7);
+  for(y=0;y<8;y+=4){
+    int f;
+    int a;
+    int b;
+    int u;
+    int v;
+    /*We could pack things right after the dot product, but delaying it
+       actually saves three cycles due to better instruction scheduling.*/
+    a=_dotpsu4(0x01FD03FF,p0)+3>>3;
+    b=_dotpsu4(0x01FD03FF,p1)+3>>3;
+    u=_dotpsu4(0x01FD03FF,p2)+3>>3;
+    v=_dotpsu4(0x01FD03FF,p3)+3>>3;
+    f=_packl4(_pack2(v,u),_pack2(b,a));
+    /*We split the results by sign and work with abs(f) here, since the C64x
+       signed-unsigned addition with unsigned saturation is only available for
+       16-bit operands.
+      For 8-bit operands, we have to emulate it with a saturated addition and a
+       saturated subtraction using separate unsigned values.
+      There's no direct support for 8-bit saturated subtraction, either, so we
+       have to emulate that as well, using either x-_minu4(x,y) or
+       ~_saddu4(~x,y), depending on which one schedules better.*/
+    f=_add4(0x80808080,f);
+    b=_minu4(0x80808080,f);
+    a=0x80808080-b;
+    b=f-b;
+    /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+    u=_saddu4(a,_ll);
+    v=_saddu4(b,_ll);
+    a=_saddu4(a,u);
+    b=_saddu4(b,v);
+    a=a-_minu4(a,u);
+    b=b-_minu4(b,v);
+    /*Apply the changes to the original pixels.*/
+    u=_pack2(p1>>8,p0>>8);
+    v=_pack2(p3>>8,p2>>8);
+    p1=_packl4(v,u);
+    p2=_packh4(v,u);
+    p1=_saddu4(~_saddu4(~p1,b),a);
+    p2=_saddu4(p2-_minu4(p2,a),b);
+    /*For unaligned short stores, we have to store byte by byte.
+      It's faster to do it explicitly than to use _mem2().*/
+    _pix[_ystride*0+1]=(unsigned char)p1;
+    _pix[_ystride*0+2]=(unsigned char)p2;
+    _pix[_ystride*1+1]=(unsigned char)(p1>>8);
+    _pix[_ystride*1+2]=(unsigned char)(p2>>8);
+    _pix[_ystride*2+1]=(unsigned char)(p1>>16);
+    _pix[_ystride*2+2]=(unsigned char)(p2>>16);
+    _pix[_ystride*3+1]=(unsigned char)(p1>>24);
+    _pix[_ystride*3+2]=(unsigned char)(p2>>24);
+    p0=p4;
+    p1=p5;
+    p2=p6;
+    p3=p7;
+    _pix+=4*_ystride;
+  }
+}
+
+/*38 cycles.*/
+static void loop_filter_v(unsigned char * restrict _pix,int _ystride,int _ll){
+  long long ll;
+  int       p0;
+  int       p1;
+  int       p2;
+  int       p3;
+  int       p4;
+  int       p5;
+  int       p6;
+  int       p7;
+  int       a1;
+  int       b1;
+  int       f1;
+  int       m1;
+  int       u1;
+  int       v1;
+  int       a2;
+  int       b2;
+  int       f2;
+  int       m2;
+  int       u2;
+  int       v2;
+  /*Do all the loads now to avoid the compiler's inability to prove they're not
+     dependent on the stores later.*/
+  ll=_amem8(_pix-_ystride*2);
+  p0=_loll(ll);
+  p4=_hill(ll);
+  ll=_amem8(_pix-_ystride*1);
+  p1=_loll(ll);
+  p5=_hill(ll);
+  ll=_amem8(_pix+_ystride*0);
+  p2=_loll(ll);
+  p6=_hill(ll);
+  ll=_amem8(_pix+_ystride*1);
+  p3=_loll(ll);
+  p7=_hill(ll);
+  /*I can't find a way to put the rest in a loop that the compiler thinks is
+     unrollable, so instead it's unrolled manually.*/
+  /*This first part is based on the transformation
+    f = -(3*(p2-p1)+p0-p3+4>>3)
+      = -(3*(p2+255-p1)+(p0+255-p3)+4-1020>>3)
+      = -(3*(p2+~p1)+(p0+~p3)-1016>>3)
+      = 127-(3*(p2+~p1)+(p0+~p3)>>3)
+      = 128+~(3*(p2+~p1)+(p0+~p3)>>3) (mod 256).
+    Although _avgu4(a,b) = (a+b+1>>1) (biased up), we rely heavily on the
+     fact that ~_avgu4(~a,~b) = (a+b>>1) (biased down).*/
+  /*We need this first average both biased up and biased down.*/
+  u1=~_avgu4(~p1,p2);
+  v1=_avgu4(p1,~p2);
+  /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
+  m1=_sub4(u1,v1);
+  a1=m1^_avgu4(m1^~p0,m1^p3);
+  f1=_avgu4(_avgu4(a1,u1),v1);
+  /*Instead of removing the bias by 128, we use it to split f by sign, since
+     the C64x signed-unsigned addition with unsigned saturation is only
+     available for 16-bit operands.
+    For 8-bit operands, we have to emulate it with a saturated addition and a
+     saturated subtraction using separate unsigned values.
+    There's no direct support for 8-bit saturated subtraction, either, so we
+     have to emulate that as well, using either x-_minu4(x,y) or
+     ~_saddu4(~x,y), depending on which one schedules better.*/
+  b1=_minu4(0x80808080,f1);
+  a1=0x80808080-b1;
+  b1=f1-b1;
+  /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+  u1=_saddu4(a1,_ll);
+  v1=_saddu4(b1,_ll);
+  a1=_saddu4(a1,u1);
+  b1=_saddu4(b1,v1);
+  a1=a1-_minu4(a1,u1);
+  b1=b1-_minu4(b1,v1);
+  /*Apply the changes to the original pixels.*/
+  p1=_saddu4(p1-_minu4(p1,b1),a1);
+  p2=_saddu4(p2-_minu4(p2,a1),b1);
+  /*We need this first average both biased up and biased down.*/
+  u2=~_avgu4(~p5,p6);
+  v2=_avgu4(p5,~p6);
+  /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
+  m2=_sub4(u2,v2);
+  a2=m2^_avgu4(m2^~p4,m2^p7);
+  f2=_avgu4(_avgu4(a2,u2),v2);
+  /*Instead of removing the bias by 128, we use it to split f by sign.*/
+  b2=_minu4(0x80808080,f2);
+  a2=0x80808080-b2;
+  b2=f2-b2;
+  /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+  u2=_saddu4(a2,_ll);
+  v2=_saddu4(b2,_ll);
+  a2=_saddu4(a2,u2);
+  b2=_saddu4(b2,v2);
+  a2=a2-_minu4(a2,u2);
+  b2=b2-_minu4(b2,v2);
+  /*Apply the changes to the original pixels.*/
+  p5=_saddu4(p5-_minu4(p5,b2),a2);
+  p6=_saddu4(p6-_minu4(p6,a2),b2);
+  /*Write out the results.*/
+  _amem8(_pix-_ystride)=_itoll(p5,p1);
+  _amem8(_pix)=_itoll(p6,p2);
+}
+
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit){
+  int ll;
+  ll=_flimit<<1;
+  ll=_pack2(ll,ll);
+  ll=~_spacku4(ll,ll);
+  *((int *)_bv)=ll;
+}
+
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  int                      ll;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  ll=*((int *)_bv);
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0)loop_filter_h(ref,ystride,ll);
+        if(fragi0>fragi_top)loop_filter_v(ref,ystride,ll);
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          loop_filter_h(ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          loop_filter_v(ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
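The comments above rely on the identity that complementing both inputs of a round-up byte average and then complementing the result gives the round-down average: ~avg(~a,~b) == (a+b)>>1 when avg(a,b) == (a+b+1)>>1 per lane, as _avgu4, PAVGB and VRHADD do. A hedged standalone check of that identity over all byte pairs; avg_up is an illustrative scalar stand-in for one lane:

#include <stdio.h>

/* One byte lane of _avgu4: rounding-up average. */
static unsigned avg_up(unsigned a,unsigned b){return (a+b+1)>>1;}

int main(void){
  unsigned a,b;
  for(a=0;a<256;a++){
    for(b=0;b<256;b++){
      unsigned down=0xFF^avg_up(0xFF^a,0xFF^b);   /* ~avg(~a,~b) on one lane */
      if(down!=((a+b)>>1)){
        printf("mismatch at a=%u b=%u\n",a,b);
        return 1;
      }
    }
  }
  printf("~avg(~a,~b) == (a+b)>>1 holds for all byte pairs\n");
  return 0;
}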

+ 415 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xidct.c

@@ -0,0 +1,415 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+#include "dct.h"
+
+#define OC_C1S7D ((OC_C1S7<<16)|(OC_C1S7&0xFFFF))
+#define OC_C2S6D ((OC_C2S6<<16)|(OC_C2S6&0xFFFF))
+#define OC_C3S5D ((OC_C3S5<<16)|(OC_C3S5&0xFFFF))
+#define OC_C4S4D ((OC_C4S4<<16)|(OC_C4S4&0xFFFF))
+#define OC_C5S3D ((OC_C5S3<<16)|(OC_C5S3&0xFFFF))
+#define OC_C6S2D ((OC_C6S2<<16)|(OC_C6S2&0xFFFF))
+#define OC_C7S1D ((OC_C7S1<<16)|(OC_C7S1&0xFFFF))
+
+/*Various building blocks for the iDCT implementations.
+  These are done in macros instead of functions so that we can use all local
+   variables, which avoids leaving the compiler to try to sort out memory
+   reference dependencies.*/
+
+/*Load two rows into x0...x7.*/
+#define OC_IDCT8x2_LOAD8(_x) \
+  do{ \
+    long long ll; \
+    ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+    x0=_loll(ll); \
+    x1=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+    x2=_loll(ll); \
+    x3=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
+    x4=_loll(ll); \
+    x5=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
+    x6=_loll(ll); \
+    x7=_hill(ll); \
+  } \
+  while(0)
+
+/*Load two rows into x0...x3.
+  Uses ll as a temporary.*/
+#define OC_IDCT8x2_LOAD4(_x) \
+  do{ \
+    long long ll; \
+    ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+    x0=_loll(ll); \
+    x1=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+    x2=_loll(ll); \
+    x3=_hill(ll); \
+  } \
+  while(0)
+
+/*Load two rows into x0...x1.*/
+#define OC_IDCT8x2_LOAD2(_x) \
+  do{ \
+    long long ll; \
+    ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+    x0=_loll(ll); \
+    x1=_hill(ll); \
+  } \
+  while(0)
+
+/*Load two columns into x0...x1.*/
+#define OC_IDCT8x2_LOAD2T(_x) \
+  do{ \
+    x0=_amem4_const((_x)+(0<<3)); \
+    x1=_amem4_const((_x)+(1<<3)); \
+  } \
+  while(0)
+
+/*Transform x0...x7 into t0...t7.*/
+#define OC_IDCT8x2() \
+  do{ \
+    long long ll; \
+    int       a; \
+    int       b; \
+    /*Stage 1:*/ \
+    ll=_addsub2(x0,x4); \
+    a=_hill(ll); \
+    b=_loll(ll); \
+    t0=_packh2(_mpyhus(OC_C4S4D,a),_mpyus(OC_C4S4D,a)); \
+    t1=_packh2(_mpyhus(OC_C4S4D,b),_mpyus(OC_C4S4D,b)); \
+    ll=_mpy2ll(OC_C6S2D,x2); \
+    a=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C2S6D,x6); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x6); \
+    t2=_sub2(a,b); \
+    ll=_mpy2ll(OC_C2S6D,x2); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+    ll=_mpy2ll(OC_C6S2D,x6); \
+    b=_packh2(_hill(ll),_loll(ll)); \
+    t3=_add2(a,b); \
+    ll=_mpy2ll(OC_C7S1D,x1); \
+    a=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C1S7D,x7); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x7); \
+    t4=_sub2(a,b); \
+    ll=_mpy2ll(OC_C3S5D,x5); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+    ll=_mpy2ll(OC_C5S3D,x3); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    t5=_sub2(a,b); \
+    ll=_mpy2ll(OC_C5S3D,x5); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+    ll=_mpy2ll(OC_C3S5D,x3); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    t6=_add2(a,b); \
+    ll=_mpy2ll(OC_C1S7D,x1); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+    ll=_mpy2ll(OC_C7S1D,x7); \
+    b=_packh2(_hill(ll),_loll(ll)); \
+    t7=_add2(a,b); \
+    /*Stage 2:*/ \
+    ll=_addsub2(t4,t5); \
+    t4=_hill(ll); \
+    b=_loll(ll); \
+    ll=_mpy2ll(OC_C4S4D,b); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+    ll=_addsub2(t7,t6); \
+    t7=_hill(ll); \
+    b=_loll(ll); \
+    ll=_mpy2ll(OC_C4S4D,b); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+    /*Stage 3:*/ \
+    ll=_addsub2(t0,t3); \
+    t0=_hill(ll); \
+    t3=_loll(ll); \
+    ll=_addsub2(t1,t2); \
+    t1=_hill(ll); \
+    t2=_loll(ll); \
+    ll=_addsub2(t6,t5); \
+    t6=_hill(ll); \
+    t5=_loll(ll); \
+  } \
+  while(0)
+
+/*Transform x0...x3 into t0...t7, assuming x4...x7 are zero.*/
+#define OC_IDCT8x2_4() \
+  do{ \
+    long long ll; \
+    int       a; \
+    /*Stage 1:*/ \
+    ll=_mpy2ll(OC_C4S4D,x0); \
+    t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+    t1=t0; \
+    ll=_mpy2ll(OC_C6S2D,x2); \
+    t2=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C2S6D,x2); \
+    t3=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+    ll=_mpy2ll(OC_C7S1D,x1); \
+    t4=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C5S3D,x3); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    ll=_mpy2ll(OC_C3S5D,x3); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    ll=_mpy2ll(OC_C1S7D,x1); \
+    t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+    /*Stage 2:*/ \
+    ll=_addsub2(t4,t5); \
+    t4=_loll(ll); \
+    a=_hill(ll); \
+    ll=_mpy2ll(OC_C4S4D,a); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+    ll=_addsub2(t7,t6); \
+    t7=_hill(ll); \
+    a=_loll(ll); \
+    ll=_mpy2ll(OC_C4S4D,a); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+    /*Stage 3:*/ \
+    ll=_addsub2(t0,t3); \
+    t0=_hill(ll); \
+    t3=_loll(ll); \
+    ll=_addsub2(t1,t2); \
+    t1=_hill(ll); \
+    t2=_loll(ll); \
+    ll=_addsub2(t6,t5); \
+    t6=_hill(ll); \
+    t5=_loll(ll); \
+  } \
+  while(0)
+
+/*Transform x0...x1 into t0...t7, assuming x2...x7 are zero.*/
+#define OC_IDCT8x2_2() \
+  do{ \
+    long long ll; \
+    /*Stage 1:*/ \
+    ll=_mpy2ll(OC_C4S4D,x0); \
+    t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+    t1=t0; \
+    ll=_mpy2ll(OC_C7S1D,x1); \
+    t4=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C1S7D,x1); \
+    t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+    /*Stage 2:*/ \
+    ll=_mpy2ll(OC_C4S4D,t4); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),t4); \
+    ll=_mpy2ll(OC_C4S4D,t7); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),t7); \
+    /*Stage 3:*/ \
+    t3=t0; \
+    t2=t1; \
+    ll=_addsub2(t6,t5); \
+    t6=_hill(ll); \
+    t5=_loll(ll); \
+  } \
+  while(0)
+
+/*Finish transforming t0...t7 and store two rows.*/
+#define OC_IDCT8x2_STORE(_y) \
+  do{ \
+    long long ll; \
+    int       a; \
+    int       b; \
+    int       c; \
+    int       d; \
+    /*Stage 4:*/ \
+    ll=_addsub2(t0,t7); \
+    a=_hill(ll); \
+    c=_loll(ll); \
+    ll=_addsub2(t1,t6); \
+    b=_hill(ll); \
+    d=_loll(ll); \
+    ll=_dpack2(b,a); \
+    _amem4((_y)+0)=_loll(ll); \
+    _amem4((_y)+8)=_hill(ll); \
+    ll=_dpack2(c,d); \
+    _amem4((_y)+6)=_loll(ll); \
+    _amem4((_y)+14)=_hill(ll); \
+    ll=_addsub2(t2,t5); \
+    a=_hill(ll); \
+    c=_loll(ll); \
+    ll=_addsub2(t3,t4); \
+    b=_hill(ll); \
+    d=_loll(ll); \
+    ll=_dpack2(b,a); \
+    _amem4((_y)+2)=_loll(ll); \
+    _amem4((_y)+10)=_hill(ll); \
+    ll=_dpack2(c,d); \
+    _amem4((_y)+4)=_loll(ll); \
+    _amem4((_y)+12)=_hill(ll); \
+  } \
+  while(0)
+
+/*Finish transforming t0...t7 and store two columns.*/
+#define OC_IDCT8x2_STORET(_y) \
+  do{ \
+    long long ll; \
+    /*Stage 4:*/ \
+    ll=_addsub2(t0,t7); \
+    _amem4((_y)+(0<<3))=_hill(ll); \
+    _amem4((_y)+(7<<3))=_loll(ll); \
+    ll=_addsub2(t1,t6); \
+    _amem4((_y)+(1<<3))=_hill(ll); \
+    _amem4((_y)+(6<<3))=_loll(ll); \
+    ll=_addsub2(t2,t5); \
+    _amem4((_y)+(2<<3))=_hill(ll); \
+    _amem4((_y)+(5<<3))=_loll(ll); \
+    ll=_addsub2(t3,t4); \
+    _amem4((_y)+(3<<3))=_hill(ll); \
+    _amem4((_y)+(4<<3))=_loll(ll); \
+  } \
+  while(0)
+
+/*Finish transforming t0...t7, round and scale, and store two columns.*/
+#define OC_IDCT8x2_ROUND_STORET(_y) \
+  do{ \
+    long long ll; \
+    /*Stage 4:*/ \
+    /*Adjust for the scale factor.*/ \
+    ll=_addsub2(t0,t7); \
+    _amem4((_y)+(0<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(7<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+    ll=_addsub2(t1,t6); \
+    _amem4((_y)+(1<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(6<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+    ll=_addsub2(t2,t5); \
+    _amem4((_y)+(2<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(5<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+    ll=_addsub2(t3,t4); \
+    _amem4((_y)+(3<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(4<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+  } \
+  while(0)
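As a side note on the rounding above (a scalar sketch, not part of the patch): the packed constant 0x00080008 adds a bias of 8 to each 16-bit lane before the arithmetic shift by 4, i.e. a rounded divide-by-16 per lane that adjusts for the scale factor. Something like:

#include <stdint.h>
#include <stdio.h>

/*Illustrative scalar version of the packed rounding above: add a bias of 8,
   then shift right by 4 (a rounded divide by 16).
  Assumes arithmetic right shift of negative values, as the _shr2 intrinsic
   guarantees on the C64x.*/
static int16_t round_shift4(int16_t _v){
  return (int16_t)((_v+8)>>4);
}

int main(void){
  printf("%d %d\n",round_shift4(100),round_shift4(-100));
  return 0;
}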
+
+/*196 cycles.*/
+static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         x0;
+  int         x1;
+  int         x2;
+  int         x3;
+  int         x4;
+  int         x5;
+  int         x6;
+  int         x7;
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         i;
+  /*Transform rows of x into columns of w.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD8(_x+i*8);
+    _amem8(_x+i*8)=0LL;
+    _amem8(_x+i*8+4)=0LL;
+    _amem8(_x+i*8+8)=0LL;
+    _amem8(_x+i*8+12)=0LL;
+    OC_IDCT8x2();
+    OC_IDCT8x2_STORET(w+i);
+  }
+  /*Transform rows of w into columns of y.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD8(w+i*8);
+    OC_IDCT8x2();
+    OC_IDCT8x2_ROUND_STORET(_y+i);
+  }
+}
+
+/*106 cycles.*/
+static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         x0;
+  int         x1;
+  int         x2;
+  int         x3;
+  int         i;
+  /*Transform rows of x into columns of w.*/
+  OC_IDCT8x2_LOAD4(_x);
+  OC_IDCT8x2_4();
+  OC_IDCT8x2_STORET(w);
+  OC_IDCT8x2_LOAD2(_x+16);
+  _amem8(_x)=0LL;
+  _amem8(_x+8)=0LL;
+  _amem4(_x+16)=0;
+  _amem4(_x+24)=0;
+  OC_IDCT8x2_2();
+  OC_IDCT8x2_STORET(w+2);
+  /*Transform rows of w into columns of y.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD4(w+i*8);
+    OC_IDCT8x2_4();
+    OC_IDCT8x2_ROUND_STORET(_y+i);
+  }
+}
+
+#if 0
+/*This used to compile to something faster (88 cycles), but no longer, and I'm
+   not sure what changed to cause this.
+  In any case, it's barely an advantage over the 10-coefficient version, and is
+   now hardly worth the icache space.*/
+/*95 cycles.*/
+static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         x0;
+  int         x1;
+  int         i;
+  /*Transform rows of x into rows of w.*/
+  for(i=0;i<2;i+=2){
+    OC_IDCT8x2_LOAD2(_x+i*8);
+    OC_IDCT8x2_2();
+    OC_IDCT8x2_STORE(w+i*8);
+  }
+  _amem4(_x)=0;
+  _amem4(_x+8)=0;
+  /*Transform columns of w into columns of y.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD2T(w+i);
+    OC_IDCT8x2_2();
+    OC_IDCT8x2_ROUND_STORET(_y+i);
+  }
+}
+#endif
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to orthonormal
+   version of the transform.*/
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
+  else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
+  else oc_idct8x8_slow_c64x(_y,_x);
+}
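For readers unfamiliar with the convention, a reference-only floating-point sketch of the same operation (not part of the patch, and not the fixed-point arithmetic used above): a separable, orthonormal 8x8 inverse DCT-II whose final divide by 4 undoes the input scaling the comment mentions.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
# define M_PI (3.14159265358979323846)
#endif

/*Reference sketch only: orthonormal separable 8x8 inverse DCT-II in doubles.
  The input is assumed to be scaled by 4, matching the convention above, so
   the result is divided by 4 at the end.*/
static void idct8x8_ref(double _y[64],const double _x[64]){
  double w[64];
  int    i;
  int    j;
  int    k;
  /*Inverse transform the rows.*/
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    double s;
    s=0;
    for(k=0;k<8;k++){
      s+=(k==0?sqrt(0.125):0.5)*_x[i*8+k]*cos((2*j+1)*k*M_PI/16);
    }
    w[i*8+j]=s;
  }
  /*Inverse transform the columns and undo the scale factor of 4.*/
  for(j=0;j<8;j++)for(i=0;i<8;i++){
    double s;
    s=0;
    for(k=0;k<8;k++){
      s+=(k==0?sqrt(0.125):0.5)*w[k*8+j]*cos((2*i+1)*k*M_PI/16);
    }
    _y[i*8+j]=s*0.25;
  }
}

int main(void){
  double x[64]={0};
  double y[64];
  /*DC coefficient of 8 in the orthonormal transform (a flat block of ones),
     scaled by 4 per the convention above.*/
  x[0]=32;
  idct8x8_ref(y,x);
  printf("%f %f\n",y[0],y[63]);  /*Both should be close to 1.*/
  return 0;
}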

+ 67 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xint.h

@@ -0,0 +1,67 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_c64x_c64xint_H)
+# define _c64x_c64xint_H (1)
+# include "../internal.h"
+
+# if defined(OC_C64X_ASM)
+#  define oc_state_accel_init oc_state_accel_init_c64x
+#  define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_c64x(_dst,_src,_ystride)
+#  define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_c64x(_dst_frame,_src_frame,_ystride, \
+   _fragis,_nfragis,_frag_buf_offs)
+#  define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+  oc_frag_recon_intra_c64x(_dst,_dst_ystride,_residue)
+#  define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c64x(_dst,_src,_ystride,_residue)
+#  define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_c64x(_dst,_src1,_src2,_ystride,_residue)
+#  define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  oc_idct8x8_c64x(_y,_x,_last_zzi)
+#  define oc_state_frag_recon oc_state_frag_recon_c64x
+#  define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_c64x(_bv,_flimit)
+#  define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c64x
+#  define oc_restore_fpu(_state) do{}while(0)
+# endif
+
+# include "../state.h"
+
+void oc_state_accel_init_c64x(oc_theora_state *_state);
+
+void oc_frag_copy_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit);
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif

+ 39 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xstate.c

@@ -0,0 +1,39 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#include "c64xint.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_state_accel_init_c64x(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy=oc_frag_copy_c64x;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c64x;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c64x;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c64x;
+  _state->opt_vtable.idct8x8=oc_idct8x8_c64x;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c64x;
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c64x;
+  _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c64x;
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   oc_state_loop_filter_frag_rows_c64x;
+  _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+# endif
+}
+
+#endif

+ 974 - 0
modules/theoraplayer/native/theora/lib/collect.c

@@ -0,0 +1,974 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include <string.h>
+#include "collect.h"
+
+#if defined(OC_COLLECT_METRICS)
+
+int              OC_HAS_MODE_METRICS;
+double           OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+double           OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+const char      *OC_MODE_METRICS_FILENAME="modedec.stats";
+
+void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _s,int _q,int _r,double _d){
+  if(_metrics->w>0){
+    double ds;
+    double dq;
+    double dr;
+    double dd;
+    double ds2;
+    double dq2;
+    double s2;
+    double sq;
+    double q2;
+    double sr;
+    double qr;
+    double sd;
+    double qd;
+    double s2q;
+    double sq2;
+    double w;
+    double wa;
+    double rwa;
+    double rwa2;
+    double rwb;
+    double rwb2;
+    double rw2;
+    double rw3;
+    double rw4;
+    wa=_metrics->w;
+    ds=_s-_metrics->s/wa;
+    dq=_q-_metrics->q/wa;
+    dr=_r-_metrics->r/wa;
+    dd=_d-_metrics->d/wa;
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2=_metrics->s2;
+    sq=_metrics->sq;
+    q2=_metrics->q2;
+    sr=_metrics->sr;
+    qr=_metrics->qr;
+    sd=_metrics->sd;
+    qd=_metrics->qd;
+    s2q=_metrics->s2q;
+    sq2=_metrics->sq2;
+    w=wa+_w;
+    rwa=wa/w;
+    rwb=_w/w;
+    rwa2=rwa*rwa;
+    rwb2=rwb*rwb;
+    rw2=wa*rwb;
+    rw3=rw2*(rwa2-rwb2);
+    rw4=_w*rwa2*rwa2+wa*rwb2*rwb2;
+    _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*rwb
+     +(ds2*q2+4*ds*dq*sq+dq2*s2)*rwb2+ds2*dq2*rw4;
+    _metrics->s2q+=(-2*ds*sq-dq*s2)*rwb+ds2*dq*rw3;
+    _metrics->sq2+=(-ds*q2-2*dq*sq)*rwb+ds*dq2*rw3;
+    _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*rwb+ds*dq*dr*rw3;
+    _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*rwb+ds*dq*dd*rw3;
+    _metrics->s2+=ds2*rw2;
+    _metrics->sq+=ds*dq*rw2;
+    _metrics->q2+=dq2*rw2;
+    _metrics->sr+=ds*dr*rw2;
+    _metrics->qr+=dq*dr*rw2;
+    _metrics->r2+=dr*dr*rw2;
+    _metrics->sd+=ds*dd*rw2;
+    _metrics->qd+=dq*dd*rw2;
+    _metrics->d2+=dd*dd*rw2;
+  }
+  _metrics->w+=_w;
+  _metrics->s+=_s*_w;
+  _metrics->q+=_q*_w;
+  _metrics->r+=_r*_w;
+  _metrics->d+=_d*_w;
+}
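For intuition, the incremental update above in its simplest form (illustrative only; toy_metrics and its fields are made-up names): a weighted mean and a central second moment maintained without storing samples, i.e. the weighted analogue of Welford's algorithm. oc_mode_metrics_add() extends the same idea to the third- and fourth-order (co-)moments of s, q, r and d.

#include <stdio.h>

/*Illustrative only: weighted incremental mean and central second moment.*/
typedef struct{
  double w;  /*total weight*/
  double s;  /*weighted sum of s*/
  double s2; /*central second moment of s*/
}toy_metrics;

static void toy_metrics_add(toy_metrics *_m,double _w,double _s){
  if(_m->w>0){
    double ds;
    ds=_s-_m->s/_m->w;
    _m->s2+=ds*ds*(_m->w*_w/(_m->w+_w));
  }
  _m->w+=_w;
  _m->s+=_s*_w;
}

int main(void){
  toy_metrics m={0,0,0};
  toy_metrics_add(&m,1,2);
  toy_metrics_add(&m,1,4);
  toy_metrics_add(&m,2,6);
  printf("mean=%g var=%g\n",m.s/m.w,m.s2/m.w);  /*mean=4.5 var=2.75*/
  return 0;
}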
+
+void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n){
+  int i;
+  /*Find a non-empty set of metrics.*/
+  for(i=0;i<_n&&_src[i].w==0;i++);
+  if(i>=_n){
+    memset(_dst,0,sizeof(*_dst));
+    return;
+  }
+  memcpy(_dst,_src+i,sizeof(*_dst));
+  /*And iterate over the remaining non-empty sets of metrics.*/
+  for(i++;i<_n;i++)if(_src[i].w!=0){
+    double ds;
+    double dq;
+    double dr;
+    double dd;
+    double ds2;
+    double dq2;
+    double s2a;
+    double s2b;
+    double sqa;
+    double sqb;
+    double q2a;
+    double q2b;
+    double sra;
+    double srb;
+    double qra;
+    double qrb;
+    double sda;
+    double sdb;
+    double qda;
+    double qdb;
+    double s2qa;
+    double s2qb;
+    double sq2a;
+    double sq2b;
+    double w;
+    double wa;
+    double wb;
+    double rwa;
+    double rwb;
+    double rwa2;
+    double rwb2;
+    double rw2;
+    double rw3;
+    double rw4;
+    wa=_dst->w;
+    wb=_src[i].w;
+    ds=_src[i].s/wb-_dst->s/wa;
+    dq=_src[i].q/wb-_dst->q/wa;
+    dr=_src[i].r/wb-_dst->r/wa;
+    dd=_src[i].d/wb-_dst->d/wa;
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2a=_dst->s2;
+    sqa=_dst->sq;
+    q2a=_dst->q2;
+    sra=_dst->sr;
+    qra=_dst->qr;
+    sda=_dst->sd;
+    qda=_dst->qd;
+    s2qa=_dst->s2q;
+    sq2a=_dst->sq2;
+    s2b=_src[i].s2;
+    sqb=_src[i].sq;
+    q2b=_src[i].q2;
+    srb=_src[i].sr;
+    qrb=_src[i].qr;
+    sdb=_src[i].sd;
+    qdb=_src[i].qd;
+    s2qb=_src[i].s2q;
+    sq2b=_src[i].sq2;
+    w=wa+wb;
+    if(w==0)rwa=rwb=0;
+    else{
+      rwa=wa/w;
+      rwb=wb/w;
+    }
+    rwa2=rwa*rwa;
+    rwb2=rwb*rwb;
+    rw2=wa*rwb;
+    rw3=rw2*(rwa2-rwb2);
+    rw4=wb*rwa2*rwa2+wa*rwb2*rwb2;
+    /*
+    (1,1,1) ->
+     (0,0,0)#
+     (1,0,0) C(1,1)*C(1,0)*C(1,0)->  d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1})
+     (0,1,0) C(1,0)*C(1,1)*C(1,0)->  d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1})
+     (0,0,1) C(1,0)*C(1,0)*C(1,1)->  d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0})
+     (1,1,0)*
+     (1,0,1)*
+     (0,1,1)*
+     (1,1,1) C(1,1)*C(1,1)*C(1,1)->  d^{1,1,1}*(rwa^3*wb-rwb^3*wa)
+    (2,1) ->
+     (0,0)#
+     (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1})
+     (0,1) C(2,0)*C(1,1)->  d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0})
+     (2,0)*
+     (1,1)*
+     (2,1) C(2,2)*C(1,1)->  d^{2,1}*(rwa^3*wb-rwb^3*wa)
+    (2,2) ->
+     (0,0)#
+     (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2})
+     (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1})
+     (2,0) C(2,2)*C(2,0)->  d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2})
+     (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1})
+     (0,2) C(2,0)*C(2,2)->  d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0})
+     (1,2)*
+     (2,1)*
+     (2,2) C(2,2)*C(2,2)->  d^{2,2}*(rwa^4*wb+rwb^4*wa)
+    */
+    _dst->s2q2+=_src[i].s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa))
+     +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa)
+     +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4;
+    _dst->s2q+=_src[i].s2q+2*ds*(rwa*sqb-rwb*sqa)
+     +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3;
+    _dst->sq2+=_src[i].sq2+ds*(rwa*q2b-rwb*q2a)
+     +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3;
+    _dst->sqr+=_src[i].sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra)
+     +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3;
+    _dst->sqd+=_src[i].sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda)
+     +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3;
+    _dst->s2+=_src[i].s2+ds2*rw2;
+    _dst->sq+=_src[i].sq+ds*dq*rw2;
+    _dst->q2+=_src[i].q2+dq2*rw2;
+    _dst->sr+=_src[i].sr+ds*dr*rw2;
+    _dst->qr+=_src[i].qr+dq*dr*rw2;
+    _dst->r2+=_src[i].r2+dr*dr*rw2;
+    _dst->sd+=_src[i].sd+ds*dd*rw2;
+    _dst->qd+=_src[i].qd+dq*dd*rw2;
+    _dst->d2+=_src[i].d2+dd*dd*rw2;
+    _dst->w+=_src[i].w;
+    _dst->s+=_src[i].s;
+    _dst->q+=_src[i].q;
+    _dst->r+=_src[i].r;
+    _dst->d+=_src[i].d;
+  }
+}
+
+/*Adjust a single corner of a set of metric bins to minimize the squared
+   prediction error of R and D.
+  Each bin is assumed to cover a quad like so:
+    (s0,q0)    (s1,q0)
+       A----------B
+       |          |
+       |          |
+       |          |
+       |          |
+       C----------Z
+    (s0,q1)    (s1,q1)
+  The values A, B, and C are fixed, and Z is the free parameter.
+  Then, for example, R_i is predicted via bilinear interpolation as
+    x_i=(s_i-s0)/(s1-s0)
+    y_i=(q_i-q0)/(q1-q0)
+    dRds1_i=A+(B-A)*x_i
+    dRds2_i=C+(Z-C)*x_i
+    R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i
+  To find the Z that minimizes the squared prediction error over i, this can
+   be rewritten as
+    R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z
+  Letting X={...,x_i*y_i,...}^T and
+   Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T,
+   the optimal Z is given by Z=(X^T.Y)/(X^T.X).
+  Now, we need to compute these dot products without actually storing data for
+   each sample.
+  Starting with X^T.X, we have
+   X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2).
+  Expanding the interior of the sum in a monomial basis of s_i and q_i gives
+    s0^2*q0^2  *(1)
+     -2*s0*q0^2*(s_i)
+     -2*s0^2*q0*(q_i)
+     +q0^2     *(s_i^2)
+     +4*s0*q0  *(s_i*q_i)
+     +s0^2     *(q_i^2)
+     -2*q0     *(s_i^2*q_i)
+     -2*s0     *(s_i*q_i^2)
+     +1        *(s_i^2*q_i^2).
+  However, computing things directly in this basis leads to gross numerical
+   errors, as most of the terms will have similar size and destructive
+   cancellation results.
+  A much better basis is the central (co-)moment basis:
+    {1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2,
+     (s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2},
+   where sbar and qbar are the average s and q values over the bin,
+   respectively.
+  In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is
+    ds^2*dq^2*(1)
+     +dq^2   *((s_i-sbar)^2)
+     +4*ds*dq*((s_i-sbar)*(q_i-qbar))
+     +ds^2   *((q_i-qbar)^2)
+     +2*dq   *((s_i-sbar)^2*(q_i-qbar))
+     +2*ds   *((s_i-sbar)*(q_i-qbar)^2)
+     +1      *((s_i-sbar)^2*(q_i-qbar)^2).
+  With these expressions in the central (co-)moment bases, all we need to do
+   is compute sums over the (co-)moment terms, which can be done
+   incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()),
+   with no need to store the individual samples.
+  Now, for X^T.Y, we have
+    X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0)
+     -((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/
+     ((s1-s0)*(q1-q0)),
+   or, rewriting the constants to simplify notation,
+    X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0)
+     +C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)).
+  Again, converting to the central (co-)moment basis, the interior of the
+   above sum is
+    ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq)  *(1)
+     +(C1*dq+C3*dq^2)                     *((s_i-sbar)^2)
+     +(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar))
+     +(C2*ds+C3*ds^2)                     *((q_i-qbar)^2)
+     +dq                                  *((s_i-sbar)*(r_i-rbar))
+     +ds                                  *((q_i-qbar)*(r_i-rbar))
+     +(C1+2*C3*dq)                        *((s_i-sbar)^2*(q_i-qbar))
+     +(C2+2*C3*ds)                        *((s_i-sbar)*(q_i-qbar)^2)
+     +1                                   *((s_i-sbar)*(q_i-qbar)*(r_i-rbar))
+     +C3                                  *((s_i-sbar)^2*(q_i-qbar)^2).
+  You might think it would be easier (if perhaps slightly less robust) to
+   accumulate terms directly around s0 and q0.
+  However, we update each corner of the bins in turn, so we would have to
+   change basis to move the sums from corner to corner anyway.*/
+double oc_mode_metrics_solve(double *_r,double *_d,
+ const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
+ const int *_q0,const int *_q1,
+ const double *_ra,const double *_rb,const double *_rc,
+ const double *_da,const double *_db,const double *_dc,int _n){
+  double xx;
+  double rxy;
+  double dxy;
+  double wt;
+  int i;
+  xx=rxy=dxy=wt=0;
+  for(i=0;i<_n;i++)if(_metrics[i].w>0){
+    double s10;
+    double q10;
+    double sq10;
+    double ds;
+    double dq;
+    double ds2;
+    double dq2;
+    double r;
+    double d;
+    double s2;
+    double sq;
+    double q2;
+    double sr;
+    double qr;
+    double sd;
+    double qd;
+    double s2q;
+    double sq2;
+    double sqr;
+    double sqd;
+    double s2q2;
+    double c0;
+    double c1;
+    double c2;
+    double c3;
+    double w;
+    w=_metrics[i].w;
+    wt+=w;
+    s10=_s1[i]-_s0[i];
+    q10=_q1[i]-_q0[i];
+    sq10=s10*q10;
+    ds=_metrics[i].s/w-_s0[i];
+    dq=_metrics[i].q/w-_q0[i];
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2=_metrics[i].s2;
+    sq=_metrics[i].sq;
+    q2=_metrics[i].q2;
+    s2q=_metrics[i].s2q;
+    sq2=_metrics[i].sq2;
+    s2q2=_metrics[i].s2q2;
+    xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10);
+    r=_metrics[i].r/w;
+    sr=_metrics[i].sr;
+    qr=_metrics[i].qr;
+    sqr=_metrics[i].sqr;
+    c0=-_ra[i];
+    c1=-(_rb[i]-_ra[i])/s10;
+    c2=-(_rc[i]-_ra[i])/q10;
+    c3=-(_ra[i]-_rb[i]-_rc[i])/sq10;
+    rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
+     +(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr
+     +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10;
+    d=_metrics[i].d/w;
+    sd=_metrics[i].sd;
+    qd=_metrics[i].qd;
+    sqd=_metrics[i].sqd;
+    c0=-_da[i];
+    c1=-(_db[i]-_da[i])/s10;
+    c2=-(_dc[i]-_da[i])/q10;
+    c3=-(_da[i]-_db[i]-_dc[i])/sq10;
+    dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
+     +(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd
+     +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10;
+  }
+  if(xx>1E-3){
+    *_r=rxy/xx;
+    *_d=dxy/xx;
+  }
+  else{
+    *_r=0;
+    *_d=0;
+  }
+  return wt;
+}
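To make the least-squares objective above concrete, a brute-force version of the one-corner fit that stores its samples explicitly (illustrative only; fit_corner and its arguments are made up): it forms the ratio Z=(X^T.Y)/(X^T.X) directly, which is what oc_mode_metrics_solve() reconstructs from the accumulated (co-)moments.

#include <stddef.h>
#include <stdio.h>

/*Illustrative only: fit the free corner Z of one bin from explicit samples
   (x_i,y_i,R_i) with fixed corners A, B, C, minimizing the squared bilinear
   prediction error.*/
static double fit_corner(const double *_x,const double *_y,const double *_r,
 size_t _n,double _a,double _b,double _c){
  double xtx;
  double xty;
  size_t i;
  xtx=xty=0;
  for(i=0;i<_n;i++){
    double xy;
    double pred0;
    xy=_x[i]*_y[i];
    /*Bilinear prediction with Z's contribution (Z*x_i*y_i) left out.*/
    pred0=_a+(_b-_a)*_x[i]+(_c-_a)*_y[i]+(_a-_b-_c)*xy;
    xtx+=xy*xy;
    xty+=xy*(_r[i]-pred0);
  }
  return xtx>1E-3?xty/xtx:0;
}

int main(void){
  static const double x[3]={0.25,0.5,1.0};
  static const double y[3]={0.5,1.0,1.0};
  static const double r[3]={1.0,2.0,3.0};
  printf("Z ~ %g\n",fit_corner(x,y,r,3,0,1,1));
  return 0;
}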
+
+/*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately
+   useful for mode decision.*/
+void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
+ int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
+ int _shift,double (*_weight)[3][2][OC_COMP_BINS]){
+  int niters;
+  int prevdr;
+  int prevdd;
+  int dr;
+  int dd;
+  int pli;
+  int qti;
+  int qi;
+  int si;
+  dd=dr=INT_MAX;
+  niters=0;
+  /*The encoder interpolates rate and RMSE terms bilinearly from an
+     OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table.
+    To find the sample values at the grid points that minimize the total
+     squared prediction error actually requires solving a relatively sparse
+     linear system with a number of variables equal to the number of grid
+     points.
+    Instead of writing a general sparse linear system solver, we just use
+     Gauss-Seidel iteration, i.e., we update one grid point at a time until
+     they stop changing.*/
+  do{
+    prevdr=dr;
+    prevdd=dd;
+    dd=dr=0;
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        for(qi=0;qi<OC_LOGQ_BINS;qi++){
+          for(si=0;si<OC_COMP_BINS;si++){
+            oc_mode_metrics m[4];
+            int             s0[4];
+            int             s1[4];
+            int             q0[4];
+            int             q1[4];
+            double          ra[4];
+            double          rb[4];
+            double          rc[4];
+            double          da[4];
+            double          db[4];
+            double          dc[4];
+            double          r;
+            double          d;
+            int             rate;
+            int             rmse;
+            int             ds;
+            int             n;
+            n=0;
+            /*Collect the statistics for the (up to) four bins that grid
+               point (si,qi) touches.*/
+            if(qi>0&&si>0){
+              q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si-1<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi-1][pli][qti]+si-1);
+            }
+            if(qi>0){
+              ds=si+1<OC_COMP_BINS?1:-1;
+              q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si+ds<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              da[n]=
+               ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi-1][pli][qti]+si);
+            }
+            if(qi+1<OC_LOGQ_BINS&&si>0){
+              q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si-1<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi][pli][qti]+si-1);
+            }
+            if(qi+1<OC_LOGQ_BINS){
+              ds=si+1<OC_COMP_BINS?1:-1;
+              q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si+ds<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              da[n]=
+               ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi][pli][qti]+si);
+            }
+            /*On the first pass, initialize with a simple weighted average of
+               the neighboring bins.*/
+            if(!OC_HAS_MODE_METRICS&&niters==0){
+              double w;
+              w=r=d=0;
+              while(n-->0){
+                w+=m[n].w;
+                r+=m[n].r;
+                d+=m[n].d;
+              }
+              r=w>1E-3?r/w:0;
+              d=w>1E-3?d/w:0;
+              _weight[qi][pli][qti][si]=w;
+            }
+            else{
+              /*Update the grid point and save the weight for later.*/
+              _weight[qi][pli][qti][si]=
+               oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n);
+            }
+            rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767);
+            rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767);
+            dr+=abs(rate-_table[qi][pli][qti][si].rate);
+            dd+=abs(rmse-_table[qi][pli][qti][si].rmse);
+            _table[qi][pli][qti][si].rate=(ogg_int16_t)rate;
+            _table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
+          }
+        }
+      }
+    }
+  }
+  /*After a fixed number of initial iterations, only iterate so long as the
+     total change is decreasing.
+    This ensures we don't oscillate forever, which is a danger, as all of our
+     results are rounded fairly coarsely.*/
+  while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd)));
+  if(_reweight){
+    /*Now, reduce the values of the optimal solution until we get enough
+       samples in each bin to overcome the constant OC_ZWEIGHT factor.
+      This encourages sampling under-populated bins and prevents a single large
+       sample early on from discouraging coding in that bin ever again.*/
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        for(qi=0;qi<OC_LOGQ_BINS;qi++){
+          for(si=0;si<OC_COMP_BINS;si++){
+            double wt;
+            wt=_weight[qi][pli][qti][si];
+            wt/=OC_ZWEIGHT+wt;
+            _table[qi][pli][qti][si].rate=(ogg_int16_t)
+             (_table[qi][pli][qti][si].rate*wt+0.5);
+            _table[qi][pli][qti][si].rmse=(ogg_int16_t)
+             (_table[qi][pli][qti][si].rmse*wt+0.5);
+          }
+        }
+      }
+    }
+  }
+}
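A generic sketch of the Gauss-Seidel strategy described above (illustrative only; the 3x3 system is arbitrary and gauss_seidel is a made-up helper): unknowns are updated one at a time, always reusing the freshest values, until the change becomes negligible. The loop above applies the same idea to the grid-point fit, with the slightly more involved stopping rule of continuing only while the total change keeps decreasing.

#include <math.h>
#include <stdio.h>

/*Illustrative only: plain Gauss-Seidel on a tiny dense system A*x=b.*/
static void gauss_seidel(const double _a[3][3],const double _b[3],
 double _x[3],int _maxiter){
  int iter;
  for(iter=0;iter<_maxiter;iter++){
    double delta;
    int    i;
    delta=0;
    for(i=0;i<3;i++){
      double s;
      double old;
      int    j;
      s=_b[i];
      old=_x[i];
      for(j=0;j<3;j++)if(j!=i)s-=_a[i][j]*_x[j];
      _x[i]=s/_a[i][i];
      delta+=fabs(_x[i]-old);
    }
    if(delta<1E-9)break;
  }
}

int main(void){
  /*Diagonally dominant system, so Gauss-Seidel converges (to 1 1 1).*/
  double a[3][3]={{4,1,0},{1,4,1},{0,1,4}};
  double b[3]={5,6,5};
  double x[3]={0,0,0};
  gauss_seidel(a,b,x,100);
  printf("%g %g %g\n",x[0],x[1],x[2]);
  return 0;
}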
+
+/*Dump the in-memory mode metrics to a file.
+  Note this data format isn't portable between different platforms.*/
+void oc_mode_metrics_dump(void){
+  FILE *fmetrics;
+  fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb");
+  if(fmetrics!=NULL){
+    (void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
+    (void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
+    (void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
+    fclose(fmetrics);
+  }
+}
+
+void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
+#if !defined(OC_COLLECT_METRICS)
+ const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
+#else
+ oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
+#endif
+  int qii;
+  fprintf(_fout,
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name);
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){
+    int pli;
+    fprintf(_fout,"  {\n");
+    for(pli=0;pli<3;pli++){
+      int qti;
+      fprintf(_fout,"    {\n");
+      for(qti=0;qti<2;qti++){
+        int bin;
+        int qi;
+        static const char *pl_names[3]={"Y'","Cb","Cr"};
+        static const char *qti_names[2]={"INTRA","INTER"};
+        qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
+        fprintf(_fout,"      /*%s  qi=%i  %s*/\n",
+         pl_names[pli],qi,qti_names[qti]);
+        fprintf(_fout,"      {\n");
+        fprintf(_fout,"        ");
+        for(bin=0;bin<OC_COMP_BINS;bin++){
+          if(bin&&!(bin&0x3))fprintf(_fout,"\n        ");
+          fprintf(_fout,"{%5i,%5i}",
+           _mode_rd_table[qii][pli][qti][bin].rate,
+           _mode_rd_table[qii][pli][qti][bin].rmse);
+          if(bin+1<OC_COMP_BINS)fprintf(_fout,",");
+        }
+        fprintf(_fout,"\n      }");
+        if(qti<1)fprintf(_fout,",");
+        fprintf(_fout,"\n");
+      }
+      fprintf(_fout,"    }");
+      if(pli<2)fprintf(_fout,",");
+      fprintf(_fout,"\n");
+    }
+    fprintf(_fout,"  }");
+    if(qii+1<OC_LOGQ_BINS)fprintf(_fout,",");
+    fprintf(_fout,"\n");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n");
+}
+
+void oc_mode_metrics_print(FILE *_fout){
+  int qii;
+  fprintf(_fout,
+   "/*File generated by libtheora with OC_COLLECT_METRICS"
+   " defined at compile time.*/\n"
+   "#if !defined(_modedec_H)\n"
+   "# define _modedec_H (1)\n"
+   "# include \"encint.h\"\n"
+   "\n"
+   "\n"
+   "\n"
+   "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
+   "   (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
+   "  The actual statistics used by the encoder will be interpolated from\n"
+   "   that table based on log_plq for the actual quantization matrix used.*/\n"
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){
+    fprintf(_fout,"  { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
+     OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
+     OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
+     qii+1<OC_LOGQ_BINS?",":"");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n");
+  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD);
+  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD);
+  fprintf(_fout,
+   "#endif\n");
+}
+
+
+# if !defined(OC_COLLECT_NO_ENC_FUNCS)
+void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){
+  oc_restore_fpu(&_enc->state);
+  /*Load any existing mode metrics if we haven't already.*/
+  if(!OC_HAS_MODE_METRICS){
+    FILE *fmetrics;
+    memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
+    memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
+    fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb");
+    if(fmetrics!=NULL){
+      /*Read in the binary structures as written by oc_mode_metrics_dump().
+        Note this format isn't portable between different platforms.*/
+      (void)fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
+      (void)fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
+      (void)fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
+      fclose(fmetrics);
+    }
+    else{
+      int qii;
+      int qi;
+      int pli;
+      int qti;
+      for(qii=0;qii<OC_LOGQ_BINS;qii++){
+        qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
+        for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+          OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti];
+        }
+      }
+    }
+    oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1,
+     OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
+    oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1,
+     OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+    OC_HAS_MODE_METRICS=1;
+  }
+}
+
+/*The following token skipping code used to also be used in the decoder (and
+   even at one point other places in the encoder).
+  However, it was obsoleted by other optimizations, and is now only used here.
+  It has been moved here to avoid generating the code when it's not needed.*/
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value gives the number of coefficients to be skipped in
+           the current block.
+          Otherwise, the negative of the return value gives the number of
+           blocks to be ended.*/
+typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
+
+/*Handles the simple end of block tokens.*/
+static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
+  int nblocks_adjust;
+  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
+  return -_extra_bits-nblocks_adjust;
+}
+
+/*The last EOB token has a special case, where an EOB run of size zero ends all
+   the remaining blocks in the frame.*/
+static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
+  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
+     yet available everywhere; this should be equivalent.*/
+  if(!_extra_bits)return -(~(size_t)0>>1);
+  return -_extra_bits;
+}
+
+/*Handles the pure zero run tokens.*/
+static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
+  return _extra_bits+1;
+}
+
+/*Handles a normal coefficient value token.*/
+static ptrdiff_t oc_token_skip_val(void){
+  return 1;
+}
+
+/*Handles a category 1A zero run/coefficient value combo token.*/
+static ptrdiff_t oc_token_skip_run_cat1a(int _token){
+  return _token-OC_DCT_RUN_CAT1A+2;
+}
+
+/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
+static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
+  int run_cati;
+  int ncoeffs_mask;
+  int ncoeffs_adjust;
+  run_cati=_token-OC_DCT_RUN_CAT1B;
+  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
+  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
+  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
+}
+
+/*A jump table for computing the number of coefficients or blocks to skip for
+   a given token value.
+  This reduces all the conditional branches, etc., needed to parse these token
+   values down to one indirect jump.*/
+static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob6,
+  oc_token_skip_zrl,
+  oc_token_skip_zrl,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run
+};
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value gives the number of coefficients to be skipped in
+           the current block.
+          Otherwise, the negative of the return value gives the number of
+           blocks to be ended.
+          0 will never be returned, so that at least one coefficient in one
+           block will always be decoded for every token.*/
+static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
+  return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
+}
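The dispatch pattern above in miniature (illustrative only; every name here is made up): a table of function pointers indexed by the token value replaces a chain of conditionals with a single indirect call.

#include <stdio.h>

/*Illustrative only: the same jump-table pattern reduced to two token kinds.*/
typedef int (*toy_skip_func)(int _extra);

static int toy_skip_one(int _extra){
  (void)_extra;
  return 1;
}

static int toy_skip_run(int _extra){
  return _extra+1;
}

static const toy_skip_func TOY_SKIP_TABLE[2]={
  toy_skip_one,
  toy_skip_run
};

int main(void){
  /*Prints "1 4": token 0 skips one coefficient, token 1 skips _extra+1.*/
  printf("%d %d\n",(*TOY_SKIP_TABLE[0])(0),(*TOY_SKIP_TABLE[1])(3));
  return 0;
}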
+
+
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
+  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
+     0,16,16,16,16,16,32,32,
+    32,32,32,32,32,32,32,48,
+    48,48,48,48,48,48,48,48,
+    48,48,48,48,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64
+  };
+  const oc_fragment *frags;
+  const unsigned    *frag_sad;
+  const unsigned    *frag_satd;
+  const unsigned    *frag_ssd;
+  const ptrdiff_t   *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  double             fragw;
+  int                modelines[3][3][2];
+  int                qti;
+  int                qii;
+  int                qi;
+  int                pli;
+  int                zzi;
+  int                token;
+  int                eb;
+  oc_restore_fpu(&_enc->state);
+  /*Figure out which metric bins to use for this frame's quantizers.*/
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        int log_plq;
+        int modeline;
+        log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti];
+        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
+         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
+        modelines[qii][pli][qti]=modeline;
+      }
+    }
+  }
+  qti=_enc->state.frame_type;
+  frags=_enc->state.frags;
+  frag_sad=_enc->frag_sad;
+  frag_satd=_enc->frag_satd;
+  frag_ssd=_enc->frag_ssd;
+  coded_fragis=_enc->state.coded_fragis;
+  ncoded_fragis=fragii=0;
+  /*Weight the fragments by the inverse frame size; this prevents HD content
+     from dominating the statistics.*/
+  fragw=1.0/_enc->state.nfrags;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ti[64];
+    int       eob_token[64];
+    int       eob_run[64];
+    /*Set up token indices and eob run counts.
+      We don't bother trying to figure out the real cost of the runs that span
+       coefficients; instead we use the costs that were available when R-D
+       token optimization was done.*/
+    for(zzi=0;zzi<64;zzi++){
+      ti[zzi]=_enc->dct_token_offs[pli][zzi];
+      if(ti[zzi]>0){
+        token=_enc->dct_tokens[pli][zzi][0];
+        eb=_enc->extra_bits[pli][zzi][0];
+        eob_token[zzi]=token;
+        eob_run[zzi]=-oc_dct_token_skip(token,eb);
+      }
+      else{
+        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        eob_run[zzi]=0;
+      }
+    }
+    /*Scan the list of coded fragments for this plane.*/
+    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    for(;fragii<ncoded_fragis;fragii++){
+      ptrdiff_t fragi;
+      int       frag_bits;
+      int       huffi;
+      int       skip;
+      int       mb_mode;
+      unsigned  sad;
+      unsigned  satd;
+      double    sqrt_ssd;
+      int       bin;
+      int       qtj;
+      fragi=coded_fragis[fragii];
+      frag_bits=0;
+      for(zzi=0;zzi<64;){
+        if(eob_run[zzi]>0){
+          /*We've reached the end of the block.*/
+          eob_run[zzi]--;
+          break;
+        }
+        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
+         +OC_ZZI_HUFF_OFFSET[zzi];
+        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
+          /*This token caused an EOB run to be flushed.
+            Therefore it gets the bits associated with it.*/
+          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
+          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        }
+        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
+        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
+        ti[zzi]++;
+        skip=oc_dct_token_skip(token,eb);
+        if(skip<0){
+          eob_token[zzi]=token;
+          eob_run[zzi]=-skip;
+        }
+        else{
+          /*A regular DCT value token; accumulate the bits for it.*/
+          frag_bits+=_enc->huff_codes[huffi][token].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[token];
+          zzi+=skip;
+        }
+      }
+      mb_mode=frags[fragi].mb_mode;
+      qii=frags[fragi].qii;
+      qi=_enc->state.qis[qii];
+      sad=frag_sad[fragi]<<(pli+1&2);
+      satd=frag_satd[fragi]<<(pli+1&2);
+      sqrt_ssd=sqrt(frag_ssd[fragi]);
+      qtj=mb_mode!=OC_MODE_INTRA;
+      /*Accumulate statistics.
+        The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by
+         OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor
+         yet still use old data.*/
+      bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1);
+      oc_mode_metrics_add(
+       OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+      bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1);
+      oc_mode_metrics_add(
+       OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+    }
+  }
+  /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/
+  oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1,
+   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
+  oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1,
+   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+}
+# endif
+
+#endif

+ 109 - 0
modules/theoraplayer/native/theora/lib/collect.h

@@ -0,0 +1,109 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_collect_H)
+# define _collect_H (1)
+# include "encint.h"
+# if defined(OC_COLLECT_METRICS)
+#  include <stdio.h>
+
+
+
+typedef struct oc_mode_metrics oc_mode_metrics;
+
+
+
+/**Sets the file name to load/store mode metrics from/to.
+ * The file name string is stored by reference, and so must be valid for the
+ *  lifetime of the encoder.
+ * Mode metric collection uses global tables; do not attempt to perform
+ *  multiple collections at once.
+ * \param[in] _buf <tt>char[]</tt> The file name.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_METRICS_FILE (0x8000)
+
+
+
+/*Accumulates various weighted sums of the measurements.
+  w -> weight
+  s -> SATD
+  q -> log quantizer
+  r -> rate (in bits)
+  d -> RMSE
+  All of the single letters correspond to direct, weighted sums, e.g.,
+   w=sum(w_i), s=sum(s_i*w_i), etc.
+  The others correspond to central moments (or co-moments) of the given order,
+   e.g., sq=sum((s_i-s/w)*(q_i-q/w)*w_i).
+  Because we need some moments up to fourth order, we use central moments to
+   minimize the dynamic range and prevent rounding error from dominating the
+   calculations.*/
+struct oc_mode_metrics{
+  double w;
+  double s;
+  double q;
+  double r;
+  double d;
+  double s2;
+  double sq;
+  double q2;
+  double sr;
+  double qr;
+  double r2;
+  double sd;
+  double qd;
+  double d2;
+  double s2q;
+  double sq2;
+  double sqr;
+  double sqd;
+  double s2q2;
+};
+
+
+# define OC_ZWEIGHT   (0.25)
+
+/*TODO: It may be helpful (for block-level quantizers especially) to separate
+   out the contributions from AC and DC into separate tables.*/
+
+extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2];
+extern oc_mode_rd  OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+extern oc_mode_rd  OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+
+extern int              OC_HAS_MODE_METRICS;
+extern oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern const char      *OC_MODE_METRICS_FILENAME;
+
+void oc_mode_metrics_dump(void);
+void oc_mode_metrics_print(FILE *_fout);
+
+void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _s,int _q,int _r,double _d);
+void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n);
+double oc_mode_metrics_solve(double *_r,double *_d,
+ const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
+ const int *_q0,const int *_q1,
+ const double *_ra,const double *_rb,const double *_rc,
+ const double *_da,const double *_db,const double *_dc,int _n);
+void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
+ int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
+ int _shift,double (*_weight)[3][2][OC_COMP_BINS]);
+void oc_enc_mode_metrics_load(oc_enc_ctx *_enc);
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
+
+# endif
+#endif

+ 31 - 0
modules/theoraplayer/native/theora/lib/dct.h

@@ -0,0 +1,31 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*Definitions shared by the forward and inverse DCT transforms.*/
+#if !defined(_dct_H)
+# define _dct_H (1)
+
+/*cos(n*pi/16) (resp. sin(m*pi/16)) scaled by 65536.*/
+#define OC_C1S7 ((ogg_int32_t)64277)
+#define OC_C2S6 ((ogg_int32_t)60547)
+#define OC_C3S5 ((ogg_int32_t)54491)
+#define OC_C4S4 ((ogg_int32_t)46341)
+#define OC_C5S3 ((ogg_int32_t)36410)
+#define OC_C6S2 ((ogg_int32_t)25080)
+#define OC_C7S1 ((ogg_int32_t)12785)
+
+#endif
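A quick way to see where these values come from (an illustrative sketch assuming a C99 libm for lround): scale cos(n*pi/16) by 65536 and round to the nearest integer, which reproduces the seven constants above.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
# define M_PI (3.14159265358979323846)
#endif

/*Prints 64277, 60547, 54491, 46341, 36410, 25080, 12785, matching OC_C1S7
   through OC_C7S1 above.*/
int main(void){
  int n;
  for(n=1;n<8;n++){
    printf("OC_C%iS%i = %ld\n",n,8-n,lround(cos(n*M_PI/16)*65536));
  }
  return 0;
}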

+ 193 - 0
modules/theoraplayer/native/theora/lib/decapiwrapper.c

@@ -0,0 +1,193 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decapiwrapper.c 13596 2007-08-23 20:05:38Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+#include "decint.h"
+#include "theora/theoradec.h"
+
+static void th_dec_api_clear(th_api_wrapper *_api){
+  if(_api->setup)th_setup_free(_api->setup);
+  if(_api->decode)th_decode_free(_api->decode);
+  memset(_api,0,sizeof(*_api));
+}
+
+static void theora_decode_clear(theora_state *_td){
+  if(_td->i!=NULL)theora_info_clear(_td->i);
+  memset(_td,0,sizeof(*_td));
+}
+
+static int theora_decode_control(theora_state *_td,int _req,
+ void *_buf,size_t _buf_sz){
+  return th_decode_ctl(((th_api_wrapper *)_td->i->codec_setup)->decode,
+   _req,_buf,_buf_sz);
+}
+
+static ogg_int64_t theora_decode_granule_frame(theora_state *_td,
+ ogg_int64_t _gp){
+  return th_granule_frame(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
+}
+
+static double theora_decode_granule_time(theora_state *_td,ogg_int64_t _gp){
+  return th_granule_time(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
+}
+
+static const oc_state_dispatch_vtable OC_DEC_DISPATCH_VTBL={
+  (oc_state_clear_func)theora_decode_clear,
+  (oc_state_control_func)theora_decode_control,
+  (oc_state_granule_frame_func)theora_decode_granule_frame,
+  (oc_state_granule_time_func)theora_decode_granule_time,
+};
+
+static void th_info2theora_info(theora_info *_ci,const th_info *_info){
+  _ci->version_major=_info->version_major;
+  _ci->version_minor=_info->version_minor;
+  _ci->version_subminor=_info->version_subminor;
+  _ci->width=_info->frame_width;
+  _ci->height=_info->frame_height;
+  _ci->frame_width=_info->pic_width;
+  _ci->frame_height=_info->pic_height;
+  _ci->offset_x=_info->pic_x;
+  _ci->offset_y=_info->pic_y;
+  _ci->fps_numerator=_info->fps_numerator;
+  _ci->fps_denominator=_info->fps_denominator;
+  _ci->aspect_numerator=_info->aspect_numerator;
+  _ci->aspect_denominator=_info->aspect_denominator;
+  switch(_info->colorspace){
+    case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break;
+    case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break;
+    default:_ci->colorspace=OC_CS_UNSPECIFIED;break;
+  }
+  switch(_info->pixel_fmt){
+    case TH_PF_420:_ci->pixelformat=OC_PF_420;break;
+    case TH_PF_422:_ci->pixelformat=OC_PF_422;break;
+    case TH_PF_444:_ci->pixelformat=OC_PF_444;break;
+    default:_ci->pixelformat=OC_PF_RSVD;
+  }
+  _ci->target_bitrate=_info->target_bitrate;
+  _ci->quality=_info->quality;
+  _ci->keyframe_frequency_force=1<<_info->keyframe_granule_shift;
+}
+
+int theora_decode_init(theora_state *_td,theora_info *_ci){
+  th_api_info    *apiinfo;
+  th_api_wrapper *api;
+  th_info         info;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  /*Allocate our own combined API wrapper/theora_info struct.
+    We put them both in one malloc'd block so that when the API wrapper is
+     freed, the info struct goes with it.
+    This avoids having to figure out whether or not we need to free the info
+     struct in either theora_info_clear() or theora_clear().*/
+  apiinfo=(th_api_info *)_ogg_calloc(1,sizeof(*apiinfo));
+  if(apiinfo==NULL)return OC_FAULT;
+  /*Make our own copy of the info struct, since its lifetime should be
+     independent of the one we were passed in.*/
+  *&apiinfo->info=*_ci;
+  /*Convert the info struct now instead of saving the one we decoded with
+     theora_decode_header(), since the user might have modified values (i.e.,
+     color space, aspect ratio, etc. can be specified from a higher level).
+    The user also might be doing something "clever" with the header packets if
+     they are not using an Ogg encapsulation.*/
+  oc_theora_info2th_info(&info,_ci);
+  /*Don't bother to copy the setup info; th_decode_alloc() makes its own copy
+     of the stuff it needs.*/
+  apiinfo->api.decode=th_decode_alloc(&info,api->setup);
+  if(apiinfo->api.decode==NULL){
+    _ogg_free(apiinfo);
+    return OC_EINVAL;
+  }
+  apiinfo->api.clear=(oc_setup_clear_func)th_dec_api_clear;
+  _td->internal_encode=NULL;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _td->internal_decode=(void *)&OC_DEC_DISPATCH_VTBL;
+  _td->granulepos=0;
+  _td->i=&apiinfo->info;
+  _td->i->codec_setup=&apiinfo->api;
+  return 0;
+}
+
+int theora_decode_header(theora_info *_ci,theora_comment *_cc,ogg_packet *_op){
+  th_api_wrapper *api;
+  th_info         info;
+  int             ret;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  /*Allocate an API wrapper struct on demand, since it will not also include a
+     theora_info struct like the ones that are used in a theora_state struct.*/
+  if(api==NULL){
+    _ci->codec_setup=_ogg_calloc(1,sizeof(*api));
+    if(_ci->codec_setup==NULL)return OC_FAULT;
+    api=(th_api_wrapper *)_ci->codec_setup;
+    api->clear=(oc_setup_clear_func)th_dec_api_clear;
+  }
+  /*Convert from the theora_info struct instead of saving our own th_info
+     struct between calls.
+    The user might be doing something "clever" with the header packets if they
+     are not using an Ogg encapsulation, and we don't want to break this.*/
+  oc_theora_info2th_info(&info,_ci);
+  /*We rely on the fact that theora_comment and th_comment structures are
+     actually identical.
+    Take care not to change this fact unless you change the code here as
+     well!*/
+  ret=th_decode_headerin(&info,(th_comment *)_cc,&api->setup,_op);
+  /*We also rely on the fact that the error return code values are the same,
+    and that the implementations of these two functions return the same set of
+    them.
+   Note that theora_decode_header() really can return OC_NOTFORMAT, even
+    though it is not currently documented to do so.*/
+  if(ret<0)return ret;
+  th_info2theora_info(_ci,&info);
+  return 0;
+}
+
+int theora_decode_packetin(theora_state *_td,ogg_packet *_op){
+  th_api_wrapper *api;
+  ogg_int64_t     gp;
+  int             ret;
+  if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
+  api=(th_api_wrapper *)_td->i->codec_setup;
+  ret=th_decode_packetin(api->decode,_op,&gp);
+  if(ret<0)return OC_BADPACKET;
+  _td->granulepos=gp;
+  return 0;
+}
+
+int theora_decode_YUVout(theora_state *_td,yuv_buffer *_yuv){
+  th_api_wrapper  *api;
+  th_dec_ctx      *decode;
+  th_ycbcr_buffer  buf;
+  int              ret;
+  if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
+  api=(th_api_wrapper *)_td->i->codec_setup;
+  decode=(th_dec_ctx *)api->decode;
+  if(!decode)return OC_FAULT;
+  ret=th_decode_ycbcr_out(decode,buf);
+  if(ret>=0){
+    _yuv->y_width=buf[0].width;
+    _yuv->y_height=buf[0].height;
+    _yuv->y_stride=buf[0].stride;
+    _yuv->uv_width=buf[1].width;
+    _yuv->uv_height=buf[1].height;
+    _yuv->uv_stride=buf[1].stride;
+    _yuv->y=buf[0].data;
+    _yuv->u=buf[1].data;
+    _yuv->v=buf[2].data;
+  }
+  return ret;
+}
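
The wrapper above keeps the pre-1.0 theora_* decoder API working on top of the th_* calls. A minimal sketch of the legacy decode loop these entry points preserve; next_packet() is a hypothetical helper standing in for the caller's Ogg demuxing, and error handling is reduced to the bare minimum:

/* Sketch only: the legacy API path that decapiwrapper.c forwards to th_*. */
#include <theora/theora.h>

extern int next_packet(ogg_packet *op);  /* assumption: supplied by the demuxer */

int legacy_decode_stream(void){
  theora_info    ti;
  theora_comment tc;
  theora_state   td;
  ogg_packet     op;
  yuv_buffer     yuv;
  int            headers=0;
  theora_info_init(&ti);
  theora_comment_init(&tc);
  /*The three header packets all go through theora_decode_header().*/
  while(headers<3&&next_packet(&op)){
    if(theora_decode_header(&ti,&tc,&op)<0)return -1;
    headers++;
  }
  /*theora_decode_init() builds the real th_decode_ctx behind the scenes.*/
  if(theora_decode_init(&td,&ti)<0)return -1;
  /*Data packets: decode each one, then fetch the frame.*/
  while(next_packet(&op)){
    if(theora_decode_packetin(&td,&op)==0){
      theora_decode_YUVout(&td,&yuv);
      /* ...consume yuv.y/yuv.u/yuv.v here... */
    }
  }
  theora_clear(&td);
  theora_comment_clear(&tc);
  theora_info_clear(&ti);
  return 0;
}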

+ 250 - 0
modules/theoraplayer/native/theora/lib/decinfo.c

@@ -0,0 +1,250 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decinfo.c 17276 2010-06-05 05:57:05Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "decint.h"
+
+
+
+/*Unpacks a series of octets from a given byte array into the pack buffer.
+  No checking is done to ensure the buffer contains enough data.
+  _opb: The pack buffer to read the octets from.
+  _buf: The byte array to store the unpacked bytes in.
+  _len: The number of octets to unpack.*/
+static void oc_unpack_octets(oc_pack_buf *_opb,char *_buf,size_t _len){
+  while(_len-->0){
+    long val;
+    val=oc_pack_read(_opb,8);
+    *_buf++=(char)val;
+  }
+}
+
+/*Unpacks a 32-bit integer encoded by octets in little-endian form.*/
+static long oc_unpack_length(oc_pack_buf *_opb){
+  long ret[4];
+  int  i;
+  for(i=0;i<4;i++)ret[i]=oc_pack_read(_opb,8);
+  return ret[0]|ret[1]<<8|ret[2]<<16|ret[3]<<24;
+}
+
+static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
+  long val;
+  /*Check the codec bitstream version.*/
+  val=oc_pack_read(_opb,8);
+  _info->version_major=(unsigned char)val;
+  val=oc_pack_read(_opb,8);
+  _info->version_minor=(unsigned char)val;
+  val=oc_pack_read(_opb,8);
+  _info->version_subminor=(unsigned char)val;
+  /*Verify that we can parse this bitstream version.
+     We accept earlier minors and all subminors, per the spec.*/
+  if(_info->version_major>TH_VERSION_MAJOR||
+   _info->version_major==TH_VERSION_MAJOR&&
+   _info->version_minor>TH_VERSION_MINOR){
+    return TH_EVERSION;
+  }
+  /*Read the encoded frame description.*/
+  val=oc_pack_read(_opb,16);
+  _info->frame_width=(ogg_uint32_t)val<<4;
+  val=oc_pack_read(_opb,16);
+  _info->frame_height=(ogg_uint32_t)val<<4;
+  val=oc_pack_read(_opb,24);
+  _info->pic_width=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,24);
+  _info->pic_height=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,8);
+  _info->pic_x=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,8);
+  _info->pic_y=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,32);
+  _info->fps_numerator=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,32);
+  _info->fps_denominator=(ogg_uint32_t)val;
+  if(_info->frame_width==0||_info->frame_height==0||
+   _info->pic_width+_info->pic_x>_info->frame_width||
+   _info->pic_height+_info->pic_y>_info->frame_height||
+   _info->fps_numerator==0||_info->fps_denominator==0){
+    return TH_EBADHEADER;
+  }
+  /*Note: The sense of pic_y is inverted in what we pass back to the
+     application compared to how it is stored in the bitstream.
+    This is because the bitstream uses a right-handed coordinate system, while
+     applications expect a left-handed one.*/
+  _info->pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
+  val=oc_pack_read(_opb,24);
+  _info->aspect_numerator=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,24);
+  _info->aspect_denominator=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,8);
+  _info->colorspace=(th_colorspace)val;
+  val=oc_pack_read(_opb,24);
+  _info->target_bitrate=(int)val;
+  val=oc_pack_read(_opb,6);
+  _info->quality=(int)val;
+  val=oc_pack_read(_opb,5);
+  _info->keyframe_granule_shift=(int)val;
+  val=oc_pack_read(_opb,2);
+  _info->pixel_fmt=(th_pixel_fmt)val;
+  if(_info->pixel_fmt==TH_PF_RSVD)return TH_EBADHEADER;
+  val=oc_pack_read(_opb,3);
+  if(val!=0||oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+  return 0;
+}
+
+static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
+  long len;
+  int  i;
+  /*Read the vendor string.*/
+  len=oc_unpack_length(_opb);
+  if(len<0||len>oc_pack_bytes_left(_opb))return TH_EBADHEADER;
+  _tc->vendor=_ogg_malloc((size_t)len+1);
+  if(_tc->vendor==NULL)return TH_EFAULT;
+  oc_unpack_octets(_opb,_tc->vendor,len);
+  _tc->vendor[len]='\0';
+  /*Read the user comments.*/
+  _tc->comments=(int)oc_unpack_length(_opb);
+  len=_tc->comments;
+  if(len<0||len>(LONG_MAX>>2)||len<<2>oc_pack_bytes_left(_opb)){
+    _tc->comments=0;
+    return TH_EBADHEADER;
+  }
+  _tc->comment_lengths=(int *)_ogg_malloc(
+   _tc->comments*sizeof(_tc->comment_lengths[0]));
+  _tc->user_comments=(char **)_ogg_malloc(
+   _tc->comments*sizeof(_tc->user_comments[0]));
+  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
+    _tc->comments=0;
+    return TH_EFAULT;
+  }
+  for(i=0;i<_tc->comments;i++){
+    len=oc_unpack_length(_opb);
+    if(len<0||len>oc_pack_bytes_left(_opb)){
+      _tc->comments=i;
+      return TH_EBADHEADER;
+    }
+    _tc->comment_lengths[i]=len;
+    _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
+    if(_tc->user_comments[i]==NULL){
+      _tc->comments=i;
+      return TH_EFAULT;
+    }
+    oc_unpack_octets(_opb,_tc->user_comments[i],len);
+    _tc->user_comments[i][len]='\0';
+  }
+  return oc_pack_bytes_left(_opb)<0?TH_EBADHEADER:0;
+}
+
+static int oc_setup_unpack(oc_pack_buf *_opb,th_setup_info *_setup){
+  int ret;
+  /*Read the quantizer tables.*/
+  ret=oc_quant_params_unpack(_opb,&_setup->qinfo);
+  if(ret<0)return ret;
+  /*Read the Huffman trees.*/
+  return oc_huff_trees_unpack(_opb,_setup->huff_tables);
+}
+
+static void oc_setup_clear(th_setup_info *_setup){
+  oc_quant_params_clear(&_setup->qinfo);
+  oc_huff_trees_clear(_setup->huff_tables);
+}
+
+static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
+ th_comment *_tc,th_setup_info **_setup,ogg_packet *_op){
+  char buffer[6];
+  long val;
+  int  packtype;
+  int  ret;
+  val=oc_pack_read(_opb,8);
+  packtype=(int)val;
+  /*If we're at a data packet and we have received all three headers, we're
+     done.*/
+  if(!(packtype&0x80)&&_info->frame_width>0&&_tc->vendor!=NULL&&*_setup!=NULL){
+    return 0;
+  }
+  /*Check the codec string.*/
+  oc_unpack_octets(_opb,buffer,6);
+  if(memcmp(buffer,"theora",6)!=0)return TH_ENOTFORMAT;
+  switch(packtype){
+    /*Codec info header.*/
+    case 0x80:{
+      /*This should be the first packet, and we should not already be
+         initialized.*/
+      if(!_op->b_o_s||_info->frame_width>0)return TH_EBADHEADER;
+      ret=oc_info_unpack(_opb,_info);
+      if(ret<0)th_info_clear(_info);
+      else ret=3;
+    }break;
+    /*Comment header.*/
+    case 0x81:{
+      if(_tc==NULL)return TH_EFAULT;
+      /*We should have already decoded the info header, and should not yet have
+         decoded the comment header.*/
+      if(_info->frame_width==0||_tc->vendor!=NULL)return TH_EBADHEADER;
+      ret=oc_comment_unpack(_opb,_tc);
+      if(ret<0)th_comment_clear(_tc);
+      else ret=2;
+    }break;
+    /*Codec setup header.*/
+    case 0x82:{
+      oc_setup_info *setup;
+      if(_tc==NULL||_setup==NULL)return TH_EFAULT;
+      /*We should have already decoded the info header and the comment header,
+         and should not yet have decoded the setup header.*/
+      if(_info->frame_width==0||_tc->vendor==NULL||*_setup!=NULL){
+        return TH_EBADHEADER;
+      }
+      setup=(oc_setup_info *)_ogg_calloc(1,sizeof(*setup));
+      if(setup==NULL)return TH_EFAULT;
+      ret=oc_setup_unpack(_opb,setup);
+      if(ret<0){
+        oc_setup_clear(setup);
+        _ogg_free(setup);
+      }
+      else{
+        *_setup=setup;
+        ret=1;
+      }
+    }break;
+    default:{
+      /*We don't know what this header is.*/
+      return TH_EBADHEADER;
+    }break;
+  }
+  return ret;
+}
+
+
+/*Decodes one header packet.
+  This should be called repeatedly with the packets at the beginning of the
+   stream until it returns 0.*/
+int th_decode_headerin(th_info *_info,th_comment *_tc,
+ th_setup_info **_setup,ogg_packet *_op){
+  oc_pack_buf opb;
+  if(_op==NULL)return TH_EBADHEADER;
+  if(_info==NULL)return TH_EFAULT;
+  oc_pack_readinit(&opb,_op->packet,_op->bytes);
+  return oc_dec_headerin(&opb,_info,_tc,_setup,_op);
+}
+
+void th_setup_free(th_setup_info *_setup){
+  if(_setup!=NULL){
+    oc_setup_clear(_setup);
+    _ogg_free(_setup);
+  }
+}

+ 186 - 0
modules/theoraplayer/native/theora/lib/decint.h

@@ -0,0 +1,186 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decint.h 17457 2010-09-24 02:05:49Z tterribe $
+
+ ********************************************************************/
+
+#include <limits.h>
+#if !defined(_decint_H)
+# define _decint_H (1)
+# include "theora/theoradec.h"
+# include "state.h"
+# include "bitpack.h"
+# include "huffdec.h"
+# include "dequant.h"
+
+typedef struct th_setup_info         oc_setup_info;
+typedef struct oc_dec_opt_vtable     oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx            oc_dec_ctx;
+
+
+
+/*Decoder-specific accelerated functions.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_accel_init)
+#  define oc_dec_accel_init oc_dec_accel_init_c
+# endif
+# if defined(OC_DEC_USE_VTABLE)
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+#  endif
+# else
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
+#  endif
+# endif
+
+
+
+/*Constants for the packet-in state machine specific to the decoder.*/
+
+/*Next packet to read: Data packet.*/
+#define OC_PACKET_DATA (0)
+
+
+
+struct th_setup_info{
+  /*The Huffman codes.*/
+  ogg_int16_t   *huff_tables[TH_NHUFFMAN_TABLES];
+  /*The quantization parameters.*/
+  th_quant_info  qinfo;
+};
+
+
+
+/*Decoder specific functions with accelerated variants.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+struct oc_dec_pipeline_state{
+  /*Decoded DCT coefficients.
+    These are placed here instead of on the stack so that they can persist
+     between blocks, which makes clearing them back to zero much faster when
+     only a few non-zero coefficients were decoded.
+    It requires at least 65 elements because the zig-zag index array uses the
+     65th element as a dumping ground for out-of-range indices to protect us
+     from buffer overflow.
+    We make it fully twice as large so that the second half can serve as the
+     reconstruction buffer, which saves passing another parameter to all the
+     acceleration functions.
+    It also solves problems with 16-byte alignment for NEON on ARM.
+    gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+     alignment, and silently produces incorrect results if you ask for 16.
+    Finally, keeping it off the stack means there's less likely to be a data
+     hazard between the NEON co-processor and the regular ARM core, which avoids
+     unnecessary stalls.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][4];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
+struct th_dec_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state        state;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and goes to 1
+     when a frame has been processed and a data packet is ready.*/
+  int                    packet_state;
+  /*Buffer in which to assemble packets.*/
+  oc_pack_buf            opb;
+  /*Huffman decode trees.*/
+  ogg_int16_t           *huff_tables[TH_NHUFFMAN_TABLES];
+  /*The index of the first token in each plane for each coefficient.*/
+  ptrdiff_t              ti0[3][64];
+  /*The number of outstanding EOB runs at the start of each coefficient in each
+     plane.*/
+  ptrdiff_t              eob_runs[3][64];
+  /*The DCT token lists.*/
+  unsigned char         *dct_tokens;
+  /*The extra bits associated with DCT tokens.*/
+  unsigned char         *extra_bits;
+  /*The number of dct tokens unpacked so far.*/
+  int                    dct_tokens_count;
+  /*The out-of-loop post-processing level.*/
+  int                    pp_level;
+  /*The DC scale used for out-of-loop deblocking.*/
+  int                    pp_dc_scale[64];
+  /*The sharpen modifier used for out-of-loop deringing.*/
+  int                    pp_sharp_mod[64];
+  /*The DC quantization index of each block.*/
+  unsigned char         *dc_qis;
+  /*The variance of each block.*/
+  int                   *variances;
+  /*The storage for the post-processed frame buffer.*/
+  unsigned char         *pp_frame_data;
+  /*Whether or not the post-processed frame buffer has space for chroma.*/
+  int                    pp_frame_state;
+  /*The buffer used for the post-processed frame.
+    Note that this is _not_ guaranteed to have the same strides and offsets as
+     the reference frame buffers.*/
+  th_ycbcr_buffer        pp_frame_buf;
+  /*The striped decode callback function.*/
+  th_stripe_callback     stripe_cb;
+  oc_dec_pipeline_state  pipe;
+# if defined(OC_DEC_USE_VTABLE)
+  /*Table for decoder acceleration functions.*/
+  oc_dec_opt_vtable      opt_vtable;
+# endif
+# if defined(HAVE_CAIRO)
+  /*Output metrics for debugging.*/
+  int                    telemetry;
+  int                    telemetry_mbmode;
+  int                    telemetry_mv;
+  int                    telemetry_qi;
+  int                    telemetry_bits;
+  int                    telemetry_frame_bytes;
+  int                    telemetry_coding_bytes;
+  int                    telemetry_mode_bytes;
+  int                    telemetry_mv_bytes;
+  int                    telemetry_qi_bytes;
+  int                    telemetry_dc_bytes;
+  unsigned char         *telemetry_frame_data;
+# endif
+};
+
+/*Default pure-C implementations of decoder-specific accelerated functions.*/
+void oc_dec_accel_init_c(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
+#endif

+ 2992 - 0
modules/theoraplayer/native/theora/lib/decode.c

@@ -0,0 +1,2992 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decode.c 18268 2012-05-08 02:51:57Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "decint.h"
+#if defined(OC_DUMP_IMAGES)
+# include <stdio.h>
+# include "png.h"
+#endif
+#if defined(HAVE_CAIRO)
+# include <cairo.h>
+#endif
+
+
+/*No post-processing.*/
+#define OC_PP_LEVEL_DISABLED  (0)
+/*Keep track of DC qi for each block only.*/
+#define OC_PP_LEVEL_TRACKDCQI (1)
+/*Deblock the luma plane.*/
+#define OC_PP_LEVEL_DEBLOCKY  (2)
+/*Dering the luma plane.*/
+#define OC_PP_LEVEL_DERINGY   (3)
+/*Stronger luma plane deringing.*/
+#define OC_PP_LEVEL_SDERINGY  (4)
+/*Deblock the chroma planes.*/
+#define OC_PP_LEVEL_DEBLOCKC  (5)
+/*Dering the chroma planes.*/
+#define OC_PP_LEVEL_DERINGC   (6)
+/*Stronger chroma plane deringing.*/
+#define OC_PP_LEVEL_SDERINGC  (7)
+/*Maximum valid post-processing level.*/
+#define OC_PP_LEVEL_MAX       (7)
+
+
+
+/*The mode alphabets for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.*/
+static const unsigned char OC_MODE_ALPHABETS[7][OC_NMODES]={
+  /*Last MV dominates */
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_NOMV,
+    OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST2,
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_NOMV,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,
+    OC_MODE_GOLDEN_MV,OC_MODE_INTER_MV_FOUR
+  },
+  /*No MV dominates.*/
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,
+    OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_GOLDEN_NOMV,OC_MODE_INTER_MV_LAST,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  /*Default ordering.*/
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  }
+};
+
+
+/*The original DCT tokens are extended and reordered during the construction of
+   the Huffman tables.
+  The extension means more bits can be read with fewer calls to the bitpacker
+   during the Huffman decoding process (at the cost of larger Huffman tables),
+   and fewer tokens require additional extra bits (reducing the average storage
+   per decoded token).
+  The revised ordering reveals essential information in the token value
+   itself; specifically, whether or not there are additional extra bits to read
+   and the parameter to which those extra bits are applied.
+  The token is used to fetch a code word from the OC_DCT_CODE_WORD table below.
+  The extra bits are added into code word at the bit position inferred from the
+   token value, giving the final code word from which all required parameters
+   are derived.
+  The number of EOBs and the leading zero run length can be extracted directly.
+  The coefficient magnitude is optionally negated before extraction, according
+   to a 'flip' bit.*/
+
+/*The number of additional extra bits that are decoded with each of the
+   internal DCT tokens.*/
+static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={
+  12,4,3,3,4,4,5,5,8,8,8,8,3,3,6
+};
+
+/*Whether or not an internal token needs any additional extra bits.*/
+#define OC_DCT_TOKEN_NEEDS_MORE(token) \
+ (token<(int)(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
+  sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)))
+
+/*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/
+#define OC_DCT_TOKEN_FAT_EOB (0)
+
+/*The number of EOBs to use for an end-of-frame token.
+  Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99, which
+   is not yet available everywhere; this should be equivalent.*/
+#define OC_DCT_EOB_FINISH (~(size_t)0>>1)
+
+/*The location of the (6) run length bits in the code word.
+  These are placed at index 0 and given 8 bits (even though 6 would suffice)
+   because it may be faster to extract the lower byte on some platforms.*/
+#define OC_DCT_CW_RLEN_SHIFT (0)
+/*The location of the (12) EOB bits in the code word.*/
+#define OC_DCT_CW_EOB_SHIFT  (8)
+/*The location of the (1) flip bit in the code word.
+  This must be right under the magnitude bits.*/
+#define OC_DCT_CW_FLIP_BIT   (20)
+/*The location of the (11) token magnitude bits in the code word.
+  These must be last, and rely on a sign-extending right shift.*/
+#define OC_DCT_CW_MAG_SHIFT  (21)
+
+/*Pack the given fields into a code word.*/
+#define OC_DCT_CW_PACK(_eobs,_rlen,_mag,_flip) \
+ ((_eobs)<<OC_DCT_CW_EOB_SHIFT| \
+ (_rlen)<<OC_DCT_CW_RLEN_SHIFT| \
+ (_flip)<<OC_DCT_CW_FLIP_BIT| \
+ (_mag)-(_flip)<<OC_DCT_CW_MAG_SHIFT)
+
+/*A special code word value that signals the end of the frame (a long EOB run
+   of zero).*/
+#define OC_DCT_CW_FINISH (0)
+
+/*The position at which to insert the extra bits in the code word.
+  We use this formulation because Intel has no useful cmov.
+  A real architecture would probably do better with two of those.
+  This translates to 11 instructions(!), and is _still_ faster than either a
+   table lookup (just barely) or the naive double-ternary implementation (which
+   gcc translates to a jump and a cmov).
+  This assumes OC_DCT_CW_RLEN_SHIFT is zero, but could easily be reworked if
+   you want to make one of the other shifts zero.*/
+#define OC_DCT_TOKEN_EB_POS(_token) \
+ ((OC_DCT_CW_EOB_SHIFT-OC_DCT_CW_MAG_SHIFT&-((_token)<2)) \
+ +(OC_DCT_CW_MAG_SHIFT&-((_token)<12)))
+
+/*The code words for each internal token.
+  See the notes at OC_DCT_TOKEN_MAP for the reasons why things are out of
+   order.*/
+static const ogg_int32_t OC_DCT_CODE_WORD[92]={
+  /*These tokens require additional extra bits for the EOB count.*/
+  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
+  OC_DCT_CW_FINISH,
+  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
+  OC_DCT_CW_PACK(16, 0,  0,0),
+  /*These tokens require additional extra bits for the magnitude.*/
+  /*OC_DCT_VAL_CAT5 (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 13,0),
+  OC_DCT_CW_PACK( 0, 0, 13,1),
+  /*OC_DCT_VAL_CAT6 (5 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 21,0),
+  OC_DCT_CW_PACK( 0, 0, 21,1),
+  /*OC_DCT_VAL_CAT7 (6 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 37,0),
+  OC_DCT_CW_PACK( 0, 0, 37,1),
+  /*OC_DCT_VAL_CAT8 (10 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 69,0),
+  OC_DCT_CW_PACK( 0, 0,325,0),
+  OC_DCT_CW_PACK( 0, 0, 69,1),
+  OC_DCT_CW_PACK( 0, 0,325,1),
+  /*These tokens require additional extra bits for the run length.*/
+  /*OC_DCT_RUN_CAT1C (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0,10, +1,0),
+  OC_DCT_CW_PACK( 0,10, -1,0),
+  /*OC_DCT_ZRL_TOKEN (6 extra bits)
+    Flip is set to distinguish this from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  /*The remaining tokens require no additional extra bits.*/
+  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 1, 0,  0,0),
+  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 2, 0,  0,0),
+  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 3, 0,  0,0),
+  /*OC_DCT_RUN_CAT1A (1 extra bit-1 already read)x5*/
+  OC_DCT_CW_PACK( 0, 1, +1,0),
+  OC_DCT_CW_PACK( 0, 1, -1,0),
+  OC_DCT_CW_PACK( 0, 2, +1,0),
+  OC_DCT_CW_PACK( 0, 2, -1,0),
+  OC_DCT_CW_PACK( 0, 3, +1,0),
+  OC_DCT_CW_PACK( 0, 3, -1,0),
+  OC_DCT_CW_PACK( 0, 4, +1,0),
+  OC_DCT_CW_PACK( 0, 4, -1,0),
+  OC_DCT_CW_PACK( 0, 5, +1,0),
+  OC_DCT_CW_PACK( 0, 5, -1,0),
+  /*OC_DCT_RUN_CAT2A (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 1, +2,0),
+  OC_DCT_CW_PACK( 0, 1, +3,0),
+  OC_DCT_CW_PACK( 0, 1, -2,0),
+  OC_DCT_CW_PACK( 0, 1, -3,0),
+  /*OC_DCT_RUN_CAT1B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 6, +1,0),
+  OC_DCT_CW_PACK( 0, 7, +1,0),
+  OC_DCT_CW_PACK( 0, 8, +1,0),
+  OC_DCT_CW_PACK( 0, 9, +1,0),
+  OC_DCT_CW_PACK( 0, 6, -1,0),
+  OC_DCT_CW_PACK( 0, 7, -1,0),
+  OC_DCT_CW_PACK( 0, 8, -1,0),
+  OC_DCT_CW_PACK( 0, 9, -1,0),
+  /*OC_DCT_RUN_CAT2B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 2, +2,0),
+  OC_DCT_CW_PACK( 0, 3, +2,0),
+  OC_DCT_CW_PACK( 0, 2, +3,0),
+  OC_DCT_CW_PACK( 0, 3, +3,0),
+  OC_DCT_CW_PACK( 0, 2, -2,0),
+  OC_DCT_CW_PACK( 0, 3, -2,0),
+  OC_DCT_CW_PACK( 0, 2, -3,0),
+  OC_DCT_CW_PACK( 0, 3, -3,0),
+  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits-3 already read)
+    Flip is set on the first one to distinguish it from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  OC_DCT_CW_PACK( 0, 1,  0,0),
+  OC_DCT_CW_PACK( 0, 2,  0,0),
+  OC_DCT_CW_PACK( 0, 3,  0,0),
+  OC_DCT_CW_PACK( 0, 4,  0,0),
+  OC_DCT_CW_PACK( 0, 5,  0,0),
+  OC_DCT_CW_PACK( 0, 6,  0,0),
+  OC_DCT_CW_PACK( 0, 7,  0,0),
+  /*OC_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +1,0),
+  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -1,0),
+  /*OC_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +2,0),
+  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -2,0),
+  /*OC_DCT_VAL_CAT2 (1 extra bit-1 already read)x4*/
+  OC_DCT_CW_PACK( 0, 0, +3,0),
+  OC_DCT_CW_PACK( 0, 0, -3,0),
+  OC_DCT_CW_PACK( 0, 0, +4,0),
+  OC_DCT_CW_PACK( 0, 0, -4,0),
+  OC_DCT_CW_PACK( 0, 0, +5,0),
+  OC_DCT_CW_PACK( 0, 0, -5,0),
+  OC_DCT_CW_PACK( 0, 0, +6,0),
+  OC_DCT_CW_PACK( 0, 0, -6,0),
+  /*OC_DCT_VAL_CAT3 (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +7,0),
+  OC_DCT_CW_PACK( 0, 0, +8,0),
+  OC_DCT_CW_PACK( 0, 0, -7,0),
+  OC_DCT_CW_PACK( 0, 0, -8,0),
+  /*OC_DCT_VAL_CAT4 (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +9,0),
+  OC_DCT_CW_PACK( 0, 0,+10,0),
+  OC_DCT_CW_PACK( 0, 0,+11,0),
+  OC_DCT_CW_PACK( 0, 0,+12,0),
+  OC_DCT_CW_PACK( 0, 0, -9,0),
+  OC_DCT_CW_PACK( 0, 0,-10,0),
+  OC_DCT_CW_PACK( 0, 0,-11,0),
+  OC_DCT_CW_PACK( 0, 0,-12,0),
+  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 8, 0,  0,0),
+  OC_DCT_CW_PACK( 9, 0,  0,0),
+  OC_DCT_CW_PACK(10, 0,  0,0),
+  OC_DCT_CW_PACK(11, 0,  0,0),
+  OC_DCT_CW_PACK(12, 0,  0,0),
+  OC_DCT_CW_PACK(13, 0,  0,0),
+  OC_DCT_CW_PACK(14, 0,  0,0),
+  OC_DCT_CW_PACK(15, 0,  0,0),
+  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 4, 0,  0,0),
+  OC_DCT_CW_PACK( 5, 0,  0,0),
+  OC_DCT_CW_PACK( 6, 0,  0,0),
+  OC_DCT_CW_PACK( 7, 0,  0,0),
+};
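
The fields packed by OC_DCT_CW_PACK come back out with the same shift-and-mask steps used later in the DC/AC token unpackers: the run length is the low byte, the EOB count sits at bit 8, and the magnitude is recovered by an arithmetic right shift after the flip bit conditionally negates it. A small self-contained check that mirrors those constants (it does not need the rest of the library, and like the decoder it assumes a sign-extending right shift):

/* Worked example of the code-word layout; constants mirror the ones above. */
#include <stdio.h>
#include <ogg/ogg.h>

#define CW_RLEN_SHIFT (0)
#define CW_EOB_SHIFT  (8)
#define CW_FLIP_BIT   (20)
#define CW_MAG_SHIFT  (21)
#define CW_PACK(_eobs,_rlen,_mag,_flip) \
 ((_eobs)<<CW_EOB_SHIFT|(_rlen)<<CW_RLEN_SHIFT| \
 (_flip)<<CW_FLIP_BIT|(_mag)-(_flip)<<CW_MAG_SHIFT)

int main(void){
  /*OC_DCT_VAL_CAT5, negative variant: magnitude 13, flip set.*/
  ogg_int32_t cw=CW_PACK(0,0,13,1);
  int eobs=cw>>CW_EOB_SHIFT&0xFFF;
  int rlen=(unsigned char)(cw>>CW_RLEN_SHIFT);
  cw^=-(cw&1<<CW_FLIP_BIT);      /*Conditionally negate the magnitude.*/
  printf("eobs=%d rlen=%d coeff=%d\n",eobs,rlen,(int)(cw>>CW_MAG_SHIFT));
  /*Prints: eobs=0 rlen=0 coeff=-13*/
  return 0;
}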
+
+
+
+static int oc_sb_run_unpack(oc_pack_buf *_opb){
+  /*Coding scheme:
+       Codeword            Run Length
+     0                       1
+     10x                     2-3
+     110x                    4-5
+     1110xx                  6-9
+     11110xxx                10-17
+     111110xxxx              18-33
+     111111xxxxxxxxxxxx      34-4129*/
+  static const ogg_int16_t OC_SB_RUN_TREE[22]={
+    4,
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(3<<8|2),-(3<<8|2),-(3<<8|3),-(3<<8|3),
+     -(4<<8|4),-(4<<8|5),-(4<<8|2<<4|6-6),17,
+      2,
+       -(2<<8|2<<4|10-6),-(2<<8|2<<4|14-6),-(2<<8|4<<4|18-6),-(2<<8|12<<4|34-6)
+  };
+  int ret;
+  ret=oc_huff_token_decode(_opb,OC_SB_RUN_TREE);
+  if(ret>=0x10){
+    int offs;
+    offs=ret&0x1F;
+    ret=6+offs+(int)oc_pack_read(_opb,ret-offs>>4);
+  }
+  return ret;
+}
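
The tree above is just a collapsed form of the prefix code described in the comment; reading that code one bit at a time gives the same run lengths. A standalone sketch of the prefix scheme itself, using a toy bit reader rather than the library's oc_pack interface:

/* Decodes the super-block run-length prefix code from the comment above,
   using a toy bit reader over an in-memory bit string. */
#include <stdio.h>

typedef struct{const unsigned char *bits;int pos;}toy_bits;

static int toy_read1(toy_bits *_b){return _b->bits[_b->pos++];}

static int toy_read(toy_bits *_b,int _n){
  int v=0;
  while(_n-->0)v=v<<1|toy_read1(_b);
  return v;
}

/*Same scheme as oc_sb_run_unpack(): 0 -> 1, 10x -> 2-3, 110x -> 4-5,
   1110xx -> 6-9, 11110xxx -> 10-17, 111110xxxx -> 18-33,
   111111xxxxxxxxxxxx -> 34-4129.*/
static int toy_sb_run(toy_bits *_b){
  static const int base[7]  ={1,2,4,6,10,18,34};
  static const int nextra[7]={0,1,1,2,3,4,12};
  int ones=0;
  while(ones<6&&toy_read1(_b))ones++;
  return base[ones]+toy_read(_b,nextra[ones]);
}

int main(void){
  /*"1110 01" should decode to a run of 6+1=7.*/
  static const unsigned char code[]={1,1,1,0,0,1};
  toy_bits b={code,0};
  printf("run=%d\n",toy_sb_run(&b));
  return 0;
}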
+
+static int oc_block_run_unpack(oc_pack_buf *_opb){
+  /*Coding scheme:
+     Codeword             Run Length
+     0x                      1-2
+     10x                     3-4
+     110x                    5-6
+     1110xx                  7-10
+     11110xx                 11-14
+     11111xxxx               15-30*/
+  static const ogg_int16_t OC_BLOCK_RUN_TREE[61]={
+    5,
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(3<<8|3),-(3<<8|3),-(3<<8|3),-(3<<8|3),
+     -(3<<8|4),-(3<<8|4),-(3<<8|4),-(3<<8|4),
+     -(4<<8|5),-(4<<8|5),-(4<<8|6),-(4<<8|6),
+     33,       36,       39,       44,
+      1,-(1<<8|7),-(1<<8|8),
+      1,-(1<<8|9),-(1<<8|10),
+      2,-(2<<8|11),-(2<<8|12),-(2<<8|13),-(2<<8|14),
+      4,
+       -(4<<8|15),-(4<<8|16),-(4<<8|17),-(4<<8|18),
+       -(4<<8|19),-(4<<8|20),-(4<<8|21),-(4<<8|22),
+       -(4<<8|23),-(4<<8|24),-(4<<8|25),-(4<<8|26),
+       -(4<<8|27),-(4<<8|28),-(4<<8|29),-(4<<8|30)
+  };
+  return oc_huff_token_decode(_opb,OC_BLOCK_RUN_TREE);
+}
+
+
+
+void oc_dec_accel_init_c(oc_dec_ctx *_dec){
+# if defined(OC_DEC_USE_VTABLE)
+  _dec->opt_vtable.dc_unpredict_mcu_plane=
+   oc_dec_dc_unpredict_mcu_plane_c;
+# endif
+}
+
+static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
+ const th_setup_info *_setup){
+  int qti;
+  int pli;
+  int qi;
+  int ret;
+  ret=oc_state_init(&_dec->state,_info,3);
+  if(ret<0)return ret;
+  ret=oc_huff_trees_copy(_dec->huff_tables,
+   (const ogg_int16_t *const *)_setup->huff_tables);
+  if(ret<0){
+    oc_state_clear(&_dec->state);
+    return ret;
+  }
+  /*For each fragment, allocate one byte for every DCT coefficient token, plus
+     one byte for extra-bits for each token, plus one more byte for the long
+     EOB run, just in case it's the very last token and has a run length of
+     one.*/
+  _dec->dct_tokens=(unsigned char *)_ogg_malloc((64+64+1)*
+   _dec->state.nfrags*sizeof(_dec->dct_tokens[0]));
+  if(_dec->dct_tokens==NULL){
+    oc_huff_trees_clear(_dec->huff_tables);
+    oc_state_clear(&_dec->state);
+    return TH_EFAULT;
+  }
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _dec->state.dequant_tables[qi][pli][qti]=
+     _dec->state.dequant_table_data[qi][pli][qti];
+  }
+  oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
+   &_setup->qinfo);
+  for(qi=0;qi<64;qi++){
+    int qsum;
+    qsum=0;
+    for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+      qsum+=_dec->state.dequant_tables[qi][pli][qti][12]+
+       _dec->state.dequant_tables[qi][pli][qti][17]+
+       _dec->state.dequant_tables[qi][pli][qti][18]+
+       _dec->state.dequant_tables[qi][pli][qti][24]<<(pli==0);
+    }
+    _dec->pp_sharp_mod[qi]=-(qsum>>11);
+  }
+  memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
+   sizeof(_dec->state.loop_filter_limits));
+  oc_dec_accel_init(_dec);
+  _dec->pp_level=OC_PP_LEVEL_DISABLED;
+  _dec->dc_qis=NULL;
+  _dec->variances=NULL;
+  _dec->pp_frame_data=NULL;
+  _dec->stripe_cb.ctx=NULL;
+  _dec->stripe_cb.stripe_decoded=NULL;
+#if defined(HAVE_CAIRO)
+  _dec->telemetry=0;
+  _dec->telemetry_bits=0;
+  _dec->telemetry_qi=0;
+  _dec->telemetry_mbmode=0;
+  _dec->telemetry_mv=0;
+  _dec->telemetry_frame_data=NULL;
+#endif
+  return 0;
+}
+
+static void oc_dec_clear(oc_dec_ctx *_dec){
+#if defined(HAVE_CAIRO)
+  _ogg_free(_dec->telemetry_frame_data);
+#endif
+  _ogg_free(_dec->pp_frame_data);
+  _ogg_free(_dec->variances);
+  _ogg_free(_dec->dc_qis);
+  _ogg_free(_dec->dct_tokens);
+  oc_huff_trees_clear(_dec->huff_tables);
+  oc_state_clear(&_dec->state);
+}
+
+
+static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
+  long val;
+  /*Check to make sure this is a data packet.*/
+  val=oc_pack_read1(&_dec->opb);
+  if(val!=0)return TH_EBADPACKET;
+  /*Read in the frame type (I or P).*/
+  val=oc_pack_read1(&_dec->opb);
+  _dec->state.frame_type=(int)val;
+  /*Read in the qi list.*/
+  val=oc_pack_read(&_dec->opb,6);
+  _dec->state.qis[0]=(unsigned char)val;
+  val=oc_pack_read1(&_dec->opb);
+  if(!val)_dec->state.nqis=1;
+  else{
+    val=oc_pack_read(&_dec->opb,6);
+    _dec->state.qis[1]=(unsigned char)val;
+    val=oc_pack_read1(&_dec->opb);
+    if(!val)_dec->state.nqis=2;
+    else{
+      val=oc_pack_read(&_dec->opb,6);
+      _dec->state.qis[2]=(unsigned char)val;
+      _dec->state.nqis=3;
+    }
+  }
+  if(_dec->state.frame_type==OC_INTRA_FRAME){
+    /*Keyframes have 3 unused configuration bits, holdovers from VP3 days.
+      Most of the other unused bits in the VP3 headers were eliminated.
+      I don't know why these remain.*/
+    /*I wanted to eliminate wasted bits, but not all config wiggle room
+       --Monty.*/
+    val=oc_pack_read(&_dec->opb,3);
+    if(val!=0)return TH_EIMPL;
+  }
+  return 0;
+}
+
+/*Mark all fragments as coded and in OC_MODE_INTRA.
+  This also builds up the coded fragment list (in coded order), and clears the
+   uncoded fragment list.
+  It does not update the coded macro block list nor the super block flags, as
+   those are not used when decoding INTRA frames.*/
+static void oc_dec_mark_all_intra(oc_dec_ctx *_dec){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  oc_fragment       *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          prev_ncoded_fragis;
+  unsigned           nsbs;
+  unsigned           sbi;
+  int                pli;
+  coded_fragis=_dec->state.coded_fragis;
+  prev_ncoded_fragis=ncoded_fragis=0;
+  sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
+  sb_flags=_dec->state.sb_flags;
+  frags=_dec->state.frags;
+  sbi=nsbs=0;
+  for(pli=0;pli<3;pli++){
+    nsbs+=_dec->state.fplanes[pli].nsbs;
+    for(;sbi<nsbs;sbi++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int bi;
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          fragi=sb_maps[sbi][quadi][bi];
+          if(fragi>=0){
+            frags[fragi].coded=1;
+            frags[fragi].refi=OC_FRAME_SELF;
+            frags[fragi].mb_mode=OC_MODE_INTRA;
+            coded_fragis[ncoded_fragis++]=fragi;
+          }
+        }
+      }
+    }
+    _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
+    prev_ncoded_fragis=ncoded_fragis;
+  }
+  _dec->state.ntotal_coded_fragis=ncoded_fragis;
+}
+
+/*Decodes the bit flags indicating whether each super block is partially coded
+   or not.
+  Return: The number of partially coded super blocks.*/
+static unsigned oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){
+  oc_sb_flags *sb_flags;
+  unsigned     nsbs;
+  unsigned     sbi;
+  unsigned     npartial;
+  unsigned     run_count;
+  long         val;
+  int          flag;
+  val=oc_pack_read1(&_dec->opb);
+  flag=(int)val;
+  sb_flags=_dec->state.sb_flags;
+  nsbs=_dec->state.nsbs;
+  sbi=npartial=0;
+  while(sbi<nsbs){
+    int full_run;
+    run_count=oc_sb_run_unpack(&_dec->opb);
+    full_run=run_count>=4129;
+    do{
+      sb_flags[sbi].coded_partially=flag;
+      sb_flags[sbi].coded_fully=0;
+      npartial+=flag;
+      sbi++;
+    }
+    while(--run_count>0&&sbi<nsbs);
+    if(full_run&&sbi<nsbs){
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+    }
+    else flag=!flag;
+  }
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+  return npartial;
+}
+
+/*Decodes the bit flags for whether or not each non-partially-coded super
+   block is fully coded or not.
+  This function should only be called if there is at least one
+   non-partially-coded super block.*/
+static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){
+  oc_sb_flags *sb_flags;
+  unsigned     nsbs;
+  unsigned     sbi;
+  unsigned     run_count;
+  long         val;
+  int          flag;
+  sb_flags=_dec->state.sb_flags;
+  nsbs=_dec->state.nsbs;
+  /*Skip partially coded super blocks.*/
+  for(sbi=0;sb_flags[sbi].coded_partially;sbi++);
+  val=oc_pack_read1(&_dec->opb);
+  flag=(int)val;
+  do{
+    int full_run;
+    run_count=oc_sb_run_unpack(&_dec->opb);
+    full_run=run_count>=4129;
+    for(;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially)continue;
+      if(run_count--<=0)break;
+      sb_flags[sbi].coded_fully=flag;
+    }
+    if(full_run&&sbi<nsbs){
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+    }
+    else flag=!flag;
+  }
+  while(sbi<nsbs);
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+}
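
Both flag unpackers above use the same toggling run-length scheme: one flag bit is read up front, each run assigns that value to the next run_count blocks, and afterwards the flag simply toggles, except after a maximal run (4129) where a fresh flag bit is read because the same value may continue. A small sketch of that scheme with canned inputs standing in for oc_pack_read1() and oc_sb_run_unpack():

/* Sketch of the toggling run-length scheme used for the super-block flags.
   read_flag_bit() and read_run() replay canned values instead of reading
   from a real pack buffer. */
#include <stdio.h>

static const int canned_flags[]={1};      /*Initial flag bit.*/
static const int canned_runs[] ={3,2,4};  /*Run lengths.*/
static int flag_pos,run_pos;
static int read_flag_bit(void){return canned_flags[flag_pos++];}
static int read_run(void){return canned_runs[run_pos++];}

int main(void){
  int flags[9];
  int nflags=9;
  int i=0;
  int flag=read_flag_bit();
  while(i<nflags){
    int run=read_run();
    int full_run=run>=4129;
    while(run-->0&&i<nflags)flags[i++]=flag;
    /*A maximal run means the value may continue, so re-read the flag;
       otherwise the next run is guaranteed to start with the opposite value.*/
    if(full_run&&i<nflags)flag=read_flag_bit();
    else flag=!flag;
  }
  for(i=0;i<nflags;i++)printf("%d",flags[i]);
  putchar('\n');                          /*Prints 111001111.*/
  return 0;
}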
+
+static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  signed char       *mb_modes;
+  oc_fragment       *frags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  long               val;
+  int                pli;
+  int                flag;
+  int                run_count;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t         *uncoded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          nuncoded_fragis;
+  ptrdiff_t          prev_ncoded_fragis;
+  npartial=oc_dec_partial_sb_flags_unpack(_dec);
+  if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
+  if(npartial>0){
+    val=oc_pack_read1(&_dec->opb);
+    flag=!(int)val;
+  }
+  else flag=0;
+  sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
+  sb_flags=_dec->state.sb_flags;
+  mb_modes=_dec->state.mb_modes;
+  frags=_dec->state.frags;
+  sbi=nsbs=run_count=0;
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
+  prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0;
+  for(pli=0;pli<3;pli++){
+    nsbs+=_dec->state.fplanes[pli].nsbs;
+    for(;sbi<nsbs;sbi++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int quad_coded;
+        int bi;
+        quad_coded=0;
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          fragi=sb_maps[sbi][quadi][bi];
+          if(fragi>=0){
+            int coded;
+            if(sb_flags[sbi].coded_fully)coded=1;
+            else if(!sb_flags[sbi].coded_partially)coded=0;
+            else{
+              if(run_count<=0){
+                run_count=oc_block_run_unpack(&_dec->opb);
+                flag=!flag;
+              }
+              run_count--;
+              coded=flag;
+            }
+            if(coded)coded_fragis[ncoded_fragis++]=fragi;
+            else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            quad_coded|=coded;
+            frags[fragi].coded=coded;
+            frags[fragi].refi=OC_FRAME_NONE;
+          }
+        }
+        /*Remember if there's a coded luma block in this macro block.*/
+        if(!pli)mb_modes[sbi<<2|quadi]=quad_coded;
+      }
+    }
+    _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
+    prev_ncoded_fragis=ncoded_fragis;
+  }
+  _dec->state.ntotal_coded_fragis=ncoded_fragis;
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+}
+
+
+/*Coding scheme:
+   Codeword            Mode Index
+   0                       0
+   10                      1
+   110                     2
+   1110                    3
+   11110                   4
+   111110                  5
+   1111110                 6
+   1111111                 7*/
+static const ogg_int16_t OC_VLC_MODE_TREE[26]={
+  4,
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+   -(3<<8|2),-(3<<8|2),-(4<<8|3),17,
+    3,
+     -(1<<8|4),-(1<<8|4),-(1<<8|4),-(1<<8|4),
+     -(2<<8|5),-(2<<8|5),-(3<<8|6),-(3<<8|7)
+};
+
+static const ogg_int16_t OC_CLC_MODE_TREE[9]={
+  3,
+   -(3<<8|0),-(3<<8|1),-(3<<8|2),-(3<<8|3),
+   -(3<<8|4),-(3<<8|5),-(3<<8|6),-(3<<8|7)
+};
+
+/*Unpacks the list of macro block modes for INTER frames.*/
+static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
+  signed char         *mb_modes;
+  const unsigned char *alphabet;
+  unsigned char        scheme0_alphabet[8];
+  const ogg_int16_t   *mode_tree;
+  size_t               nmbs;
+  size_t               mbi;
+  long                 val;
+  int                  mode_scheme;
+  val=oc_pack_read(&_dec->opb,3);
+  mode_scheme=(int)val;
+  if(mode_scheme==0){
+    int mi;
+    /*Just in case, initialize the modes to something.
+      If the bitstream doesn't contain each index exactly once, it's likely
+       corrupt and the rest of the packet is garbage anyway, but this way we
+       won't crash, and we'll decode SOMETHING.*/
+    /*LOOP VECTORIZES*/
+    for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
+    for(mi=0;mi<OC_NMODES;mi++){
+      val=oc_pack_read(&_dec->opb,3);
+      scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
+    }
+    alphabet=scheme0_alphabet;
+  }
+  else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
+  mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE;
+  mb_modes=_dec->state.mb_modes;
+  nmbs=_dec->state.nmbs;
+  for(mbi=0;mbi<nmbs;mbi++){
+    if(mb_modes[mbi]>0){
+      /*We have a coded luma block; decode a mode.*/
+      mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
+    }
+    /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the
+       fact that OC_MODE_INTER_NOMV is already 0.*/
+  }
+}
+
+
+
+static const ogg_int16_t OC_VLC_MV_COMP_TREE[101]={
+  5,
+   -(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),
+   -(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),
+   -(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),
+   -(4<<8|32+2),-(4<<8|32+2),-(4<<8|32-2),-(4<<8|32-2),
+   -(4<<8|32+3),-(4<<8|32+3),-(4<<8|32-3),-(4<<8|32-3),
+   33,          36,          39,          42,
+   45,          50,          55,          60,
+   65,          74,          83,          92,
+    1,-(1<<8|32+4),-(1<<8|32-4),
+    1,-(1<<8|32+5),-(1<<8|32-5),
+    1,-(1<<8|32+6),-(1<<8|32-6),
+    1,-(1<<8|32+7),-(1<<8|32-7),
+    2,-(2<<8|32+8),-(2<<8|32-8),-(2<<8|32+9),-(2<<8|32-9),
+    2,-(2<<8|32+10),-(2<<8|32-10),-(2<<8|32+11),-(2<<8|32-11),
+    2,-(2<<8|32+12),-(2<<8|32-12),-(2<<8|32+13),-(2<<8|32-13),
+    2,-(2<<8|32+14),-(2<<8|32-14),-(2<<8|32+15),-(2<<8|32-15),
+    3,
+     -(3<<8|32+16),-(3<<8|32-16),-(3<<8|32+17),-(3<<8|32-17),
+     -(3<<8|32+18),-(3<<8|32-18),-(3<<8|32+19),-(3<<8|32-19),
+    3,
+     -(3<<8|32+20),-(3<<8|32-20),-(3<<8|32+21),-(3<<8|32-21),
+     -(3<<8|32+22),-(3<<8|32-22),-(3<<8|32+23),-(3<<8|32-23),
+    3,
+     -(3<<8|32+24),-(3<<8|32-24),-(3<<8|32+25),-(3<<8|32-25),
+     -(3<<8|32+26),-(3<<8|32-26),-(3<<8|32+27),-(3<<8|32-27),
+    3,
+     -(3<<8|32+28),-(3<<8|32-28),-(3<<8|32+29),-(3<<8|32-29),
+     -(3<<8|32+30),-(3<<8|32-30),-(3<<8|32+31),-(3<<8|32-31)
+};
+
+static const ogg_int16_t OC_CLC_MV_COMP_TREE[65]={
+  6,
+   -(6<<8|32 +0),-(6<<8|32 -0),-(6<<8|32 +1),-(6<<8|32 -1),
+   -(6<<8|32 +2),-(6<<8|32 -2),-(6<<8|32 +3),-(6<<8|32 -3),
+   -(6<<8|32 +4),-(6<<8|32 -4),-(6<<8|32 +5),-(6<<8|32 -5),
+   -(6<<8|32 +6),-(6<<8|32 -6),-(6<<8|32 +7),-(6<<8|32 -7),
+   -(6<<8|32 +8),-(6<<8|32 -8),-(6<<8|32 +9),-(6<<8|32 -9),
+   -(6<<8|32+10),-(6<<8|32-10),-(6<<8|32+11),-(6<<8|32-11),
+   -(6<<8|32+12),-(6<<8|32-12),-(6<<8|32+13),-(6<<8|32-13),
+   -(6<<8|32+14),-(6<<8|32-14),-(6<<8|32+15),-(6<<8|32-15),
+   -(6<<8|32+16),-(6<<8|32-16),-(6<<8|32+17),-(6<<8|32-17),
+   -(6<<8|32+18),-(6<<8|32-18),-(6<<8|32+19),-(6<<8|32-19),
+   -(6<<8|32+20),-(6<<8|32-20),-(6<<8|32+21),-(6<<8|32-21),
+   -(6<<8|32+22),-(6<<8|32-22),-(6<<8|32+23),-(6<<8|32-23),
+   -(6<<8|32+24),-(6<<8|32-24),-(6<<8|32+25),-(6<<8|32-25),
+   -(6<<8|32+26),-(6<<8|32-26),-(6<<8|32+27),-(6<<8|32-27),
+   -(6<<8|32+28),-(6<<8|32-28),-(6<<8|32+29),-(6<<8|32-29),
+   -(6<<8|32+30),-(6<<8|32-30),-(6<<8|32+31),-(6<<8|32-31)
+};
+
+
+static oc_mv oc_mv_unpack(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  int dx;
+  int dy;
+  dx=oc_huff_token_decode(_opb,_tree)-32;
+  dy=oc_huff_token_decode(_opb,_tree)-32;
+  return OC_MV(dx,dy);
+}
+
+/*Unpacks the list of motion vectors for INTER frames, and propagates the macro
+   block modes and motion vectors to the individual fragments.*/
+static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){
+  const oc_mb_map        *mb_maps;
+  const signed char      *mb_modes;
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  const ogg_int16_t      *mv_comp_tree;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  const unsigned char    *map_idxs;
+  int                     map_nidxs;
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
+  oc_mv                   cbmvs[4];
+  size_t                  nmbs;
+  size_t                  mbi;
+  long                    val;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
+  val=oc_pack_read1(&_dec->opb);
+  mv_comp_tree=val?OC_CLC_MV_COMP_TREE:OC_VLC_MV_COMP_TREE;
+  map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
+  prior_mv=last_mv=0;
+  frags=_dec->state.frags;
+  frag_mvs=_dec->state.frag_mvs;
+  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
+  mb_modes=_dec->state.mb_modes;
+  nmbs=_dec->state.nmbs;
+  for(mbi=0;mbi<nmbs;mbi++){
+    int mb_mode;
+    mb_mode=mb_modes[mbi];
+    if(mb_mode!=OC_MODE_INVALID){
+      oc_mv     mbmv;
+      ptrdiff_t fragi;
+      int       mapi;
+      int       mapii;
+      int       refi;
+      if(mb_mode==OC_MODE_INTER_MV_FOUR){
+        oc_mv lbmvs[4];
+        int   bi;
+        prior_mv=last_mv;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+            frag_mvs[fragi]=lbmvs[bi];
+          }
+          else lbmvs[bi]=0;
+        }
+        (*set_chroma_mvs)(cbmvs,lbmvs);
+        for(mapii=4;mapii<map_nidxs;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][mapi>>2][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            frag_mvs[fragi]=cbmvs[bi];
+          }
+        }
+      }
+      else{
+        switch(mb_mode){
+          case OC_MODE_INTER_MV:{
+            prior_mv=last_mv;
+            last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
+          case OC_MODE_INTER_MV_LAST2:{
+            mbmv=prior_mv;
+            prior_mv=last_mv;
+            last_mv=mbmv;
+          }break;
+          case OC_MODE_GOLDEN_MV:{
+            mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          default:mbmv=0;break;
+        }
+        /*Fill in the MVs for the fragments.*/
+        refi=OC_FRAME_FOR_MODE(mb_mode);
+        mapii=0;
+        do{
+          mapi=map_idxs[mapii];
+          fragi=mb_maps[mbi][mapi>>2][mapi&3];
+          if(frags[fragi].coded){
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mbmv;
+          }
+        }
+        while(++mapii<map_nidxs);
+      }
+    }
+  }
+}
+
+static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
+  oc_fragment     *frags;
+  const ptrdiff_t *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  ptrdiff_t        fragii;
+  ptrdiff_t        fragi;
+  ncoded_fragis=_dec->state.ntotal_coded_fragis;
+  if(ncoded_fragis<=0)return;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  if(_dec->state.nqis==1){
+    /*If this frame has only a single qi value, then just use it for all coded
+       fragments.*/
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      frags[coded_fragis[fragii]].qii=0;
+    }
+  }
+  else{
+    long val;
+    int  flag;
+    int  nqi1;
+    int  run_count;
+    /*Otherwise, we decode a qi index for each fragment, using two passes of
+      the same binary RLE scheme used for super-block coded bits.
+     The first pass marks each fragment as having a qii of 0 or greater than
+      0, and the second pass (if necessary) distinguishes between a qii of
+      1 and 2.
+     At first we just store the qii in the fragment.
+     After all the qii's are decoded, we make a final pass to replace them
+      with the corresponding qi's for this frame.*/
+    val=oc_pack_read1(&_dec->opb);
+    flag=(int)val;
+    nqi1=0;
+    fragii=0;
+    while(fragii<ncoded_fragis){
+      int full_run;
+      run_count=oc_sb_run_unpack(&_dec->opb);
+      full_run=run_count>=4129;
+      do{
+        frags[coded_fragis[fragii++]].qii=flag;
+        nqi1+=flag;
+      }
+      while(--run_count>0&&fragii<ncoded_fragis);
+      if(full_run&&fragii<ncoded_fragis){
+        val=oc_pack_read1(&_dec->opb);
+        flag=(int)val;
+      }
+      else flag=!flag;
+    }
+    /*TODO: run_count should be 0 here.
+      If it's not, we should issue a warning of some kind.*/
+    /*If we have 3 different qi's for this frame, and there was at least one
+       fragment with a non-zero qi, make the second pass.*/
+    if(_dec->state.nqis==3&&nqi1>0){
+      /*Skip qii==0 fragments.*/
+      for(fragii=0;frags[coded_fragis[fragii]].qii==0;fragii++);
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+      do{
+        int full_run;
+        run_count=oc_sb_run_unpack(&_dec->opb);
+        full_run=run_count>=4129;
+        for(;fragii<ncoded_fragis;fragii++){
+          fragi=coded_fragis[fragii];
+          if(frags[fragi].qii==0)continue;
+          if(run_count--<=0)break;
+          frags[fragi].qii+=flag;
+        }
+        if(full_run&&fragii<ncoded_fragis){
+          val=oc_pack_read1(&_dec->opb);
+          flag=(int)val;
+        }
+        else flag=!flag;
+      }
+      while(fragii<ncoded_fragis);
+      /*TODO: run_count should be 0 here.
+        If it's not, we should issue a warning of some kind.*/
+    }
+  }
+}
+
+
+
+/*Unpacks the DC coefficient tokens.
+  Unlike when unpacking the AC coefficient tokens, we actually need to decode
+   the DC coefficient values now so that we can do DC prediction.
+  _huff_idx:   The index of the Huffman table to use for each color plane.
+  _ntoks_left: The number of tokens left to be decoded in each color plane for
+                each coefficient.
+               This is updated as EOB tokens and zero run tokens are decoded.
+  Return: The length of any outstanding EOB run.*/
+static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[2],
+ ptrdiff_t _ntoks_left[3][64]){
+  unsigned char   *dct_tokens;
+  oc_fragment     *frags;
+  const ptrdiff_t *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  ptrdiff_t        fragii;
+  ptrdiff_t        eobs;
+  ptrdiff_t        ti;
+  int              pli;
+  dct_tokens=_dec->dct_tokens;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  ncoded_fragis=fragii=eobs=ti=0;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    ptrdiff_t eobi;
+    int       rli;
+    ncoded_fragis+=_dec->state.ncoded_fragis[pli];
+    memset(run_counts,0,sizeof(run_counts));
+    _dec->eob_runs[pli][0]=eobs;
+    _dec->ti0[pli][0]=ti;
+    /*Continue any previous EOB run, if there was one.*/
+    eobi=eobs;
+    if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+    eob_count=eobi;
+    eobs-=eobi;
+    while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+    while(fragii<ncoded_fragis){
+      int token;
+      int cw;
+      int eb;
+      int skip;
+      token=oc_huff_token_decode(&_dec->opb,
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
+      }
+      else eb=0;
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+      if(cw==OC_DCT_CW_FINISH)eobs=OC_DCT_EOB_FINISH;
+      if(eobs){
+        eobi=OC_MINI(eobs,ncoded_fragis-fragii);
+        eob_count+=eobi;
+        eobs-=eobi;
+        while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+      }
+      else{
+        int coeff;
+        skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
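+        /*The flip bit marks a negative coefficient; the XOR applies the sign
+           to the magnitude field in one step.*/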
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        if(skip)coeff=0;
+        run_counts[skip]++;
+        frags[coded_fragis[fragii++]].dc=coeff;
+      }
+    }
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
+    /*And convert the run_counts array to a moment table.*/
+    for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+    /*Finally, subtract off the number of coefficients that have been
+       accounted for by runs started in this coefficient.*/
+    for(rli=64;rli-->0;)_ntoks_left[pli][rli]-=run_counts[rli];
+  }
+  _dec->dct_tokens_count=ti;
+  return eobs;
+}
+
+/*Unpacks the AC coefficient tokens.
+  This can completely discard coefficient values while unpacking, and so is
+   somewhat simpler than unpacking the DC coefficient tokens.
+  _huff_idxs:  The indices of the Huffman tables to use for each color plane.
+  _ntoks_left: The number of tokens left to be decoded in each color plane for
+                each coefficient.
+               This is updated as EOB tokens and zero run tokens are decoded.
+  _eobs:       The length of any outstanding EOB run from previous
+                coefficients.
+  Return: The length of any outstanding EOB run.*/
+static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[2],
+ ptrdiff_t _ntoks_left[3][64],ptrdiff_t _eobs){
+  unsigned char *dct_tokens;
+  ptrdiff_t      ti;
+  int            pli;
+  dct_tokens=_dec->dct_tokens;
+  ti=_dec->dct_tokens_count;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    size_t    ntoks_left;
+    size_t    ntoks;
+    int       rli;
+    _dec->eob_runs[pli][_zzi]=_eobs;
+    _dec->ti0[pli][_zzi]=ti;
+    ntoks_left=_ntoks_left[pli][_zzi];
+    memset(run_counts,0,sizeof(run_counts));
+    eob_count=0;
+    ntoks=0;
+    while(ntoks+_eobs<ntoks_left){
+      int token;
+      int cw;
+      int eb;
+      int skip;
+      ntoks+=_eobs;
+      eob_count+=_eobs;
+      token=oc_huff_token_decode(&_dec->opb,
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
+      }
+      else eb=0;
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+      _eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+      if(cw==OC_DCT_CW_FINISH)_eobs=OC_DCT_EOB_FINISH;
+      if(_eobs==0){
+        run_counts[skip]++;
+        ntoks++;
+      }
+    }
+    /*Add the portion of the last EOB run actually used by this coefficient.*/
+    eob_count+=ntoks_left-ntoks;
+    /*And remove it from the remaining EOB count.*/
+    _eobs-=ntoks_left-ntoks;
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
+    /*And convert the run_counts array to a moment table.*/
+    for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+    /*Finally, subtract off the number of coefficients that have been
+       accounted for by runs started in this coefficient.*/
+    for(rli=64-_zzi;rli-->0;)_ntoks_left[pli][_zzi+rli]-=run_counts[rli];
+  }
+  _dec->dct_tokens_count=ti;
+  return _eobs;
+}
+
+/*Tokens describing the DCT coefficients that belong to each fragment are
+   stored in the bitstream grouped by coefficient, not by fragment.
+
+  This means that we either decode all the tokens in order, building up a
+   separate coefficient list for each fragment as we go, and then go back and
+   do the iDCT on each fragment, or we have to create separate lists of tokens
+   for each coefficient, so that we can pull the next token required off the
+   head of the appropriate list when decoding a specific fragment.
+
+  The former was VP3's choice, and it meant 2*w*h extra storage for all the
+   decoded coefficient values.
+
+  We take the second option, which lets us store just one to three bytes per
+   token (generally far fewer than the number of coefficients, due to EOB
+   tokens and zero runs), and which requires us to only maintain a counter for
+   each of the 64 coefficients, instead of a counter for every fragment to
+   determine where the next token goes.
+
+  We actually use 3 counters per coefficient, one for each color plane, so we
+   can decode all color planes simultaneously.
+  This lets color conversion, etc., be done as soon as a full MCU (one or
+   two super block rows) is decoded, while the image data is still in cache.*/
+
+static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){
+  static const unsigned char OC_HUFF_LIST_MAX[5]={1,6,15,28,64};
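+  /*Coefficient group boundaries: group 0 is the DC coefficient; groups 1
+     through 4 cover zig-zag indices 1..5, 6..14, 15..27 and 28..63.*/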
+  ptrdiff_t  ntoks_left[3][64];
+  int        huff_idxs[2];
+  ptrdiff_t  eobs;
+  long       val;
+  int        pli;
+  int        zzi;
+  int        hgi;
+  for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
+    ntoks_left[pli][zzi]=_dec->state.ncoded_fragis[pli];
+  }
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[0]=(int)val;
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[1]=(int)val;
+  _dec->eob_runs[0][0]=0;
+  eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left);
+#if defined(HAVE_CAIRO)
+  _dec->telemetry_dc_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[0]=(int)val;
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[1]=(int)val;
+  zzi=1;
+  for(hgi=1;hgi<5;hgi++){
+    huff_idxs[0]+=16;
+    huff_idxs[1]+=16;
+    for(;zzi<OC_HUFF_LIST_MAX[hgi];zzi++){
+      eobs=oc_dec_ac_coeff_unpack(_dec,zzi,huff_idxs,ntoks_left,eobs);
+    }
+  }
+  /*TODO: eobs should be exactly zero, or 4096 or greater.
+    The second case occurs when an EOB run of size zero is encountered, which
+     gets treated as an infinite EOB run (where infinity is PTRDIFF_MAX).
+    If neither of these conditions holds, then a warning should be issued.*/
+}
+
+
+static int oc_dec_postprocess_init(oc_dec_ctx *_dec){
+  /*pp_level 0: disabled; free any memory used and return*/
+  if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){
+    if(_dec->dc_qis!=NULL){
+      _ogg_free(_dec->dc_qis);
+      _dec->dc_qis=NULL;
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+    }
+    return 1;
+  }
+  if(_dec->dc_qis==NULL){
+    /*If we haven't been tracking DC quantization indices, there's no point in
+       starting now.*/
+    if(_dec->state.frame_type!=OC_INTRA_FRAME)return 1;
+    _dec->dc_qis=(unsigned char *)_ogg_malloc(
+     _dec->state.nfrags*sizeof(_dec->dc_qis[0]));
+    if(_dec->dc_qis==NULL)return 1;
+    memset(_dec->dc_qis,_dec->state.qis[0],_dec->state.nfrags);
+  }
+  else{
+    unsigned char   *dc_qis;
+    const ptrdiff_t *coded_fragis;
+    ptrdiff_t        ncoded_fragis;
+    ptrdiff_t        fragii;
+    unsigned char    qi0;
+    /*Update the DC quantization index of each coded block.*/
+    dc_qis=_dec->dc_qis;
+    coded_fragis=_dec->state.coded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[0]+
+     _dec->state.ncoded_fragis[1]+_dec->state.ncoded_fragis[2];
+    qi0=(unsigned char)_dec->state.qis[0];
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      dc_qis[coded_fragis[fragii]]=qi0;
+    }
+  }
+  /*pp_level 1: Stop after updating DC quantization indices.*/
+  if(_dec->pp_level<=OC_PP_LEVEL_TRACKDCQI){
+    if(_dec->variances!=NULL){
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+    }
+    return 1;
+  }
+  if(_dec->variances==NULL){
+    size_t frame_sz;
+    size_t c_sz;
+    int    c_w;
+    int    c_h;
+    frame_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height;
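+    /*Bit 0 of pixel_fmt clear means chroma is decimated horizontally, and
+       bit 1 clear means it is decimated vertically; 4:2:0 has both clear.*/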
+    c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1);
+    c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
+    c_sz=c_w*(size_t)c_h;
+    /*Allocate space for the chroma planes, even if we're not going to use
+       them; this simplifies allocation state management, though it may waste
+       memory on the few systems that don't overcommit pages.*/
+    frame_sz+=c_sz<<1;
+    _dec->pp_frame_data=(unsigned char *)_ogg_malloc(
+     frame_sz*sizeof(_dec->pp_frame_data[0]));
+    _dec->variances=(int *)_ogg_malloc(
+     _dec->state.nfrags*sizeof(_dec->variances[0]));
+    if(_dec->variances==NULL||_dec->pp_frame_data==NULL){
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      return 1;
+    }
+    /*Force an update of the PP buffer pointers.*/
+    _dec->pp_frame_state=0;
+  }
+  /*Update the PP buffer pointers if necessary.*/
+  if(_dec->pp_frame_state!=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC)){
+    if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
+      /*If chroma processing is disabled, just use the PP luma plane.*/
+      _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
+      _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
+      _dec->pp_frame_buf[0].stride=-_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].data=_dec->pp_frame_data+
+       (1-_dec->pp_frame_buf[0].height)*(ptrdiff_t)_dec->pp_frame_buf[0].stride;
+    }
+    else{
+      size_t y_sz;
+      size_t c_sz;
+      int    c_w;
+      int    c_h;
+      /*Otherwise, set up pointers to all three PP planes.*/
+      y_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height;
+      c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1);
+      c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
+      c_sz=c_w*(size_t)c_h;
+      _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
+      _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
+      _dec->pp_frame_buf[0].stride=_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].data=_dec->pp_frame_data;
+      _dec->pp_frame_buf[1].width=c_w;
+      _dec->pp_frame_buf[1].height=c_h;
+      _dec->pp_frame_buf[1].stride=_dec->pp_frame_buf[1].width;
+      _dec->pp_frame_buf[1].data=_dec->pp_frame_buf[0].data+y_sz;
+      _dec->pp_frame_buf[2].width=c_w;
+      _dec->pp_frame_buf[2].height=c_h;
+      _dec->pp_frame_buf[2].stride=_dec->pp_frame_buf[2].width;
+      _dec->pp_frame_buf[2].data=_dec->pp_frame_buf[1].data+c_sz;
+      oc_ycbcr_buffer_flip(_dec->pp_frame_buf,_dec->pp_frame_buf);
+    }
+    _dec->pp_frame_state=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC);
+  }
+  /*If we're not processing chroma, copy the reference frame's chroma planes.*/
+  if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
+    memcpy(_dec->pp_frame_buf+1,
+     _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]]+1,
+     sizeof(_dec->pp_frame_buf[1])*2);
+  }
+  return 0;
+}
+
+
+/*Initialize the main decoding pipeline.*/
+static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe){
+  const ptrdiff_t *coded_fragis;
+  const ptrdiff_t *uncoded_fragis;
+  int              flimit;
+  int              pli;
+  int              qii;
+  int              qti;
+  int              zzi;
+  /*If chroma is sub-sampled in the vertical direction, we have to decode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
+  /*Initialize the token and extra bits indices for each plane and
+     coefficient.*/
+  memcpy(_pipe->ti,_dec->ti0,sizeof(_pipe->ti));
+  /*Also copy over the initial EOB run counts.*/
+  memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs));
+  /*Set up per-plane pointers to the coded and uncoded fragments lists.*/
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
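+  /*The uncoded fragments are stored in the same list, filled in backwards
+     from the end, so each plane's pointer starts just past its entries.*/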
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ncoded_fragis;
+    _pipe->coded_fragis[pli]=coded_fragis;
+    _pipe->uncoded_fragis[pli]=uncoded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[pli];
+    coded_fragis+=ncoded_fragis;
+    uncoded_fragis+=ncoded_fragis-_dec->state.fplanes[pli].nfrags;
+  }
+  /*Set up condensed quantizer tables.*/
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<_dec->state.nqis;qii++){
+      for(qti=0;qti<2;qti++){
+        _pipe->dequant[pli][qii][qti]=
+         _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
+      }
+    }
+  }
+  /*Set the previous DC predictor to 0 for all color planes and frame types.*/
+  memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
+  /*Initialize the bounding value array for the loop filter.*/
+  flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit);
+  /*Initialize any buffers needed for post-processing.
+    We also save the current post-processing level, to guard against the user
+     changing it from a callback.*/
+  if(!oc_dec_postprocess_init(_dec))_pipe->pp_level=_dec->pp_level;
+  /*If we don't have enough information to post-process, disable it, regardless
+     of the user-requested level.*/
+  else{
+    _pipe->pp_level=OC_PP_LEVEL_DISABLED;
+    memcpy(_dec->pp_frame_buf,
+     _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
+     sizeof(_dec->pp_frame_buf[0])*3);
+  }
+  /*Clear down the DCT coefficient buffer for the first block.*/
+  for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0;
+}
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+   rows).
+  As a side effect, the number of coded and uncoded fragments in this plane of
+   the MCU is also computed.*/
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frags;
+  int                     *pred_last;
+  ptrdiff_t                ncoded_fragis;
+  ptrdiff_t                fragi;
+  int                      fragx;
+  int                      fragy;
+  int                      fragy0;
+  int                      fragy_end;
+  int                      nhfrags;
+  /*Compute the first and last fragment row of the current MCU for this
+     plane.*/
+  fplane=_dec->state.fplanes+_pli;
+  fragy0=_pipe->fragy0[_pli];
+  fragy_end=_pipe->fragy_end[_pli];
+  nhfrags=fplane->nhfrags;
+  pred_last=_pipe->pred_last[_pli];
+  frags=_dec->state.frags;
+  ncoded_fragis=0;
+  fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+  for(fragy=fragy0;fragy<fragy_end;fragy++){
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int refi;
+          refi=frags[fragi].refi;
+          pred_last[refi]=frags[fragi].dc+=pred_last[refi];
+          ncoded_fragis++;
+        }
+      }
+    }
+    else{
+      oc_fragment *u_frags;
+      int          l_ref;
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].refi;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else ur_ref=u_frags[fragi+1].refi;
+        if(frags[fragi].coded){
+          int pred;
+          int refi;
+          refi=frags[fragi].refi;
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==refi)|(ul_ref==refi)<<1|
+           (u_ref==refi)<<2|(ur_ref==refi)<<3){
+            default:pred=pred_last[refi];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              /*The TI compiler mis-compiles this line.*/
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0;
+              int p1;
+              int p2;
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          pred_last[refi]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=refi;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  /*Also save the number of uncoded fragments so we know how many to copy.*/
+  _pipe->nuncoded_fragis[_pli]=
+   (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+/*Reconstructs all coded fragments in a single MCU (one or two super block
+   rows).
+  This requires that each coded fragment have a proper macro block mode and
+   motion vector (if not in INTRA mode), and have its DC value decoded, with
+   the DC prediction process reversed, and the number of coded and uncoded
+   fragments in this plane of the MCU be counted.
+  The token lists for each color plane and coefficient should also be filled
+   in, along with initial token offsets, extra bits offsets, and EOB run
+   counts.*/
+static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  unsigned char       *dct_tokens;
+  const unsigned char *dct_fzig_zag;
+  ogg_uint16_t         dc_quant[2];
+  const oc_fragment   *frags;
+  const ptrdiff_t     *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragii;
+  ptrdiff_t           *ti;
+  ptrdiff_t           *eob_runs;
+  int                  qti;
+  dct_tokens=_dec->dct_tokens;
+  dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
+  frags=_dec->state.frags;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  ti=_pipe->ti[_pli];
+  eob_runs=_pipe->eob_runs[_pli];
+  for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
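+  /*The DC coefficient is always dequantized with the first qi; per-block
+     qii only selects the AC quantizer.*/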
+  for(fragii=0;fragii<ncoded_fragis;fragii++){
+    const ogg_uint16_t *ac_quant;
+    ptrdiff_t           fragi;
+    int                 last_zzi;
+    int                 zzi;
+    fragi=coded_fragis[fragii];
+    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
+    ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
+    /*Decode the AC coefficients.*/
+    for(zzi=0;zzi<64;){
+      int token;
+      last_zzi=zzi;
+      if(eob_runs[zzi]){
+        eob_runs[zzi]--;
+        break;
+      }
+      else{
+        ptrdiff_t eob;
+        int       cw;
+        int       rlen;
+        int       coeff;
+        int       lti;
+        lti=ti[zzi];
+        token=dct_tokens[lti++];
+        cw=OC_DCT_CODE_WORD[token];
+        /*These parts could be done branchless, but the branches are fairly
+           predictable and the branchless C code translates into more than a
+           few instructions, so it's worth keeping the branches.*/
+        if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+          cw+=dct_tokens[lti++]<<OC_DCT_TOKEN_EB_POS(token);
+        }
+        eob=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+        if(token==OC_DCT_TOKEN_FAT_EOB){
+          eob+=dct_tokens[lti++]<<8;
+          if(eob==0)eob=OC_DCT_EOB_FINISH;
+        }
+        rlen=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        eob_runs[zzi]=eob;
+        ti[zzi]=lti;
+        zzi+=rlen;
+        _pipe->dct_coeffs[dct_fzig_zag[zzi]]=
+         (ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+        zzi+=!eob;
+      }
+    }
+    /*TODO: zzi should be exactly 64 here.
+      If it's not, we should report some kind of warning.*/
+    zzi=OC_MINI(zzi,64);
+    _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+    /*last_zzi is always initialized.
+      If your compiler thinks otherwise, it is dumb.*/
+    oc_state_frag_recon(&_dec->state,fragi,_pli,
+     _pipe->dct_coeffs,last_zzi,dc_quant[qti]);
+  }
+  _pipe->coded_fragis[_pli]+=ncoded_fragis;
+  /*Right now the reconstructed MCU has only the coded blocks in it.*/
+  /*TODO: We make the decision here to always copy the uncoded blocks into it
+     from the reference frame.
+    We could also copy the coded blocks back over the reference frame, if we
+     wait for an additional MCU to be decoded, which might be faster if only a
+     small number of blocks are coded.
+    However, this introduces more latency, creating a larger cache footprint.
+    It's unknown which decision is better, but this one results in simpler
+     code, and the hard case (high bitrate, high resolution) is handled
+     correctly.*/
+  /*Copy the uncoded blocks from the previous reference frame.*/
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_dec->state,
+     _dec->state.ref_frame_data[OC_FRAME_SELF],
+     _dec->state.ref_frame_data[OC_FRAME_PREV],
+     _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
+  }
+}
+
+/*Filter a horizontal block edge.*/
+static void oc_filter_hedge(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,int _qstep,int _flimit,
+ int *_variance0,int *_variance1){
+  unsigned char       *rdst;
+  const unsigned char *rsrc;
+  unsigned char       *cdst;
+  const unsigned char *csrc;
+  int                  r[10];
+  int                  sum0;
+  int                  sum1;
+  int                  bx;
+  int                  by;
+  rdst=_dst;
+  rsrc=_src;
+  for(bx=0;bx<8;bx++){
+    cdst=rdst;
+    csrc=rsrc;
+    for(by=0;by<10;by++){
+      r[by]=*csrc;
+      csrc+=_src_ystride;
+    }
+    sum0=sum1=0;
+    for(by=0;by<4;by++){
+      sum0+=abs(r[by+1]-r[by]);
+      sum1+=abs(r[by+5]-r[by+6]);
+    }
+    *_variance0+=OC_MINI(255,sum0);
+    *_variance1+=OC_MINI(255,sum1);
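+    /*Only filter when both sides of the edge are smooth and the step across
+       it stays below the quantizer-derived threshold.*/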
+    if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){
+      *cdst=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
+      cdst+=_dst_ystride;
+      *cdst=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
+      cdst+=_dst_ystride;
+      for(by=0;by<4;by++){
+        *cdst=(unsigned char)(r[by]+r[by+1]+r[by+2]+r[by+3]*2+
+         r[by+4]+r[by+5]+r[by+6]+4>>3);
+        cdst+=_dst_ystride;
+      }
+      *cdst=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
+      cdst+=_dst_ystride;
+      *cdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+    }
+    else{
+      for(by=1;by<=8;by++){
+        *cdst=(unsigned char)r[by];
+        cdst+=_dst_ystride;
+      }
+    }
+    rdst++;
+    rsrc++;
+  }
+}
+
+/*Filter a vertical block edge.*/
+static void oc_filter_vedge(unsigned char *_dst,int _dst_ystride,
+ int _qstep,int _flimit,int *_variances){
+  unsigned char       *rdst;
+  const unsigned char *rsrc;
+  unsigned char       *cdst;
+  int                  r[10];
+  int                  sum0;
+  int                  sum1;
+  int                  bx;
+  int                  by;
+  cdst=_dst;
+  for(by=0;by<8;by++){
+    rsrc=cdst-1;
+    rdst=cdst;
+    for(bx=0;bx<10;bx++)r[bx]=*rsrc++;
+    sum0=sum1=0;
+    for(bx=0;bx<4;bx++){
+      sum0+=abs(r[bx+1]-r[bx]);
+      sum1+=abs(r[bx+5]-r[bx+6]);
+    }
+    _variances[0]+=OC_MINI(255,sum0);
+    _variances[1]+=OC_MINI(255,sum1);
+    if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){
+      *rdst++=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
+      *rdst++=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
+      for(bx=0;bx<4;bx++){
+        *rdst++=(unsigned char)(r[bx]+r[bx+1]+r[bx+2]+r[bx+3]*2+
+         r[bx+4]+r[bx+5]+r[bx+6]+4>>3);
+      }
+      *rdst++=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
+      *rdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+    }
+    cdst+=_dst_ystride;
+  }
+}
+
+static void oc_dec_deblock_frag_rows(oc_dec_ctx *_dec,
+ th_img_plane *_dst,th_img_plane *_src,int _pli,int _fragy0,
+ int _fragy_end){
+  oc_fragment_plane   *fplane;
+  int                 *variance;
+  unsigned char       *dc_qi;
+  unsigned char       *dst;
+  const unsigned char *src;
+  ptrdiff_t            froffset;
+  int                  dst_ystride;
+  int                  src_ystride;
+  int                  nhfrags;
+  int                  width;
+  int                  notstart;
+  int                  notdone;
+  int                  flimit;
+  int                  qstep;
+  int                  y_end;
+  int                  y;
+  int                  x;
+  _dst+=_pli;
+  _src+=_pli;
+  fplane=_dec->state.fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags;
+  variance=_dec->variances+froffset;
+  dc_qi=_dec->dc_qis+froffset;
+  notstart=_fragy0>0;
+  notdone=_fragy_end<fplane->nvfrags;
+  /*We want to clear an extra row of variances, except at the end.*/
+  memset(variance+(nhfrags&-notstart),0,
+   (_fragy_end+notdone-_fragy0-notstart)*(nhfrags*sizeof(variance[0])));
+  /*Except for the first time, we want to point to the middle of the row.*/
+  y=(_fragy0<<3)+(notstart<<2);
+  dst_ystride=_dst->stride;
+  src_ystride=_src->stride;
+  dst=_dst->data+y*(ptrdiff_t)dst_ystride;
+  src=_src->data+y*(ptrdiff_t)src_ystride;
+  width=_dst->width;
+  for(;y<4;y++){
+    memcpy(dst,src,width*sizeof(dst[0]));
+    dst+=dst_ystride;
+    src+=src_ystride;
+  }
+  /*We also want to skip the last row in the frame for this loop.*/
+  y_end=_fragy_end-!notdone<<3;
+  for(;y<y_end;y+=8){
+    qstep=_dec->pp_dc_scale[*dc_qi];
+    flimit=(qstep*3)>>2;
+    oc_filter_hedge(dst,dst_ystride,src-src_ystride,src_ystride,
+     qstep,flimit,variance,variance+nhfrags);
+    variance++;
+    dc_qi++;
+    for(x=8;x<width;x+=8){
+      qstep=_dec->pp_dc_scale[*dc_qi];
+      flimit=(qstep*3)>>2;
+      oc_filter_hedge(dst+x,dst_ystride,src+x-src_ystride,src_ystride,
+       qstep,flimit,variance,variance+nhfrags);
+      oc_filter_vedge(dst+x-(dst_ystride<<2)-4,dst_ystride,
+       qstep,flimit,variance-1);
+      variance++;
+      dc_qi++;
+    }
+    dst+=dst_ystride<<3;
+    src+=src_ystride<<3;
+  }
+  /*And finally, handle the last row in the frame, if it's in the range.*/
+  if(!notdone){
+    int height;
+    height=_dst->height;
+    for(;y<height;y++){
+      memcpy(dst,src,width*sizeof(dst[0]));
+      dst+=dst_ystride;
+      src+=src_ystride;
+    }
+    /*Filter the last row of vertical block edges.*/
+    dc_qi++;
+    for(x=8;x<width;x+=8){
+      qstep=_dec->pp_dc_scale[*dc_qi++];
+      flimit=(qstep*3)>>2;
+      oc_filter_vedge(dst+x-(dst_ystride<<3)-4,dst_ystride,
+       qstep,flimit,variance++);
+    }
+  }
+}
+
+static void oc_dering_block(unsigned char *_idata,int _ystride,int _b,
+ int _dc_scale,int _sharp_mod,int _strong){
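+  /*_b holds border flags: bit 0 set means this block lies on the left edge
+     of the image, bit 1 the right, bit 2 the top and bit 3 the bottom.
+    These keep the filter taps from reading outside the frame.*/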
+  static const unsigned char OC_MOD_MAX[2]={24,32};
+  static const unsigned char OC_MOD_SHIFT[2]={1,0};
+  const unsigned char *psrc;
+  const unsigned char *src;
+  const unsigned char *nsrc;
+  unsigned char       *dst;
+  int                  vmod[72];
+  int                  hmod[72];
+  int                  mod_hi;
+  int                  by;
+  int                  bx;
+  mod_hi=OC_MINI(3*_dc_scale,OC_MOD_MAX[_strong]);
+  dst=_idata;
+  src=dst;
+  psrc=src-(_ystride&-!(_b&4));
+  for(by=0;by<9;by++){
+    for(bx=0;bx<8;bx++){
+      int mod;
+      mod=32+_dc_scale-(abs(src[bx]-psrc[bx])<<OC_MOD_SHIFT[_strong]);
+      vmod[(by<<3)+bx]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+    }
+    psrc=src;
+    src+=_ystride&-(!(_b&8)|by<7);
+  }
+  nsrc=dst;
+  psrc=dst-!(_b&1);
+  for(bx=0;bx<9;bx++){
+    src=nsrc;
+    for(by=0;by<8;by++){
+      int mod;
+      mod=32+_dc_scale-(abs(*src-*psrc)<<OC_MOD_SHIFT[_strong]);
+      hmod[(bx<<3)+by]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+      psrc+=_ystride;
+      src+=_ystride;
+    }
+    psrc=nsrc;
+    nsrc+=!(_b&2)|bx<7;
+  }
+  src=dst;
+  psrc=src-(_ystride&-!(_b&4));
+  nsrc=src+_ystride;
+  for(by=0;by<8;by++){
+    int a;
+    int b;
+    int w;
+    a=128;
+    b=64;
+    w=hmod[by];
+    a-=w;
+    b+=w**(src-!(_b&1));
+    w=vmod[by<<3];
+    a-=w;
+    b+=w*psrc[0];
+    w=vmod[by+1<<3];
+    a-=w;
+    b+=w*nsrc[0];
+    w=hmod[(1<<3)+by];
+    a-=w;
+    b+=w*src[1];
+    dst[0]=OC_CLAMP255(a*src[0]+b>>7);
+    for(bx=1;bx<7;bx++){
+      a=128;
+      b=64;
+      w=hmod[(bx<<3)+by];
+      a-=w;
+      b+=w*src[bx-1];
+      w=vmod[(by<<3)+bx];
+      a-=w;
+      b+=w*psrc[bx];
+      w=vmod[(by+1<<3)+bx];
+      a-=w;
+      b+=w*nsrc[bx];
+      w=hmod[(bx+1<<3)+by];
+      a-=w;
+      b+=w*src[bx+1];
+      dst[bx]=OC_CLAMP255(a*src[bx]+b>>7);
+    }
+    a=128;
+    b=64;
+    w=hmod[(7<<3)+by];
+    a-=w;
+    b+=w*src[6];
+    w=vmod[(by<<3)+7];
+    a-=w;
+    b+=w*psrc[7];
+    w=vmod[(by+1<<3)+7];
+    a-=w;
+    b+=w*nsrc[7];
+    w=hmod[(8<<3)+by];
+    a-=w;
+    b+=w*src[7+!(_b&2)];
+    dst[7]=OC_CLAMP255(a*src[7]+b>>7);
+    dst+=_ystride;
+    psrc=src;
+    src=nsrc;
+    nsrc+=_ystride&-(!(_b&8)|by<6);
+  }
+}
+
+#define OC_DERING_THRESH1 (384)
+#define OC_DERING_THRESH2 (4*OC_DERING_THRESH1)
+#define OC_DERING_THRESH3 (5*OC_DERING_THRESH1)
+#define OC_DERING_THRESH4 (10*OC_DERING_THRESH1)
+
+static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img,
+ int _pli,int _fragy0,int _fragy_end){
+  th_img_plane      *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment       *frag;
+  int               *variance;
+  unsigned char     *idata;
+  ptrdiff_t          froffset;
+  int                ystride;
+  int                nhfrags;
+  int                sthresh;
+  int                strong;
+  int                y_end;
+  int                width;
+  int                height;
+  int                y;
+  int                x;
+  iplane=_img+_pli;
+  fplane=_dec->state.fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags;
+  variance=_dec->variances+froffset;
+  frag=_dec->state.frags+froffset;
+  strong=_dec->pp_level>=(_pli?OC_PP_LEVEL_SDERINGC:OC_PP_LEVEL_SDERINGY);
+  sthresh=_pli?OC_DERING_THRESH4:OC_DERING_THRESH3;
+  y=_fragy0<<3;
+  ystride=iplane->stride;
+  idata=iplane->data+y*(ptrdiff_t)ystride;
+  y_end=_fragy_end<<3;
+  width=iplane->width;
+  height=iplane->height;
+  for(;y<y_end;y+=8){
+    for(x=0;x<width;x+=8){
+      int b;
+      int qi;
+      int var;
+      qi=_dec->state.qis[frag->qii];
+      var=*variance;
+      b=(x<=0)|(x+8>=width)<<1|(y<=0)<<2|(y+8>=height)<<3;
+      if(strong&&var>sthresh){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+        if(_pli||!(b&1)&&*(variance-1)>OC_DERING_THRESH4||
+         !(b&2)&&variance[1]>OC_DERING_THRESH4||
+         !(b&4)&&*(variance-nhfrags)>OC_DERING_THRESH4||
+         !(b&8)&&variance[nhfrags]>OC_DERING_THRESH4){
+          oc_dering_block(idata+x,ystride,b,
+           _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+          oc_dering_block(idata+x,ystride,b,
+           _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+        }
+      }
+      else if(var>OC_DERING_THRESH2){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+      }
+      else if(var>OC_DERING_THRESH1){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],0);
+      }
+      frag++;
+      variance++;
+    }
+    idata+=ystride<<3;
+  }
+}
+
+
+
+th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){
+  oc_dec_ctx *dec;
+  if(_info==NULL||_setup==NULL)return NULL;
+  dec=oc_aligned_malloc(sizeof(*dec),16);
+  if(dec==NULL||oc_dec_init(dec,_info,_setup)<0){
+    oc_aligned_free(dec);
+    return NULL;
+  }
+  dec->state.curframe_num=0;
+  return dec;
+}
+
+void th_decode_free(th_dec_ctx *_dec){
+  if(_dec!=NULL){
+    oc_dec_clear(_dec);
+    oc_aligned_free(_dec);
+  }
+}
+
+int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
+ size_t _buf_sz){
+  switch(_req){
+  case TH_DECCTL_GET_PPLEVEL_MAX:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    (*(int *)_buf)=OC_PP_LEVEL_MAX;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_PPLEVEL:{
+    int pp_level;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    pp_level=*(int *)_buf;
+    if(pp_level<0||pp_level>OC_PP_LEVEL_MAX)return TH_EINVAL;
+    _dec->pp_level=pp_level;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_GRANPOS:{
+    ogg_int64_t granpos;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(ogg_int64_t))return TH_EINVAL;
+    granpos=*(ogg_int64_t *)_buf;
+    if(granpos<0)return TH_EINVAL;
+    _dec->state.granpos=granpos;
+    _dec->state.keyframe_num=(granpos>>_dec->state.info.keyframe_granule_shift)
+     -_dec->state.granpos_bias;
+    _dec->state.curframe_num=_dec->state.keyframe_num
+     +(granpos&(1<<_dec->state.info.keyframe_granule_shift)-1);
+    return 0;
+  }break;
+  case TH_DECCTL_SET_STRIPE_CB:{
+    th_stripe_callback *cb;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(th_stripe_callback))return TH_EINVAL;
+    cb=(th_stripe_callback *)_buf;
+    _dec->stripe_cb.ctx=cb->ctx;
+    _dec->stripe_cb.stripe_decoded=cb->stripe_decoded;
+    return 0;
+  }break;
+#ifdef HAVE_CAIRO
+  case TH_DECCTL_SET_TELEMETRY_MBMODE:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_mbmode=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_MV:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_mv=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_QI:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_qi=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_BITS:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_bits=*(int *)_buf;
+    return 0;
+  }break;
+#endif
+  default:return TH_EIMPL;
+  }
+}
+
+/*We're decoding an INTER frame, but have no initialized reference
+   buffers (i.e., decoding did not start on a key frame).
+  We initialize them to a solid gray here.*/
+static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
+  th_info   *info;
+  size_t     yplane_sz;
+  size_t     cplane_sz;
+  ptrdiff_t  yoffset;
+  int        yhstride;
+  int        yheight;
+  int        chstride;
+  int        cheight;
+  _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
+  _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
+  _dec->state.ref_frame_idx[OC_FRAME_SELF]=0;
+  _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+   _dec->state.ref_frame_data[OC_FRAME_PREV]=
+   _dec->state.ref_frame_data[OC_FRAME_SELF]=
+   _dec->state.ref_frame_bufs[0][0].data;
+  memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0],
+   sizeof(_dec->pp_frame_buf[0])*3);
+  info=&_dec->state.info;
+  yhstride=abs(_dec->state.ref_ystride[0]);
+  yheight=info->frame_height+2*OC_UMV_PADDING;
+  chstride=abs(_dec->state.ref_ystride[1]);
+  cheight=yheight>>!(info->pixel_fmt&2);
+  yplane_sz=yhstride*(size_t)yheight+16;
+  cplane_sz=chstride*(size_t)cheight;
+  yoffset=yhstride*(ptrdiff_t)(yheight-OC_UMV_PADDING-1)+OC_UMV_PADDING;
+  memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz);
+}
+
+#if defined(HAVE_CAIRO)
+static void oc_render_telemetry(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr,
+ int _telemetry){
+  /*Stuff the plane into cairo.*/
+  cairo_surface_t *cs;
+  unsigned char   *data;
+  unsigned char   *y_row;
+  unsigned char   *u_row;
+  unsigned char   *v_row;
+  unsigned char   *rgb_row;
+  int              cstride;
+  int              w;
+  int              h;
+  int              x;
+  int              y;
+  int              hdec;
+  int              vdec;
+  w=_ycbcr[0].width;
+  h=_ycbcr[0].height;
+  hdec=!(_dec->state.info.pixel_fmt&1);
+  vdec=!(_dec->state.info.pixel_fmt&2);
+  /*Lazy data buffer init.
+    We could try to re-use the post-processing buffer, which would save
+     memory, but complicate the allocation logic there.
+    I don't think anyone cares about memory usage when using telemetry; it is
+     not meant for embedded devices.*/
+  if(_dec->telemetry_frame_data==NULL){
+    _dec->telemetry_frame_data=_ogg_malloc(
+     (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data));
+    if(_dec->telemetry_frame_data==NULL)return;
+  }
+  cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h);
+  /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/
+  data=cairo_image_surface_get_data(cs);
+  if(data==NULL){
+    cairo_surface_destroy(cs);
+    return;
+  }
+  cstride=cairo_image_surface_get_stride(cs);
+  y_row=_ycbcr[0].data;
+  u_row=_ycbcr[1].data;
+  v_row=_ycbcr[2].data;
+  rgb_row=data;
+  for(y=0;y<h;y++){
+    for(x=0;x<w;x++){
+      int r;
+      int g;
+      int b;
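+      /*Fixed-point Rec. 601 (video-range) Y'CbCr to RGB conversion for the
+         Cairo RGB24 surface.*/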
+      r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200;
+      g=(3827562*y_row[x]-1287801*u_row[x>>hdec]
+       -2672387*v_row[x>>hdec]+447306710)/3287200;
+      b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600;
+      rgb_row[4*x+0]=OC_CLAMP255(b);
+      rgb_row[4*x+1]=OC_CLAMP255(g);
+      rgb_row[4*x+2]=OC_CLAMP255(r);
+    }
+    y_row+=_ycbcr[0].stride;
+    u_row+=_ycbcr[1].stride&-((y&1)|!vdec);
+    v_row+=_ycbcr[2].stride&-((y&1)|!vdec);
+    rgb_row+=cstride;
+  }
+  /*Draw coded identifier for each macroblock (stored in Hilbert order).*/
+  {
+    cairo_t           *c;
+    const oc_fragment *frags;
+    oc_mv             *frag_mvs;
+    const signed char *mb_modes;
+    oc_mb_map         *mb_maps;
+    size_t             nmbs;
+    size_t             mbi;
+    int                row2;
+    int                col2;
+    int                qim[3]={0,0,0};
+    if(_dec->state.nqis==2){
+      int bqi;
+      bqi=_dec->state.qis[0];
+      if(_dec->state.qis[1]>bqi)qim[1]=1;
+      if(_dec->state.qis[1]<bqi)qim[1]=-1;
+    }
+    if(_dec->state.nqis==3){
+      int bqi;
+      int cqi;
+      int dqi;
+      bqi=_dec->state.qis[0];
+      cqi=_dec->state.qis[1];
+      dqi=_dec->state.qis[2];
+      if(cqi>bqi&&dqi>bqi){
+        if(dqi>cqi){
+          qim[1]=1;
+          qim[2]=2;
+        }
+        else{
+          qim[1]=2;
+          qim[2]=1;
+        }
+      }
+      else if(cqi<bqi&&dqi<bqi){
+        if(dqi<cqi){
+          qim[1]=-1;
+          qim[2]=-2;
+        }
+        else{
+          qim[1]=-2;
+          qim[2]=-1;
+        }
+      }
+      else{
+        if(cqi<bqi)qim[1]=-1;
+        else qim[1]=1;
+        if(dqi<bqi)qim[2]=-1;
+        else qim[2]=1;
+      }
+    }
+    c=cairo_create(cs);
+    frags=_dec->state.frags;
+    frag_mvs=_dec->state.frag_mvs;
+    mb_modes=_dec->state.mb_modes;
+    mb_maps=_dec->state.mb_maps;
+    nmbs=_dec->state.nmbs;
+    row2=0;
+    col2=0;
+    for(mbi=0;mbi<nmbs;mbi++){
+      float x;
+      float y;
+      int   bi;
+      y=h-(row2+((col2+1>>1)&1))*16-16;
+      x=(col2>>1)*16;
+      cairo_set_line_width(c,1.);
+      /*Keyframe (all intra) red box.*/
+      if(_dec->state.frame_type==OC_INTRA_FRAME){
+        if(_dec->telemetry_mbmode&0x02){
+          cairo_set_source_rgba(c,1.,0,0,.5);
+          cairo_rectangle(c,x+2.5,y+2.5,11,11);
+          cairo_stroke_preserve(c);
+          cairo_set_source_rgba(c,1.,0,0,.25);
+          cairo_fill(c);
+        }
+      }
+      else{
+        ptrdiff_t fragi;
+        int       frag_mvx;
+        int       frag_mvy;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(fragi>=0&&frags[fragi].coded){
+            frag_mvx=OC_MV_X(frag_mvs[fragi]);
+            frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+            break;
+          }
+        }
+        if(bi<4){
+          switch(mb_modes[mbi]){
+            case OC_MODE_INTRA:{
+              if(_dec->telemetry_mbmode&0x02){
+                cairo_set_source_rgba(c,1.,0,0,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,0,0,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_INTER_NOMV:{
+              if(_dec->telemetry_mbmode&0x01){
+                cairo_set_source_rgba(c,0,0,1.,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,0,0,1.,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV:{
+              if(_dec->telemetry_mbmode&0x04){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x04){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_LAST:{
+              if(_dec->telemetry_mbmode&0x08){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_move_to(c,x+13.5,y+2.5);
+                cairo_line_to(c,x+2.5,y+8);
+                cairo_line_to(c,x+13.5,y+13.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x08){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              if(_dec->telemetry_mbmode&0x10){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_move_to(c,x+8,y+2.5);
+                cairo_line_to(c,x+2.5,y+8);
+                cairo_line_to(c,x+8,y+13.5);
+                cairo_move_to(c,x+13.5,y+2.5);
+                cairo_line_to(c,x+8,y+8);
+                cairo_line_to(c,x+13.5,y+13.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x10){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_GOLDEN_NOMV:{
+              if(_dec->telemetry_mbmode&0x20){
+                cairo_set_source_rgba(c,1.,1.,0,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,1.,0,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              if(_dec->telemetry_mbmode&0x40){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,1.,1.,0,.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x40){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{
+              if(_dec->telemetry_mbmode&0x80){
+                cairo_rectangle(c,x+2.5,y+2.5,4,4);
+                cairo_rectangle(c,x+9.5,y+2.5,4,4);
+                cairo_rectangle(c,x+2.5,y+9.5,4,4);
+                cairo_rectangle(c,x+9.5,y+9.5,4,4);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_stroke(c);
+              }
+              /*4mv is odd, coded in raster order.*/
+              fragi=mb_maps[mbi][0][0];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+4+frag_mvx,y+12-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+4+frag_mvx*.66,y+12-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+4+frag_mvx*.33,y+12-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+4,y+12);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][1];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+12+frag_mvx,y+12-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+12+frag_mvx*.66,y+12-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+12+frag_mvx*.33,y+12-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+12,y+12);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][2];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+4+frag_mvx,y+4-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+4+frag_mvx*.66,y+4-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+4+frag_mvx*.33,y+4-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+4,y+4);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][3];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+12+frag_mvx,y+4-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+12+frag_mvx*.66,y+4-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+12+frag_mvx*.33,y+4-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+12,y+4);
+                cairo_stroke(c);
+              }
+            }break;
+          }
+        }
+      }
+      /*qii illustration.*/
+      if(_dec->telemetry_qi&0x2){
+        cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE);
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          int       qiv;
+          int       xp;
+          int       yp;
+          xp=x+(bi&1)*8;
+          yp=y+8-(bi&2)*4;
+          fragi=mb_maps[mbi][0][bi];
+          if(fragi>=0&&frags[fragi].coded){
+            qiv=qim[frags[fragi].qii];
+            cairo_set_line_width(c,3.);
+            cairo_set_source_rgba(c,0.,0.,0.,.5);
+            switch(qiv){
+              /*Double plus:*/
+              case 2:{
+                if((bi&1)^((bi&2)>>1)){
+                  cairo_move_to(c,xp+2.5,yp+1.5);
+                  cairo_line_to(c,xp+2.5,yp+3.5);
+                  cairo_move_to(c,xp+1.5,yp+2.5);
+                  cairo_line_to(c,xp+3.5,yp+2.5);
+                  cairo_move_to(c,xp+5.5,yp+4.5);
+                  cairo_line_to(c,xp+5.5,yp+6.5);
+                  cairo_move_to(c,xp+4.5,yp+5.5);
+                  cairo_line_to(c,xp+6.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }
+                else{
+                  cairo_move_to(c,xp+5.5,yp+1.5);
+                  cairo_line_to(c,xp+5.5,yp+3.5);
+                  cairo_move_to(c,xp+4.5,yp+2.5);
+                  cairo_line_to(c,xp+6.5,yp+2.5);
+                  cairo_move_to(c,xp+2.5,yp+4.5);
+                  cairo_line_to(c,xp+2.5,yp+6.5);
+                  cairo_move_to(c,xp+1.5,yp+5.5);
+                  cairo_line_to(c,xp+3.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }
+              }break;
+              /*Double minus:*/
+              case -2:{
+                cairo_move_to(c,xp+2.5,yp+2.5);
+                cairo_line_to(c,xp+5.5,yp+2.5);
+                cairo_move_to(c,xp+2.5,yp+5.5);
+                cairo_line_to(c,xp+5.5,yp+5.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,1.,1.,1.);
+              }break;
+              /*Plus:*/
+              case 1:{
+                if((bi&2)==0)yp-=2;
+                if((bi&1)==0)xp-=2;
+                cairo_move_to(c,xp+4.5,yp+2.5);
+                cairo_line_to(c,xp+4.5,yp+6.5);
+                cairo_move_to(c,xp+2.5,yp+4.5);
+                cairo_line_to(c,xp+6.5,yp+4.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,.1,1.,.3,1.);
+                break;
+              }
+              /*Fall through.*/
+              /*Minus:*/
+              case -1:{
+                cairo_move_to(c,xp+2.5,yp+4.5);
+                cairo_line_to(c,xp+6.5,yp+4.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,.3,.1,1.);
+              }break;
+              default:continue;
+            }
+            cairo_set_line_width(c,1.);
+            cairo_stroke(c);
+          }
+        }
+      }
+      col2++;
+      if((col2>>1)>=_dec->state.nhmbs){
+        col2=0;
+        row2+=2;
+      }
+    }
+    /*Bit usage indicator[s]:*/
+    if(_dec->telemetry_bits){
+      int widths[6];
+      int fpsn;
+      int fpsd;
+      int mult;
+      int fullw;
+      int padw;
+      int i;
+      fpsn=_dec->state.info.fps_numerator;
+      fpsd=_dec->state.info.fps_denominator;
+      mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits);
+      fullw=250.f*h*fpsd*mult/fpsn;
+      padw=w-24;
+      /*Header and coded block bits.*/
+      if(_dec->telemetry_frame_bytes<0||
+       _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){
+        _dec->telemetry_frame_bytes=0;
+      }
+      if(_dec->telemetry_coding_bytes<0||
+       _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_coding_bytes=0;
+      }
+      if(_dec->telemetry_mode_bytes<0||
+       _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_mode_bytes=0;
+      }
+      if(_dec->telemetry_mv_bytes<0||
+       _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_mv_bytes=0;
+      }
+      if(_dec->telemetry_qi_bytes<0||
+       _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_qi_bytes=0;
+      }
+      if(_dec->telemetry_dc_bytes<0||
+       _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_dc_bytes=0;
+      }
+      widths[0]=padw*
+       (_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw;
+      widths[1]=padw*
+       (_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw;
+      widths[2]=padw*
+       (_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw;
+      widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw;
+      widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw;
+      widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw;
+      for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w;
+      cairo_set_source_rgba(c,.0,.0,.0,.6);
+      cairo_rectangle(c,10,h-33,widths[0]+1,5);
+      cairo_rectangle(c,10,h-29,widths[1]+1,5);
+      cairo_rectangle(c,10,h-25,widths[2]+1,5);
+      cairo_rectangle(c,10,h-21,widths[3]+1,5);
+      cairo_rectangle(c,10,h-17,widths[4]+1,5);
+      cairo_rectangle(c,10,h-13,widths[5]+1,5);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,1,0,0);
+      cairo_rectangle(c,10.5,h-32.5,widths[0],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,0,1,0);
+      cairo_rectangle(c,10.5,h-28.5,widths[1],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,0,0,1);
+      cairo_rectangle(c,10.5,h-24.5,widths[2],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.6,.4,.0);
+      cairo_rectangle(c,10.5,h-20.5,widths[3],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.3,.3,.3);
+      cairo_rectangle(c,10.5,h-16.5,widths[4],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.5,.5,.8);
+      cairo_rectangle(c,10.5,h-12.5,widths[5],4);
+      cairo_fill(c);
+    }
+    /*Master qi indicator[s]:*/
+    if(_dec->telemetry_qi&0x1){
+      cairo_text_extents_t extents;
+      char                 buffer[10];
+      int                  p;
+      int                  y;
+      p=0;
+      y=h-7.5;
+      if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10;
+      buffer[p++]=48+_dec->state.qis[0]%10;
+      if(_dec->state.nqis>=2){
+        buffer[p++]=' ';
+        if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10;
+        buffer[p++]=48+_dec->state.qis[1]%10;
+      }
+      if(_dec->state.nqis==3){
+        buffer[p++]=' ';
+        if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10;
+        buffer[p++]=48+_dec->state.qis[2]%10;
+      }
+      buffer[p++]='\0';
+      cairo_select_font_face(c,"sans",
+       CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD);
+      cairo_set_font_size(c,18);
+      cairo_text_extents(c,buffer,&extents);
+      cairo_set_source_rgb(c,1,1,1);
+      cairo_move_to(c,w-extents.x_advance-10,y);
+      cairo_show_text(c,buffer);
+      cairo_set_source_rgb(c,0,0,0);
+      cairo_move_to(c,w-extents.x_advance-10,y);
+      cairo_text_path(c,buffer);
+      cairo_set_line_width(c,.8);
+      cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND);
+      cairo_stroke(c);
+    }
+    cairo_destroy(c);
+  }
+  /*Out of the Cairo plane into the telemetry YUV buffer.*/
+  _ycbcr[0].data=_dec->telemetry_frame_data;
+  _ycbcr[0].stride=_ycbcr[0].width;
+  _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride;
+  _ycbcr[1].stride=_ycbcr[1].width;
+  _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride;
+  _ycbcr[2].stride=_ycbcr[2].width;
+  y_row=_ycbcr[0].data;
+  u_row=_ycbcr[1].data;
+  v_row=_ycbcr[2].data;
+  rgb_row=data;
+  /*This is one of the few places it's worth handling chroma on a
+     case-by-case basis.*/
+  switch(_dec->state.info.pixel_fmt){
+    case TH_PF_420:{
+      for(y=0;y<h;y+=2){
+        unsigned char *y_row2;
+        unsigned char *rgb_row2;
+        y_row2=y_row+_ycbcr[0].stride;
+        rgb_row2=rgb_row+cstride;
+        for(x=0;x<w;x+=2){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          y_row[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+           +24966*rgb_row[4*x+4]+4207500)/255000;
+          y_row[x+1]=OC_CLAMP255(y);
+          y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1]
+           +24966*rgb_row2[4*x+0]+4207500)/255000;
+          y_row2[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5]
+           +24966*rgb_row2[4*x+4]+4207500)/255000;
+          y_row2[x+1]=OC_CLAMP255(y);
+          u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6]
+           +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+           -16436*(rgb_row[4*x+1]+rgb_row[4*x+5]
+           +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+           +24808*(rgb_row[4*x+0]+rgb_row[4*x+4]
+           +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930;
+          v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6]
+           +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+           -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]
+            +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+           -6384*(rgb_row[4*x+0]+rgb_row[4*x+4]
+            +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510;
+          u_row[x>>1]=OC_CLAMP255(u);
+          v_row[x>>1]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride<<1;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride<<1;
+      }
+    }break;
+    case TH_PF_422:{
+      for(y=0;y<h;y++){
+        for(x=0;x<w;x+=2){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          y_row[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+           +24966*rgb_row[4*x+4]+4207500)/255000;
+          y_row[x+1]=OC_CLAMP255(y);
+          u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6])
+           -32872*(rgb_row[4*x+1]+rgb_row[4*x+5])
+           +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930;
+          v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6])
+           -65744*(rgb_row[4*x+1]+rgb_row[4*x+5])
+           -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510;
+          u_row[x>>1]=OC_CLAMP255(u);
+          v_row[x>>1]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride;
+      }
+    }break;
+    /*case TH_PF_444:*/
+    default:{
+      for(y=0;y<h;y++){
+        for(x=0;x<w;x++){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1]
+           +99232*rgb_row[4*x+0]+29032005)/225930;
+          v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1]
+           -25536*rgb_row[4*x+0]+45940035)/357510;
+          y_row[x]=OC_CLAMP255(y);
+          u_row[x]=OC_CLAMP255(u);
+          v_row[x]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride;
+      }
+    }break;
+  }
+  /*Finished.
+    Destroy the surface.*/
+  cairo_surface_destroy(cs);
+}
+#endif
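
The tail of the telemetry renderer above converts the Cairo ARGB surface back to Y'CbCr with fixed-point BT.601 arithmetic: the divisors 255000, 225930 and 357510 fold the scale factors, the 16/128 offsets and a +0.5 rounding term into integer constants. Below is a standalone per-pixel sketch of the same math, mirroring the TH_PF_444 branch; the helper name is hypothetical and OC_CLAMP255 is the clamping macro already used above.

static void rgb_to_ycbcr601(int _r,int _g,int _b,
 unsigned char *_y,unsigned char *_cb,unsigned char *_cr){
  int y;
  int cb;
  int cr;
  /*0.257*R+0.504*G+0.098*B+16.5, scaled by 255000.*/
  y=(65481*_r+128553*_g+24966*_b+4207500)/255000;
  /*-0.148*R-0.291*G+0.439*B+128.5, scaled by 225930.*/
  cb=(-33488*_r-65744*_g+99232*_b+29032005)/225930;
  /*0.439*R-0.368*G-0.071*B+128.5, scaled by 357510.*/
  cr=(157024*_r-131488*_g-25536*_b+45940035)/357510;
  *_y=(unsigned char)OC_CLAMP255(y);
  *_cb=(unsigned char)OC_CLAMP255(cb);
  *_cr=(unsigned char)OC_CLAMP255(cr);
}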
+
+int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
+ ogg_int64_t *_granpos){
+  int ret;
+  if(_dec==NULL||_op==NULL)return TH_EFAULT;
+  /*A completely empty packet indicates a dropped frame and is treated exactly
+     like an inter frame with no coded blocks.*/
+  if(_op->bytes==0){
+    _dec->state.frame_type=OC_INTER_FRAME;
+    _dec->state.ntotal_coded_fragis=0;
+  }
+  else{
+    oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes);
+    ret=oc_dec_frame_header_unpack(_dec);
+    if(ret<0)return ret;
+    if(_dec->state.frame_type==OC_INTRA_FRAME)oc_dec_mark_all_intra(_dec);
+    else oc_dec_coded_flags_unpack(_dec);
+  }
+  /*If there have been no reference frames, and we need one, initialize one.*/
+  if(_dec->state.frame_type!=OC_INTRA_FRAME&&
+   (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
+   _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
+    oc_dec_init_dummy_frame(_dec);
+  }
+  /*If this was an inter frame with no coded blocks...*/
+  if(_dec->state.ntotal_coded_fragis<=0){
+    /*Just update the granule position and return.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    return TH_DUPFRAME;
+  }
+  else{
+    th_ycbcr_buffer stripe_buf;
+    int             stripe_fragy;
+    int             refi;
+    int             pli;
+    int             notstart;
+    int             notdone;
+#ifdef HAVE_CAIRO
+    int             telemetry;
+    /*Save the current telemetry state.
+      This prevents it from being modified in the middle of decoding this
+       frame, which could cause us to skip calls to the striped decoding
+       callback.*/
+    telemetry=_dec->telemetry;
+#endif
+    /*Select a free buffer to use for the reconstructed version of this frame.*/
+    for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
+     refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+    _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+    _dec->state.ref_frame_data[OC_FRAME_SELF]=
+     _dec->state.ref_frame_bufs[refi][0].data;
+#if defined(HAVE_CAIRO)
+    _dec->telemetry_frame_bytes=_op->bytes;
+#endif
+    if(_dec->state.frame_type==OC_INTRA_FRAME){
+      _dec->state.keyframe_num=_dec->state.curframe_num;
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_coding_bytes=
+       _dec->telemetry_mode_bytes=
+       _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    }
+    else{
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+      oc_dec_mb_modes_unpack(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_mode_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+      oc_dec_mv_unpack_and_frag_modes_fill(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    }
+    oc_dec_block_qis_unpack(_dec);
+#if defined(HAVE_CAIRO)
+    _dec->telemetry_qi_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    oc_dec_residual_tokens_unpack(_dec);
+    /*Update granule position.
+      This must be done before the striped decode callbacks so that the
+       application knows what to do with the frame data.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    /*All of the rest of the operations -- DC prediction reversal,
+       reconstructing coded fragments, copying uncoded fragments, loop
+       filtering, extending borders, and out-of-loop post-processing -- should
+       be pipelined.
+      I.e., DC prediction reversal, reconstruction, and uncoded fragment
+       copying are done for one or two super block rows, then loop filtering is
+       run as far as it can, then bordering copying, then post-processing.
+      For 4:2:0 video a Minimum Codable Unit or MCU contains two luma super
+       block rows, and one chroma.
+      Otherwise, an MCU consists of one super block row from each plane.
+      Inside each MCU, we perform all of the steps on one color plane before
+       moving on to the next.
+      After reconstruction, the additional filtering stages introduce a delay
+       since they need some pixels from the next fragment row.
+      Thus the actual number of decoded rows available is slightly smaller for
+       the first MCU, and slightly larger for the last.
+
+      This entire process allows us to operate on the data while it is still in
+       cache, resulting in big performance improvements.
+      An application callback allows further application processing (blitting
+       to video memory, color conversion, etc.) to also use the data while it's
+       in cache.*/
+    oc_dec_pipeline_init(_dec,&_dec->pipe);
+    oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
+    notstart=0;
+    notdone=1;
+    for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){
+      int avail_fragy0;
+      int avail_fragy_end;
+      avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
+      notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end;
+      for(pli=0;pli<3;pli++){
+        oc_fragment_plane *fplane;
+        int                frag_shift;
+        int                pp_offset;
+        int                sdelay;
+        int                edelay;
+        fplane=_dec->state.fplanes+pli;
+        /*Compute the first and last fragment row of the current MCU for this
+           plane.*/
+        frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
+        _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli);
+        sdelay=edelay=0;
+        if(_dec->pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_state_loop_filter_frag_rows(&_dec->state,
+           _dec->pipe.bounding_values,OC_FRAME_SELF,pli,
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+        }
+        /*To fill the borders, we have an additional two pixel delay, since a
+           fragment in the next row could filter its top edge, using two pixels
+           from a fragment in this row.
+          But there's no reason to delay a full fragment between the two.*/
+        oc_state_borders_fill_rows(&_dec->state,refi,pli,
+         (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+        /*Out-of-loop post-processing.*/
+        pp_offset=3*(pli!=0);
+        if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+          /*Perform de-blocking in one plane.*/
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
+           _dec->state.ref_frame_bufs[refi],pli,
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+            /*Perform de-ringing in one plane.*/
+            sdelay+=notstart;
+            edelay+=notdone;
+            oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
+             _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          }
+        }
+        /*If no post-processing is done, we still need to delay a row for the
+           loop filter, thanks to the strange filtering order VP3 chose.*/
+        else if(_dec->pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+        }
+        /*Compute the intersection of the available rows in all planes.
+          If chroma is sub-sampled, the effect of each of its delays is
+           doubled, but luma might have more post-processing filters enabled
+           than chroma, so we don't know up front which one is the limiting
+           factor.*/
+        avail_fragy0=OC_MINI(avail_fragy0,
+         _dec->pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy_end=OC_MINI(avail_fragy_end,
+         _dec->pipe.fragy_end[pli]-edelay<<frag_shift);
+      }
+#ifdef HAVE_CAIRO
+      if(_dec->stripe_cb.stripe_decoded!=NULL&&!telemetry){
+#else
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+#endif
+        /*The callback might want to use the FPU, so let's make sure they can.
+          We violate all kinds of ABI restrictions by not doing this until
+           now, but none of them actually matter since we don't use floating
+           point ourselves.*/
+        oc_restore_fpu(&_dec->state);
+        /*Make the callback, ensuring we flip the sense of the "start" and
+           "end" of the available region upside down.*/
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf,
+         _dec->state.fplanes[0].nvfrags-avail_fragy_end,
+         _dec->state.fplanes[0].nvfrags-avail_fragy0);
+      }
+      notstart=1;
+    }
+    /*Finish filling in the reference frame borders.*/
+    for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
+    /*Update the reference frame indices.*/
+    if(_dec->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _dec->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_idx[OC_FRAME_PREV]=
+       _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
+    }
+    else{
+      /*Otherwise, just replace the previous reference frame.*/
+      _dec->state.ref_frame_idx[OC_FRAME_PREV]=
+       _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
+    }
+    /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG
+       gamma values, if nothing else).*/
+    oc_restore_fpu(&_dec->state);
+#ifdef HAVE_CAIRO
+    /*If telemetry ioctls are active, we need to draw to the output buffer.*/
+    if(telemetry){
+      oc_render_telemetry(_dec,stripe_buf,telemetry);
+      /*If we had a striped decoding callback, we skipped calling it above
+         (because the telemetry wasn't rendered yet).
+        Call it now with the whole frame.*/
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,
+         stripe_buf,0,_dec->state.fplanes[0].nvfrags);
+      }
+    }
+#endif
+#if defined(OC_DUMP_IMAGES)
+    /*We only dump images if there were some coded blocks.*/
+    oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec");
+#endif
+    return 0;
+  }
+}
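
The pipelining described above is what makes the striped-decode callback worthwhile: rows are handed to the application as soon as they are reconstructed and filtered, while the data is still in cache. A small sketch of registering such a callback follows; it assumes the th_stripe_callback struct and the TH_DECCTL_SET_STRIPE_CB request from theoradec.h (added elsewhere in this commit), and must run before the first th_decode_packetin() call.

static void on_stripe(void *_ctx,th_ycbcr_buffer _buf,
 int _yfrag0,int _yfrag_end){
  /*Fragment rows are 8 pixels tall, so roughly luma rows 8*_yfrag0 through
     8*_yfrag_end-1 of _buf are now safe to read or blit.*/
  (void)_ctx;(void)_buf;(void)_yfrag0;(void)_yfrag_end;
}

static void set_stripe_cb(th_dec_ctx *_dec){
  th_stripe_callback cb;
  cb.ctx=NULL;
  cb.stripe_decoded=on_stripe;
  th_decode_ctl(_dec,TH_DECCTL_SET_STRIPE_CB,&cb,sizeof(cb));
}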
+
+int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){
+  if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT;
+  oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf);
+  return 0;
+}
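
For reference, a minimal per-packet decode loop built only on the two entry points above; header parsing is assumed to have happened already, and display_frame() is a placeholder for whatever the application does with the planes.

static int decode_one_packet(th_dec_ctx *_dec,const ogg_packet *_op){
  th_ycbcr_buffer ycbcr;
  ogg_int64_t     granpos;
  int             ret;
  ret=th_decode_packetin(_dec,_op,&granpos);
  /*A zero-byte packet is a dropped frame: the previous output just repeats.*/
  if(ret==TH_DUPFRAME)return 0;
  if(ret<0)return ret;
  /*The planes point into the decoder's own frame buffer and should be
     consumed before the next packet is decoded.*/
  ret=th_decode_ycbcr_out(_dec,ycbcr);
  if(ret<0)return ret;
  /*display_frame(ycbcr,granpos);*/
  return 0;
}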

+ 27 - 0
modules/theoraplayer/native/theora/lib/defexp.awk

@@ -0,0 +1,27 @@
+# awk script to convert symbol export table formats
+
+# converts an msvc .def file to a darwin ld export-symbols-list file
+# we only support the most basic module definition syntax
+
+# skip comments
+/^\w*#.*/ {next}
+/^\w*;.*/ {next}
+
+# remember and propagate the library name
+/LIBRARY/ {name = $2; print "# export list for", name; next}
+
+# skip various other lines
+/^\w*NAME/ ||
+/^\w*VERSION/ ||
+/^\w*EXPORTS/ ||
+/^\w*HEAPSIZE/ ||
+/^\w*STACKSIZE/ ||
+/^\w*STUB/ {next}
+
+# todo: handle SECTIONS
+
+# for symbols, strip the semicolon and mangle the name
+/[a-zA-Z]+/ {sub(/\;/, ""); print "_" $1}
+
+# todo: warn if we see publicname=privatename mappings
+#       which other linkers don't support

+ 182 - 0
modules/theoraplayer/native/theora/lib/dequant.c

@@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "dequant.h"
+#include "decint.h"
+
+int oc_quant_params_unpack(oc_pack_buf *_opb,th_quant_info *_qinfo){
+  th_quant_base *base_mats;
+  long           val;
+  int            nbase_mats;
+  int            sizes[64];
+  int            indices[64];
+  int            nbits;
+  int            bmi;
+  int            ci;
+  int            qti;
+  int            pli;
+  int            qri;
+  int            qi;
+  int            i;
+  val=oc_pack_read(_opb,3);
+  nbits=(int)val;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->loop_filter_limits[qi]=(unsigned char)val;
+  }
+  val=oc_pack_read(_opb,4);
+  nbits=(int)val+1;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->ac_scale[qi]=(ogg_uint16_t)val;
+  }
+  val=oc_pack_read(_opb,4);
+  nbits=(int)val+1;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->dc_scale[qi]=(ogg_uint16_t)val;
+  }
+  val=oc_pack_read(_opb,9);
+  nbase_mats=(int)val+1;
+  base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0]));
+  if(base_mats==NULL)return TH_EFAULT;
+  for(bmi=0;bmi<nbase_mats;bmi++){
+    for(ci=0;ci<64;ci++){
+      val=oc_pack_read(_opb,8);
+      base_mats[bmi][ci]=(unsigned char)val;
+    }
+  }
+  nbits=oc_ilog(nbase_mats-1);
+  for(i=0;i<6;i++){
+    th_quant_ranges *qranges;
+    th_quant_base   *qrbms;
+    int             *qrsizes;
+    qti=i/3;
+    pli=i%3;
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    if(i>0){
+      val=oc_pack_read1(_opb);
+      if(!val){
+        int qtj;
+        int plj;
+        if(qti>0){
+          val=oc_pack_read1(_opb);
+          if(val){
+            qtj=qti-1;
+            plj=pli;
+          }
+          else{
+            qtj=(i-1)/3;
+            plj=(i-1)%3;
+          }
+        }
+        else{
+          qtj=(i-1)/3;
+          plj=(i-1)%3;
+        }
+        *qranges=*(_qinfo->qi_ranges[qtj]+plj);
+        continue;
+      }
+    }
+    val=oc_pack_read(_opb,nbits);
+    indices[0]=(int)val;
+    for(qi=qri=0;qi<63;){
+      val=oc_pack_read(_opb,oc_ilog(62-qi));
+      sizes[qri]=(int)val+1;
+      qi+=(int)val+1;
+      val=oc_pack_read(_opb,nbits);
+      indices[++qri]=(int)val;
+    }
+    /*Note: The caller is responsible for cleaning up any partially
+       constructed qinfo.*/
+    if(qi>63){
+      _ogg_free(base_mats);
+      return TH_EBADHEADER;
+    }
+    qranges->nranges=qri;
+    qranges->sizes=qrsizes=(int *)_ogg_malloc(qri*sizeof(qrsizes[0]));
+    if(qranges->sizes==NULL){
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      _ogg_free(base_mats);
+      return TH_EFAULT;
+    }
+    memcpy(qrsizes,sizes,qri*sizeof(qrsizes[0]));
+    qrbms=(th_quant_base *)_ogg_malloc((qri+1)*sizeof(qrbms[0]));
+    if(qrbms==NULL){
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      _ogg_free(base_mats);
+      return TH_EFAULT;
+    }
+    qranges->base_matrices=(const th_quant_base *)qrbms;
+    do{
+      bmi=indices[qri];
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      if(bmi>=nbase_mats){
+        _ogg_free(base_mats);
+        return TH_EBADHEADER;
+      }
+      memcpy(qrbms[qri],base_mats[bmi],sizeof(qrbms[qri]));
+    }
+    while(qri-->0);
+  }
+  _ogg_free(base_mats);
+  return 0;
+}
+
+void oc_quant_params_clear(th_quant_info *_qinfo){
+  int i;
+  for(i=6;i-->0;){
+    int qti;
+    int pli;
+    qti=i/3;
+    pli=i%3;
+    /*Clear any duplicate pointer references.*/
+    if(i>0){
+      int qtj;
+      int plj;
+      qtj=(i-1)/3;
+      plj=(i-1)%3;
+      if(_qinfo->qi_ranges[qti][pli].sizes==
+       _qinfo->qi_ranges[qtj][plj].sizes){
+        _qinfo->qi_ranges[qti][pli].sizes=NULL;
+      }
+      if(_qinfo->qi_ranges[qti][pli].base_matrices==
+       _qinfo->qi_ranges[qtj][plj].base_matrices){
+        _qinfo->qi_ranges[qti][pli].base_matrices=NULL;
+      }
+    }
+    if(qti>0){
+      if(_qinfo->qi_ranges[1][pli].sizes==
+       _qinfo->qi_ranges[0][pli].sizes){
+        _qinfo->qi_ranges[1][pli].sizes=NULL;
+      }
+      if(_qinfo->qi_ranges[1][pli].base_matrices==
+       _qinfo->qi_ranges[0][pli].base_matrices){
+        _qinfo->qi_ranges[1][pli].base_matrices=NULL;
+      }
+    }
+    /*Now free all the non-duplicate storage.*/
+    _ogg_free((void *)_qinfo->qi_ranges[qti][pli].sizes);
+    _ogg_free((void *)_qinfo->qi_ranges[qti][pli].base_matrices);
+  }
+}
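
A minimal sketch of how a caller might drive the two functions above when parsing a setup header: unpack into a zero-initialized th_quant_info and, on failure, release whatever was partially built, which is exactly the responsibility the comments above leave to the caller. memset() from <string.h> is assumed, and _ogg_free() is expected to ignore the NULL pointers left by the memset.

static int read_quant_params(oc_pack_buf *_opb,th_quant_info *_qinfo){
  int ret;
  memset(_qinfo,0,sizeof(*_qinfo));
  ret=oc_quant_params_unpack(_opb,_qinfo);
  /*Free any ranges that were allocated before the failure.*/
  if(ret<0)oc_quant_params_clear(_qinfo);
  return ret;
}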

+ 27 - 0
modules/theoraplayer/native/theora/lib/dequant.h

@@ -0,0 +1,27 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_dequant_H)
+# define _dequant_H (1)
+# include "quant.h"
+# include "bitpack.h"
+
+int oc_quant_params_unpack(oc_pack_buf *_opb,
+ th_quant_info *_qinfo);
+void oc_quant_params_clear(th_quant_info *_qinfo);
+
+#endif

+ 168 - 0
modules/theoraplayer/native/theora/lib/encapiwrapper.c

@@ -0,0 +1,168 @@
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+#include "encint.h"
+#include "theora/theoraenc.h"
+
+
+
+static void th_enc_api_clear(th_api_wrapper *_api){
+  if(_api->encode)th_encode_free(_api->encode);
+  memset(_api,0,sizeof(*_api));
+}
+
+static void theora_encode_clear(theora_state *_te){
+  if(_te->i!=NULL)theora_info_clear(_te->i);
+  memset(_te,0,sizeof(*_te));
+}
+
+static int theora_encode_control(theora_state *_te,int _req,
+ void *_buf,size_t _buf_sz){
+  return th_encode_ctl(((th_api_wrapper *)_te->i->codec_setup)->encode,
+   _req,_buf,_buf_sz);
+}
+
+static ogg_int64_t theora_encode_granule_frame(theora_state *_te,
+ ogg_int64_t _gp){
+  return th_granule_frame(((th_api_wrapper *)_te->i->codec_setup)->encode,_gp);
+}
+
+static double theora_encode_granule_time(theora_state *_te,ogg_int64_t _gp){
+  return th_granule_time(((th_api_wrapper *)_te->i->codec_setup)->encode,_gp);
+}
+
+static const oc_state_dispatch_vtable OC_ENC_DISPATCH_VTBL={
+  (oc_state_clear_func)theora_encode_clear,
+  (oc_state_control_func)theora_encode_control,
+  (oc_state_granule_frame_func)theora_encode_granule_frame,
+  (oc_state_granule_time_func)theora_encode_granule_time,
+};
+
+int theora_encode_init(theora_state *_te,theora_info *_ci){
+  th_api_info *apiinfo;
+  th_info      info;
+  ogg_uint32_t keyframe_frequency_force;
+  /*Allocate our own combined API wrapper/theora_info struct.
+    We put them both in one malloc'd block so that when the API wrapper is
+     freed, the info struct goes with it.
+    This avoids having to figure out whether or not we need to free the info
+     struct in either theora_info_clear() or theora_clear().*/
+  apiinfo=(th_api_info *)_ogg_malloc(sizeof(*apiinfo));
+  if(apiinfo==NULL)return TH_EFAULT;
+  /*Make our own copy of the info struct, since its lifetime should be
+     independent of the one we were passed in.*/
+  *&apiinfo->info=*_ci;
+  oc_theora_info2th_info(&info,_ci);
+  apiinfo->api.encode=th_encode_alloc(&info);
+  if(apiinfo->api.encode==NULL){
+    _ogg_free(apiinfo);
+    return OC_EINVAL;
+  }
+  apiinfo->api.clear=(oc_setup_clear_func)th_enc_api_clear;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _te->internal_encode=(void *)&OC_ENC_DISPATCH_VTBL;
+  _te->internal_decode=NULL;
+  _te->granulepos=0;
+  _te->i=&apiinfo->info;
+  _te->i->codec_setup=&apiinfo->api;
+  /*Set the precise requested keyframe frequency.*/
+  keyframe_frequency_force=_ci->keyframe_auto_p?
+   _ci->keyframe_frequency_force:_ci->keyframe_frequency;
+  th_encode_ctl(apiinfo->api.encode,
+   TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
+   &keyframe_frequency_force,sizeof(keyframe_frequency_force));
+  /*TODO: Additional codec setup using the extra fields in theora_info.*/
+  return 0;
+}
+
+int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
+  th_api_wrapper  *api;
+  th_ycbcr_buffer  buf;
+  int              ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  buf[0].width=_yuv->y_width;
+  buf[0].height=_yuv->y_height;
+  buf[0].stride=_yuv->y_stride;
+  buf[0].data=_yuv->y;
+  buf[1].width=_yuv->uv_width;
+  buf[1].height=_yuv->uv_height;
+  buf[1].stride=_yuv->uv_stride;
+  buf[1].data=_yuv->u;
+  buf[2].width=_yuv->uv_width;
+  buf[2].height=_yuv->uv_height;
+  buf[2].stride=_yuv->uv_stride;
+  buf[2].data=_yuv->v;
+  ret=th_encode_ycbcr_in(api->encode,buf);
+  if(ret<0)return ret;
+  _te->granulepos=api->encode->state.granpos;
+  return ret;
+}
+
+int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  return th_encode_packetout(api->encode,_last_p,_op);
+}
+
+int theora_encode_header(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx     *enc;
+  th_api_wrapper *api;
+  int             ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  enc=api->encode;
+  /*If we've already started encoding, fail.*/
+  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0){
+    return TH_EINVAL;
+  }
+  /*Reset the state to make sure we output an info packet.*/
+  enc->packet_state=OC_PACKET_INFO_HDR;
+  ret=th_encode_flushheader(api->encode,NULL,_op);
+  return ret>=0?0:ret;
+}
+
+int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
+  oggpack_buffer  opb;
+  void           *buf;
+  int             packet_state;
+  int             ret;
+  packet_state=OC_PACKET_COMMENT_HDR;
+  oggpackB_writeinit(&opb);
+  ret=oc_state_flushheader(NULL,&packet_state,&opb,NULL,NULL,
+   th_version_string(),(th_comment *)_tc,_op);
+  if(ret>=0){
+    /*The oggpack_buffer's lifetime ends with this function, so we have to
+       copy out the packet contents.
+      Presumably the application knows it is supposed to free this.
+      This part works nothing like the Vorbis API, and the documentation on it
+       has been wrong for some time, claiming libtheora owned the memory.*/
+    buf=_ogg_malloc(_op->bytes);
+    if(buf==NULL){
+      _op->packet=NULL;
+      ret=TH_EFAULT;
+    }
+    else{
+      memcpy(buf,_op->packet,_op->bytes);
+      _op->packet=buf;
+      ret=0;
+    }
+  }
+  oggpack_writeclear(&opb);
+  return ret;
+}
+
+int theora_encode_tables(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx     *enc;
+  th_api_wrapper *api;
+  int             ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  enc=api->encode;
+  /*If we've already started encoding, fail.*/
+  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0){
+    return TH_EINVAL;
+  }
+  /*Reset the state to make sure we output a setup packet.*/
+  enc->packet_state=OC_PACKET_SETUP_HDR;
+  ret=th_encode_flushheader(api->encode,NULL,_op);
+  return ret>=0?0:ret;
+}

+ 379 - 0
modules/theoraplayer/native/theora/lib/encfrag.c

@@ -0,0 +1,379 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encfrag.c 17821 2011-02-09 22:08:34Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int j;
+    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+}
+
+void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
+ const unsigned char *_src,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int j;
+    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-128);
+    _src+=_ystride;
+  }
+}
+
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
+    if(sad>_thresh)break;
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-(_ref1[j]+_ref2[j]>>1));
+    if(sad>_thresh)break;
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){
+  const unsigned char *src = _src;
+  unsigned dc;
+  unsigned sad;
+  int      i;
+  dc=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)dc+=src[j];
+    src+=_ystride;
+  }
+  dc=dc+32>>6;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-dc);
+    _src+=_ystride;
+  }
+  return sad;
+}
+
+static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;
+    /*Hadamard stage 1:*/
+    t0=_src[0]-_ref[0]+_src[4]-_ref[4];
+    t4=_src[0]-_ref[0]-_src[4]+_ref[4];
+    t1=_src[1]-_ref[1]+_src[5]-_ref[5];
+    t5=_src[1]-_ref[1]-_src[5]+_ref[5];
+    t2=_src[2]-_ref[2]+_src[6]-_ref[6];
+    t6=_src[2]-_ref[2]-_src[6]+_ref[6];
+    t3=_src[3]-_ref[3]+_src[7]-_ref[7];
+    t7=_src[3]-_ref[3]-_src[7]+_ref[7];
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+}
+
+static void oc_diff_hadamard2(ogg_int16_t _buf[64],const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;
+    /*Hadamard stage 1:*/
+    r=_ref1[0]+_ref2[0]>>1;
+    t4=_ref1[4]+_ref2[4]>>1;
+    t0=_src[0]-r+_src[4]-t4;
+    t4=_src[0]-r-_src[4]+t4;
+    r=_ref1[1]+_ref2[1]>>1;
+    t5=_ref1[5]+_ref2[5]>>1;
+    t1=_src[1]-r+_src[5]-t5;
+    t5=_src[1]-r-_src[5]+t5;
+    r=_ref1[2]+_ref2[2]>>1;
+    t6=_ref1[6]+_ref2[6]>>1;
+    t2=_src[2]-r+_src[6]-t6;
+    t6=_src[2]-r-_src[6]+t6;
+    r=_ref1[3]+_ref2[3]>>1;
+    t7=_ref1[7]+_ref2[7]>>1;
+    t3=_src[3]-r+_src[7]-t7;
+    t7=_src[3]-r-_src[7]+t7;
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+}
+
+static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
+ int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;
+    /*Hadamard stage 1:*/
+    t0=_src[0]+_src[4];
+    t4=_src[0]-_src[4];
+    t1=_src[1]+_src[5];
+    t5=_src[1]-_src[5];
+    t2=_src[2]+_src[6];
+    t6=_src[2]-_src[6];
+    t3=_src[3]+_src[7];
+    t7=_src[3]-_src[7];
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+  }
+}
+
+unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
+  unsigned sad;
+  int      dc;
+  int      t0;
+  int      t1;
+  int      t2;
+  int      t3;
+  int      t4;
+  int      t5;
+  int      t6;
+  int      t7;
+  int      r;
+  int      i;
+  sad=dc=0;
+  for(i=0;i<8;i++){
+    /*Hadamard stage 1:*/
+    t0=_buf[i*8+0]+_buf[i*8+4];
+    t4=_buf[i*8+0]-_buf[i*8+4];
+    t1=_buf[i*8+1]+_buf[i*8+5];
+    t5=_buf[i*8+1]-_buf[i*8+5];
+    t2=_buf[i*8+2]+_buf[i*8+6];
+    t6=_buf[i*8+2]-_buf[i*8+6];
+    t3=_buf[i*8+3]+_buf[i*8+7];
+    t7=_buf[i*8+3]-_buf[i*8+7];
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    r=abs(t0+t1)&-(i>0);
+    r+=abs(t0-t1);
+    r+=abs(t2+t3);
+    r+=abs(t2-t3);
+    r+=abs(t4+t5);
+    r+=abs(t4-t5);
+    r+=abs(t6+t7);
+    r+=abs(t6-t7);
+    sad+=r;
+  }
+  dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
+  *_dc=dc;
+  return sad;
+}
+
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ogg_int16_t buf[64];
+  oc_diff_hadamard(buf,_src,_ref,_ystride);
+  return oc_hadamard_sad(_dc,buf);
+}
+
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  ogg_int16_t buf[64];
+  oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
+  return oc_hadamard_sad(_dc,buf);
+}
+
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride){
+  ogg_int16_t buf[64];
+  oc_intra_hadamard(buf,_src,_ystride);
+  return oc_hadamard_sad(_dc,buf);
+}
+
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++,_mask>>=1){
+      if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    }
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  int i;
+  int j;
+  for(i=8;i-->0;){
+    for(j=0;j<8;j++)_dst[j]=_src1[j]+_src2[j]>>1;
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}

+ 121 - 0
modules/theoraplayer/native/theora/lib/encinfo.c

@@ -0,0 +1,121 @@
+#include <stdlib.h>
+#include <string.h>
+#include "state.h"
+#include "enquant.h"
+#include "huffenc.h"
+
+
+
+/*Packs a series of octets from a given byte array into the pack buffer.
+  _opb: The pack buffer to store the octets in.
+  _buf: The byte array containing the bytes to pack.
+  _len: The number of octets to pack.*/
+static void oc_pack_octets(oggpack_buffer *_opb,const char *_buf,int _len){
+  int i;
+  for(i=0;i<_len;i++)oggpackB_write(_opb,_buf[i],8);
+}
+
+
+
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op){
+  unsigned char *packet;
+  int            b_o_s;
+  if(_op==NULL)return TH_EFAULT;
+  switch(*_packet_state){
+    /*Codec info header.*/
+    case OC_PACKET_INFO_HDR:{
+      if(_state==NULL)return TH_EFAULT;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the info header.*/
+      oggpackB_write(_opb,0x80,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the codec bitstream version.*/
+      oggpackB_write(_opb,TH_VERSION_MAJOR,8);
+      oggpackB_write(_opb,TH_VERSION_MINOR,8);
+      oggpackB_write(_opb,TH_VERSION_SUB,8);
+      /*Describe the encoded frame.*/
+      oggpackB_write(_opb,_state->info.frame_width>>4,16);
+      oggpackB_write(_opb,_state->info.frame_height>>4,16);
+      oggpackB_write(_opb,_state->info.pic_width,24);
+      oggpackB_write(_opb,_state->info.pic_height,24);
+      oggpackB_write(_opb,_state->info.pic_x,8);
+      oggpackB_write(_opb,_state->info.pic_y,8);
+      oggpackB_write(_opb,_state->info.fps_numerator,32);
+      oggpackB_write(_opb,_state->info.fps_denominator,32);
+      oggpackB_write(_opb,_state->info.aspect_numerator,24);
+      oggpackB_write(_opb,_state->info.aspect_denominator,24);
+      oggpackB_write(_opb,_state->info.colorspace,8);
+      oggpackB_write(_opb,_state->info.target_bitrate,24);
+      oggpackB_write(_opb,_state->info.quality,6);
+      oggpackB_write(_opb,_state->info.keyframe_granule_shift,5);
+      oggpackB_write(_opb,_state->info.pixel_fmt,2);
+      /*Spare configuration bits.*/
+      oggpackB_write(_opb,0,3);
+      b_o_s=1;
+    }break;
+    /*Comment header.*/
+    case OC_PACKET_COMMENT_HDR:{
+      int vendor_len;
+      int i;
+      if(_tc==NULL)return TH_EFAULT;
+      vendor_len=strlen(_vendor);
+      oggpackB_reset(_opb);
+      /*Mark this packet as the comment header.*/
+      oggpackB_write(_opb,0x81,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the vendor string.*/
+      oggpack_write(_opb,vendor_len,32);
+      oc_pack_octets(_opb,_vendor,vendor_len);
+      oggpack_write(_opb,_tc->comments,32);
+      for(i=0;i<_tc->comments;i++){
+        if(_tc->user_comments[i]!=NULL){
+          oggpack_write(_opb,_tc->comment_lengths[i],32);
+          oc_pack_octets(_opb,_tc->user_comments[i],_tc->comment_lengths[i]);
+        }
+        else oggpack_write(_opb,0,32);
+      }
+      b_o_s=0;
+    }break;
+    /*Codec setup header.*/
+    case OC_PACKET_SETUP_HDR:{
+      int ret;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the setup header.*/
+      oggpackB_write(_opb,0x82,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the quantizer tables.*/
+      oc_quant_params_pack(_opb,_qinfo);
+      /*Write the huffman codes.*/
+      ret=oc_huff_codes_pack(_opb,_codes);
+      /*This should never happen, because we validate the tables when they
+         are set.
+        If you see this, there's a good chance memory is being corrupted.*/
+      if(ret<0)return ret;
+      b_o_s=0;
+    }break;
+    /*No more headers to emit.*/
+    default:return 0;
+  }
+  /*This is kind of fugly: we hand the user a buffer which they do not own.
+    We will overwrite it when the next packet is output, so the user better be
+     done with it by then.
+    Vorbis is little better: it hands back buffers that it will free the next
+     time the headers are requested, or when the encoder is cleared.
+    Hopefully libogg2 will make this much cleaner.*/
+  packet=oggpackB_get_buffer(_opb);
+  /*If there's no packet, malloc failed while writing.*/
+  if(packet==NULL)return TH_EFAULT;
+  _op->packet=packet;
+  _op->bytes=oggpackB_bytes(_opb);
+  _op->b_o_s=b_o_s;
+  _op->e_o_s=0;
+  _op->granulepos=0;
+  _op->packetno=*_packet_state+3;
+  return ++(*_packet_state)+3;
+}

+ 845 - 0
modules/theoraplayer/native/theora/lib/encint.h

@@ -0,0 +1,845 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encint.h 18223 2012-03-31 18:49:57Z gmaxwell $
+
+ ********************************************************************/
+#if !defined(_encint_H)
+# define _encint_H (1)
+# include "theora/theoraenc.h"
+# include "state.h"
+# include "mathops.h"
+# include "enquant.h"
+# include "huffenc.h"
+/*# define OC_COLLECT_METRICS*/
+
+
+
+typedef oc_mv                         oc_mv2[2];
+
+typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
+typedef struct oc_enc_opt_data        oc_enc_opt_data;
+typedef struct oc_mb_enc_info         oc_mb_enc_info;
+typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_fr_state            oc_fr_state;
+typedef struct oc_qii_state           oc_qii_state;
+typedef struct oc_enc_pipeline_state  oc_enc_pipeline_state;
+typedef struct oc_mode_rd             oc_mode_rd;
+typedef struct oc_iir_filter          oc_iir_filter;
+typedef struct oc_frame_metrics       oc_frame_metrics;
+typedef struct oc_rc_state            oc_rc_state;
+typedef struct th_enc_ctx             oc_enc_ctx;
+typedef struct oc_token_checkpoint    oc_token_checkpoint;
+
+
+
+/*Encoder-specific accelerated functions.*/
+# if defined(OC_X86_ASM)
+#  if defined(_MSC_VER)
+#   include "x86_vc/x86enc.h"
+#  else
+#   include "x86/x86enc.h"
+#  endif
+# endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armenc.h"
+# endif
+
+# if !defined(oc_enc_accel_init)
+#  define oc_enc_accel_init oc_enc_accel_init_c
+# endif
+# if defined(OC_ENC_USE_VTABLE)
+#  if !defined(oc_enc_frag_sub)
+#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sub_128)
+#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sub_128)(_diff,_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sad)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sad_thresh)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  ((*(_enc)->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh))
+#  endif
+#  if !defined(oc_enc_frag_sad2_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  ((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh))
+#  endif
+#  if !defined(oc_enc_frag_intra_sad)
+#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_satd)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_satd2)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  ((*(_enc)->opt_vtable.frag_satd2)(_dc,_src,_ref1,_ref2,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_intra_satd)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_intra_satd)(_dc,_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_ssd)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_ssd)(_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_border_ssd)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  ((*(_enc)->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask))
+#  endif
+#  if !defined(oc_enc_frag_copy2)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  ((*(_enc)->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride))
+#  endif
+#  if !defined(oc_enc_enquant_table_init)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  ((*(_enc)->opt_vtable.enquant_table_init)(_enquant,_dequant))
+#  endif
+#  if !defined(oc_enc_enquant_table_fixup)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  ((*(_enc)->opt_vtable.enquant_table_fixup)(_enquant,_nqis))
+#  endif
+#  if !defined(oc_enc_quantize)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  ((*(_enc)->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant))
+#  endif
+#  if !defined(oc_enc_frag_recon_intra)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  ((*(_enc)->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue))
+#  endif
+#  if !defined(oc_enc_frag_recon_inter)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  ((*(_enc)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
+#  endif
+#  if !defined(oc_enc_fdct8x8)
+#   define oc_enc_fdct8x8(_enc,_y,_x) \
+  ((*(_enc)->opt_vtable.fdct8x8)(_y,_x))
+#  endif
+# else
+#  if !defined(oc_enc_frag_sub)
+#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
+  oc_enc_frag_sub_c(_diff,_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sub_128)
+#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
+  oc_enc_frag_sub_128_c(_diff,_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sad)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_sad_c(_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sad_thresh)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  oc_enc_frag_sad_thresh_c(_src,_ref,_ystride,_thresh)
+#  endif
+#  if !defined(oc_enc_frag_sad2_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh)
+#  endif
+#  if !defined(oc_enc_frag_intra_sad)
+#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
+  oc_enc_frag_intra_sad_c(_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_satd)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  oc_enc_frag_satd_c(_dc,_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_satd2)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  oc_enc_frag_satd2_c(_dc,_src,_ref1,_ref2,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_intra_satd)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  oc_enc_frag_intra_satd_c(_dc,_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_ssd)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_ssd_c(_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_border_ssd)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  oc_enc_frag_border_ssd_c(_src,_ref,_ystride,_mask)
+#  endif
+#  if !defined(oc_enc_frag_copy2)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  oc_enc_frag_copy2_c(_dst,_src1,_src2,_ystride)
+#  endif
+#  if !defined(oc_enc_enquant_table_init)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  oc_enc_enquant_table_init_c(_enquant,_dequant)
+#  endif
+#  if !defined(oc_enc_enquant_table_fixup)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  oc_enc_enquant_table_fixup_c(_enquant,_nqis)
+#  endif
+#  if !defined(oc_enc_quantize)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  oc_enc_quantize_c(_qdct,_dct,_dequant,_enquant)
+#  endif
+#  if !defined(oc_enc_frag_recon_intra)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_c(_dst,_ystride,_residue)
+#  endif
+#  if !defined(oc_enc_frag_recon_inter)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
+#  endif
+#  if !defined(oc_enc_fdct8x8)
+#   define oc_enc_fdct8x8(_enc,_y,_x) oc_enc_fdct8x8_c(_y,_x)
+#  endif
+# endif
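
Both halves of the conditional above implement the same dispatch trick: with OC_ENC_USE_VTABLE defined, each oc_enc_* macro routes through a per-context function pointer filled in by oc_enc_accel_init(); otherwise it binds straight to the C fallback so the indirection, and the table itself, can be compiled away. A toy reproduction of the pattern with purely hypothetical names:

typedef struct toy_enc toy_enc;

unsigned toy_sad_c(const unsigned char *_src,const unsigned char *_ref,int _n);

struct toy_enc{
  /*Filled in at init time with the best implementation for this CPU.*/
  unsigned (*sad)(const unsigned char *_src,const unsigned char *_ref,int _n);
};

#if defined(TOY_USE_VTABLE)
# define toy_sad(_enc,_src,_ref,_n) ((*(_enc)->sad)(_src,_ref,_n))
#else
# define toy_sad(_enc,_src,_ref,_n) toy_sad_c(_src,_ref,_n)
#endif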
+
+
+
+/*Constants for the packet-out state machine specific to the encoder.*/
+
+/*Next packet to emit: Data packet, but none are ready yet.*/
+#define OC_PACKET_EMPTY (0)
+/*Next packet to emit: Data packet, and one is ready.*/
+#define OC_PACKET_READY (1)
+
+/*All features enabled.*/
+#define OC_SP_LEVEL_SLOW          (0)
+/*Enable early skip.*/
+#define OC_SP_LEVEL_EARLY_SKIP    (1)
+/*Use analysis shortcuts, single quantizer, and faster tokenization.*/
+#define OC_SP_LEVEL_FAST_ANALYSIS (2)
+/*Use SAD instead of SATD*/
+#define OC_SP_LEVEL_NOSATD        (3)
+/*Disable motion compensation.*/
+#define OC_SP_LEVEL_NOMC          (4)
+/*Maximum valid speed level.*/
+#define OC_SP_LEVEL_MAX           (4)
+
+
+/*The number of extra bits of precision at which to store rate metrics.*/
+# define OC_BIT_SCALE  (6)
+/*The number of extra bits of precision at which to store RMSE metrics.
+  This must be at least half OC_BIT_SCALE (rounded up).*/
+# define OC_RMSE_SCALE (5)
+/*The number of quantizer bins to partition statistics into.*/
+# define OC_LOGQ_BINS  (8)
+/*The number of SAD/SATD bins to partition statistics into.*/
+# define OC_COMP_BINS   (24)
+/*The number of bits of precision to drop from SAD and SATD scores
+   to assign them to a bin.*/
+# define OC_SAD_SHIFT  (6)
+# define OC_SATD_SHIFT (9)
+
+/*Masking is applied by scaling the D used in R-D optimization (via rd_scale)
+   or the lambda parameter (via rd_iscale).
+  These are only equivalent within a single block; when more than one block is
+   being considered, the former is the interpretation used.*/
+
+/*This must be at least 4 for OC_RD_SKIP_SCALE() to work below.*/
+# define OC_RD_SCALE_BITS (12-OC_BIT_SCALE)
+# define OC_RD_ISCALE_BITS (11)
+
+/*This macro is applied to _ssd values with just 4 bits of headroom
+   ((15-OC_RMSE_SCALE)*2+OC_BIT_SCALE+2); since we want to allow rd_scales as
+   large as 16, and need additional fractional bits, our only recourse that
+   doesn't lose precision on blocks with very small SSDs is to use a wider
+   multiply.*/
+# if LONG_MAX>2147483647
+#  define OC_RD_SCALE(_ssd,_rd_scale) \
+ ((unsigned)((unsigned long)(_ssd)*(_rd_scale) \
+ +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
+# else
+#  define OC_RD_SCALE(_ssd,_rd_scale) \
+ (((_ssd)>>OC_RD_SCALE_BITS)*(_rd_scale) \
+ +(((_ssd)&(1<<OC_RD_SCALE_BITS)-1)*(_rd_scale) \
+ +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
+# endif
+# define OC_RD_SKIP_SCALE(_ssd,_rd_scale) \
+ ((_ssd)*(_rd_scale)+((1<<OC_RD_SCALE_BITS-4)>>1)>>OC_RD_SCALE_BITS-4)
+# define OC_RD_ISCALE(_lambda,_rd_iscale) \
+ ((_lambda)*(_rd_iscale)+((1<<OC_RD_ISCALE_BITS)>>1)>>OC_RD_ISCALE_BITS)
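
As a quick sanity check on the fixed-point arithmetic above: with OC_BIT_SCALE at 6, OC_RD_SCALE_BITS is also 6, so _rd_scale is a Q6 factor (64 means 1.0, 96 means 1.5) and the (1<<6)>>1 term rounds before the shift. Either branch of the macro gives the same results here.

#include <assert.h>

static void rd_scale_check(void){
  assert(OC_RD_SCALE(1000,64)==1000); /*(64000+32)>>6==1000*/
  assert(OC_RD_SCALE(1000,96)==1500); /*(96000+32)>>6==1500*/
}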
+
+
+/*The bits used for each of the MB mode codebooks.*/
+extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
+
+/*The bits used for each of the MV codebooks.*/
+extern const unsigned char OC_MV_BITS[2][64];
+
+/*The minimum value that can be stored in a SB run for each codeword.
+  The last entry is the upper bound on the length of a single SB run.*/
+extern const ogg_uint16_t  OC_SB_RUN_VAL_MIN[8];
+/*The bits used for each SB run codeword.*/
+extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
+
+/*The bits used for each block run length (starting with 1).*/
+extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
+
+
+
+/*Encoder specific functions with accelerated variants.*/
+struct oc_enc_opt_vtable{
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
+  unsigned (*frag_sad)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_sad_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride);
+  unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+  unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
+  unsigned (*frag_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_border_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+  void     (*frag_copy2)(unsigned char *_dst,
+   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*enquant_table_init)(void *_enquant,
+   const ogg_uint16_t _dequant[64]);
+  void     (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
+  int      (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+   const ogg_uint16_t _dequant[64],const void *_enquant);
+  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void     (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+};
+
+
+/*Encoder specific data that varies according to which variants of the above
+   functions are used.*/
+struct oc_enc_opt_data{
+  /*The size of a single quantizer table.
+    This must be a multiple of enquant_table_alignment.*/
+  size_t               enquant_table_size;
+  /*The alignment required for the quantizer tables.
+    This must be a positive power of two.*/
+  int                  enquant_table_alignment;
+};
+
+
+void oc_enc_accel_init(oc_enc_ctx *_enc);
+
+
+
+/*Encoder-specific macroblock information.*/
+struct oc_mb_enc_info{
+  /*Neighboring macro blocks that have MVs available from the current frame.*/
+  unsigned      cneighbors[4];
+  /*Neighboring macro blocks to use for MVs from the previous frame.*/
+  unsigned      pneighbors[4];
+  /*The number of current-frame neighbors.*/
+  unsigned char ncneighbors;
+  /*The number of previous-frame neighbors.*/
+  unsigned char npneighbors;
+  /*Flags indicating which MB modes have been refined.*/
+  unsigned char refined;
+  /*Motion vectors for a macro block for the current frame and the
+     previous two frames.
+    Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
+     can be used to estimate constant velocity and constant acceleration
+     predictors.
+    Uninitialized MVs are (0,0).*/
+  oc_mv2        analysis_mv[3];
+  /*Current unrefined analysis MVs.*/
+  oc_mv         unref_mv[2];
+  /*Unrefined block MVs.*/
+  oc_mv         block_mv[4];
+  /*Refined block MVs.*/
+  oc_mv         ref_mv[4];
+  /*Minimum motion estimation error from the analysis stage.*/
+  ogg_uint16_t  error[2];
+  /*MB error for half-pel refinement for each frame type.*/
+  unsigned      satd[2];
+  /*Block error for half-pel refinement.*/
+  unsigned      block_satd[4];
+};
+
+
+
+/*State machine to estimate the opportunity cost of coding a MB mode.*/
+struct oc_mode_scheme_chooser{
+  /*Pointers to a list containing the index of each mode in the mode

+     alphabet used by each scheme.
+    The first entry points to the dynamic scheme0_ranks, while the remaining 7
+     point to the constant entries stored in OC_MODE_SCHEMES.*/
+  const unsigned char *mode_ranks[8];
+  /*The ranks for each mode when coded with scheme 0.
+    These are optimized so that the more frequent modes have lower ranks.*/
+  unsigned char        scheme0_ranks[OC_NMODES];
+  /*The list of modes, sorted in descending order of frequency, that
+    corresponds to the ranks above.*/
+  unsigned char        scheme0_list[OC_NMODES];
+  /*The number of times each mode has been chosen so far.*/
+  unsigned             mode_counts[OC_NMODES];
+  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
+  unsigned char        scheme_list[8];
+  /*The number of bits used by each mode coding scheme.*/
+  ptrdiff_t            scheme_bits[8];
+};
+
+
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
+
+
+
+/*State to track coded block flags and their bit cost.
+  We use opportunity cost to measure the bits required to code or skip the next
+   block, using the cheaper of the cost to code it fully or partially, so long
+   as both are possible.*/
+struct oc_fr_state{
+  /*The number of bits required for the coded block flags so far this frame.*/
+  ptrdiff_t  bits;
+  /*The length of the current run for the partial super block flag, not
+     including the current super block.*/
+  unsigned   sb_partial_count:16;
+  /*The length of the current run for the full super block flag, not
+     including the current super block.*/
+  unsigned   sb_full_count:16;
+  /*The length of the coded block flag run when the current super block
+     started.*/
+  unsigned   b_coded_count_prev:6;
+  /*The coded block flag when the current super block started.*/
+  signed int b_coded_prev:2;
+  /*The length of the current coded block flag run.*/
+  unsigned   b_coded_count:6;
+  /*The current coded block flag.*/
+  signed int b_coded:2;
+  /*The number of blocks processed in the current super block.*/
+  unsigned   b_count:5;
+  /*Whether or not it is cheaper to code the current super block partially,
+     even if it could still be coded fully.*/
+  unsigned   sb_prefer_partial:1;
+  /*Whether the last super block was coded partially.*/
+  signed int sb_partial:2;
+  /*The number of bits required for the flags for the current super block.*/
+  unsigned   sb_bits:6;
+  /*Whether the last non-partial super block was coded fully.*/
+  signed int sb_full:2;
+};
+
+
+
+struct oc_qii_state{
+  ptrdiff_t  bits;
+  unsigned   qi01_count:14;
+  signed int qi01:2;
+  unsigned   qi12_count:14;
+  signed int qi12:2;
+};
+
+
+
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+  /*DCT coefficient storage.
+    This is kept off the stack because a) gcc can't align things on the stack
+     reliably on ARM, and b) it avoids (unintentional) data hazards between
+     ARM and NEON code.*/
+  OC_ALIGN16(ogg_int16_t dct_data[64*3]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  oc_fr_state         fr[3];
+  oc_qii_state        qs[3];
+  /*Skip SSD storage for the current MCU in each plane.*/
+  unsigned           *skip_ssd[3];
+  /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+  ptrdiff_t          *coded_fragis[3];
+  ptrdiff_t          *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  /*The starting fragment for the current MCU in each plane.*/
+  ptrdiff_t           froffset[3];
+  /*The starting row for the current MCU in each plane.*/
+  int                 fragy0[3];
+  /*The ending row for the current MCU in each plane.*/
+  int                 fragy_end[3];
+  /*The starting superblock for the current MCU in each plane.*/
+  unsigned            sbi0[3];
+  /*The ending superblock for the current MCU in each plane.*/
+  unsigned            sbi_end[3];
+  /*The number of tokens for zzi=1 for each color plane.*/
+  int                 ndct_tokens1[3];
+  /*The outstanding eob_run count for zzi=1 for each color plane.*/
+  int                 eob_run1[3];
+  /*Whether or not the loop filter is enabled.*/
+  int                 loop_filter;
+};
+
+
+
+/*Statistics used to estimate R-D cost of a block in a given coding mode.
+  See modedec.h for more details.*/
+struct oc_mode_rd{
+  /*The expected bits used by the DCT tokens, shifted by OC_BIT_SCALE.*/
+  ogg_int16_t rate;
+  /*The expected square root of the sum of squared errors, shifted by
+     OC_RMSE_SCALE.*/
+  ogg_int16_t rmse;
+};
+
+# if defined(OC_COLLECT_METRICS)
+#  include "collect.h"
+# endif
+
+
+
+/*A 2nd order low-pass Bessel follower.
+  We use this for rate control because it has fast reaction time, but is
+   critically damped.*/
+struct oc_iir_filter{
+  ogg_int32_t c[2];
+  ogg_int64_t g;
+  ogg_int32_t x[2];
+  ogg_int32_t y[2];
+};
+
+
+
+/*The 2-pass metrics associated with a single frame.*/
+struct oc_frame_metrics{
+  /*The log base 2 of the scale factor for this frame in Q24 format.*/
+  ogg_int32_t   log_scale;
+  /*The number of application-requested duplicates of this frame.*/
+  unsigned      dup_count:31;
+  /*The frame type from pass 1.*/
+  unsigned      frame_type:1;
+  /*The frame activity average from pass 1.*/
+  unsigned      activity_avg;
+};
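Here "Q24" just means 24 fractional bits: a linear scale factor s corresponds to log_scale = log2(s)*2^24, rounded. A hypothetical helper, purely to illustrate the fixed-point convention and not the encoder's actual conversion (which lives in the rate-control code and may round differently):

    #include <math.h>
    #include <stdio.h>

    /*Q24: 24 fractional bits, i.e. log_scale = log2(scale) scaled by 1<<24.*/
    static long log_scale_q24(double scale){
      return lround(log2(scale)*(1<<24));
    }

    int main(void){
      /*A scale factor of 2.0 has log2 equal to 1.0: exactly 1<<24 in Q24.*/
      printf("%ld\n",log_scale_q24(2.0));
      return 0;
    }

(Link with -lm.)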
+
+
+
+/*Rate control state information.*/
+struct oc_rc_state{
+  /*The target average bits per frame.*/
+  ogg_int64_t        bits_per_frame;
+  /*The current buffer fullness (bits available to be used).*/
+  ogg_int64_t        fullness;
+  /*The target buffer fullness.
+    This is where we'd like to be by the last keyframe that appears in the next
+     buf_delay frames.*/
+  ogg_int64_t        target;
+  /*The maximum buffer fullness (total size of the buffer).*/
+  ogg_int64_t        max;
+  /*The log of the number of pixels in a frame in Q57 format.*/
+  ogg_int64_t        log_npixels;
+  /*The exponent used in the rate model in Q8 format.*/
+  unsigned           exp[2];
+  /*The number of frames to distribute the buffer usage over.*/
+  int                buf_delay;
+  /*The total drop count from the previous frame.
+    This includes duplicates explicitly requested via the
+     TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
+  ogg_uint32_t       prev_drop_count;
+  /*The log of an estimated scale factor used to obtain the real framerate, for
+     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
+  ogg_int64_t        log_drop_scale;
+  /*The log of estimated scale factor for the rate model in Q57 format.*/
+  ogg_int64_t        log_scale[2];
+  /*The log of the target quantizer level in Q57 format.*/
+  ogg_int64_t        log_qtarget;
+  /*Will we drop frames to meet bitrate target?*/
+  unsigned char      drop_frames;
+  /*Do we respect the maximum buffer fullness?*/
+  unsigned char      cap_overflow;
+  /*Can the reservoir go negative?*/
+  unsigned char      cap_underflow;
+  /*Second-order lowpass filters to track scale and VFR.*/
+  oc_iir_filter      scalefilter[2];
+  int                inter_count;
+  int                inter_delay;
+  int                inter_delay_target;
+  oc_iir_filter      vfrfilter;
+  /*Two-pass mode state.
+    0 => 1-pass encoding.
+    1 => 1st pass of 2-pass encoding.
+    2 => 2nd pass of 2-pass encoding.*/
+  int                twopass;
+  /*Buffer for current frame metrics.*/
+  unsigned char      twopass_buffer[48];
+  /*The number of bytes in the frame metrics buffer.
+    When 2-pass encoding is enabled, this is set to 0 after each frame is
+     submitted, and must be non-zero before the next frame will be accepted.*/
+  int                twopass_buffer_bytes;
+  int                twopass_buffer_fill;
+  /*Whether or not to force the next frame to be a keyframe.*/
+  unsigned char      twopass_force_kf;
+  /*The metrics for the previous frame.*/
+  oc_frame_metrics   prev_metrics;
+  /*The metrics for the current frame.*/
+  oc_frame_metrics   cur_metrics;
+  /*The buffered metrics for future frames.*/
+  oc_frame_metrics  *frame_metrics;
+  int                nframe_metrics;
+  int                cframe_metrics;
+  /*The index of the current frame in the circular metric buffer.*/
+  int                frame_metrics_head;
+  /*The frame count of each type (keyframes, delta frames, and dup frames);
+     32 bits limits us to 2.268 years at 60 fps.*/
+  ogg_uint32_t       frames_total[3];
+  /*The number of frames of each type yet to be processed.*/
+  ogg_uint32_t       frames_left[3];
+  /*The sum of the scale values for each frame type.*/
+  ogg_int64_t        scale_sum[2];
+  /*The start of the window over which the current scale sums are taken.*/
+  int                scale_window0;
+  /*The end of the window over which the current scale sums are taken.*/
+  int                scale_window_end;
+  /*The frame count of each type in the current 2-pass window; this does not
+     include dup frames.*/
+  int                nframes[3];
+  /*The total accumulated estimation bias.*/
+  ogg_int64_t        rate_bias;
+};
+
+
+void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
+void oc_rc_state_clear(oc_rc_state *_rc);
+
+void oc_enc_rc_resize(oc_enc_ctx *_enc);
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
+void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
+int oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial,int _droppable);
+int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
+int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
+
+
+
+/*The internal encoder state.*/
+struct th_enc_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state          state;
+  /*Buffer in which to assemble packets.*/
+  oggpack_buffer           opb;
+  /*Encoder-specific macroblock information.*/
+  oc_mb_enc_info          *mb_info;
+  /*DC coefficients after prediction.*/
+  ogg_int16_t             *frag_dc;
+  /*The list of coded macro blocks, in coded order.*/
+  unsigned                *coded_mbis;
+  /*The number of coded macro blocks.*/
+  size_t                   ncoded_mbis;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and becomes
+     positive when a frame has been processed and data packets are ready.*/
+  int                      packet_state;
+  /*The maximum distance between keyframes.*/
+  ogg_uint32_t             keyframe_frequency_force;
+  /*The number of duplicates to produce for the next frame.*/
+  ogg_uint32_t             dup_count;
+  /*The number of duplicates remaining to be emitted for the current frame.*/
+  ogg_uint32_t             nqueued_dups;
+  /*The number of duplicates emitted for the last frame.*/
+  ogg_uint32_t             prev_dup_count;
+  /*The current speed level.*/
+  int                      sp_level;
+  /*Whether or not VP3 compatibility mode has been enabled.*/
+  unsigned char            vp3_compatible;
+  /*Whether or not any INTER frames have been coded.*/
+  unsigned char            coded_inter_frame;
+  /*Whether or not previous frame was dropped.*/
+  unsigned char            prevframe_dropped;
+  /*Stores most recently chosen Huffman tables for each frame type, DC and AC
+     coefficients, and luma and chroma tokens.
+    The actual Huffman table used for a given coefficient depends not only on
+     the choice made here, but also its index in the zig-zag ordering.*/
+  unsigned char            huff_idxs[2][2][2];
+  /*Current count of bits used by each MV coding mode.*/
+  size_t                   mv_bits[2];
+  /*The mode scheme chooser for estimating mode coding costs.*/
+  oc_mode_scheme_chooser   chooser;
+  /*Temporary encoder state for the analysis pipeline.*/
+  oc_enc_pipeline_state    pipe;
+  /*The number of vertical super blocks in an MCU.*/
+  int                      mcu_nvsbs;
+  /*The SSD error for skipping each fragment in the current MCU.*/
+  unsigned                *mcu_skip_ssd;
+  /*The masking scale factors for chroma blocks in the current MCU.*/
+  ogg_uint16_t            *mcu_rd_scale;
+  ogg_uint16_t            *mcu_rd_iscale;
+  /*The DCT token lists for each coefficient and each plane.*/
+  unsigned char          **dct_tokens[3];
+  /*The extra bits associated with each DCT token.*/
+  ogg_uint16_t           **extra_bits[3];
+  /*The number of DCT tokens for each coefficient for each plane.*/
+  ptrdiff_t                ndct_tokens[3][64];
+  /*Pending EOB runs for each coefficient for each plane.*/
+  ogg_uint16_t             eob_run[3][64];
+  /*The offset of the first DCT token for each coefficient for each plane.*/
+  unsigned char            dct_token_offs[3][64];
+  /*The last DC coefficient for each plane and reference frame.*/
+  int                      dc_pred_last[3][4];
+#if defined(OC_COLLECT_METRICS)
+  /*Fragment SAD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_sad;
+  /*Fragment SATD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_satd;
+  /*Fragment SSD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_ssd;
+#endif
+  /*The R-D optimization parameter.*/
+  int                      lambda;
+  /*The average block "activity" of the previous frame.*/
+  unsigned                 activity_avg;
+  /*The average MB luma of the previous frame.*/
+  unsigned                 luma_avg;
+  /*The huffman tables in use.*/
+  th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+  /*The quantization parameters in use.*/
+  th_quant_info            qinfo;
+  /*The original DC coefficients saved off from the dequantization tables.*/
+  ogg_uint16_t             dequant_dc[64][3][2];
+  /*Condensed dequantization tables.*/
+  const ogg_uint16_t      *dequant[3][3][2];
+  /*Condensed quantization tables.*/
+  void                    *enquant[3][3][2];
+  /*The full set of quantization tables.*/
+  void                    *enquant_tables[64][3][2];
+  /*Storage for the quantization tables.*/
+  unsigned char           *enquant_table_data;
+  /*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
+    This is used to parameterize the rate control decisions.
+    They are kept in the log domain to simplify later processing.
+    These are DCT domain quantizers, and so are scaled by an additional factor
+     of 4 from the pixel domain.*/
+  ogg_int64_t              log_qavg[2][64];
+  /*The "average" quantizer futher partitioned by color plane.
+    This is used to parameterize mode decision.
+    These are DCT domain quantizers, and so are scaled by an additional factor
+     of 4 from the pixel domain.*/
+  ogg_int16_t              log_plq[64][3][2];
+  /*The R-D scale factors to apply to chroma blocks for a given frame type
+     (INTRA or INTER) and qi value.
+    The first is the "D" modifier (rd_scale), while the second is the "lambda"
+     modifier (rd_iscale).*/
+  ogg_uint16_t             chroma_rd_scale[2][64][2];
+  /*The interpolated mode decision R-D lookup tables for the current
+     quantizers, color plane, and quantization type.*/
+  oc_mode_rd               mode_rd[3][3][2][OC_COMP_BINS];
+  /*The buffer state used to drive rate control.*/
+  oc_rc_state              rc;
+# if defined(OC_ENC_USE_VTABLE)
+  /*Table for encoder acceleration functions.*/
+  oc_enc_opt_vtable        opt_vtable;
+# endif
+  /*Table for encoder data used by accelerated functions.*/
+  oc_enc_opt_data          opt_data;
+};
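The packet_state field above is what drives the public API's behaviour: it starts negative while header packets remain, hits zero once the encoder will accept a frame, and goes positive when data packets are ready. A hedged usage sketch (not part of the library source; error handling omitted, and the 320x240 4:2:0 frame size and quality setting are arbitrary assumptions) showing the corresponding call sequence:

    #include <stdlib.h>
    #include <theora/theoraenc.h>

    int main(void){
      th_info         info;
      th_comment      tc;
      th_enc_ctx     *enc;
      ogg_packet      op;
      th_ycbcr_buffer ycbcr;
      int             pli;
      th_info_init(&info);
      info.frame_width=320;
      info.frame_height=240;
      info.pic_width=320;
      info.pic_height=240;
      info.pic_x=info.pic_y=0;
      info.colorspace=TH_CS_UNSPECIFIED;
      info.pixel_fmt=TH_PF_420;
      info.fps_numerator=30;
      info.fps_denominator=1;
      info.quality=48;
      info.target_bitrate=0;
      enc=th_encode_alloc(&info);
      if(enc==NULL)return 1;
      th_comment_init(&tc);
      /*packet_state<0: flush header packets until none remain.*/
      while(th_encode_flushheader(enc,&tc,&op)>0){/*write op to the stream*/}
      /*packet_state==0: the encoder now accepts frame input (4:2:0 planes).*/
      for(pli=0;pli<3;pli++){
        ycbcr[pli].width=pli?160:320;
        ycbcr[pli].height=pli?120:240;
        ycbcr[pli].stride=ycbcr[pli].width;
        ycbcr[pli].data=calloc(ycbcr[pli].width*ycbcr[pli].height,1);
      }
      th_encode_ycbcr_in(enc,ycbcr);
      /*packet_state>0: drain the data packet(s) for the submitted frame.*/
      while(th_encode_packetout(enc,0,&op)>0){/*write op to the stream*/}
      for(pli=0;pli<3;pli++)free(ycbcr[pli].data);
      th_comment_clear(&tc);
      th_encode_free(enc);
      return 0;
    }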
+
+
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
+
+
+
+/*Perform fullpel motion search for a single MB against both reference frames.*/
+void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
+/*Refine a MB MV for one frame.*/
+void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
+/*Refine the block MVs.*/
+void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
+
+
+
+/*Used to rollback a tokenlog transaction when we retroactively decide to skip
+   a fragment.
+  A checkpoint is taken right before each token is added.*/
+struct oc_token_checkpoint{
+  /*The color plane the token was added to.*/
+  unsigned char pli;
+  /*The zig-zag index the token was added to.*/
+  unsigned char zzi;
+  /*The outstanding EOB run count before the token was added.*/
+  ogg_uint16_t  eob_run;
+  /*The token count before the token was added.*/
+  ptrdiff_t     ndct_tokens;
+};
+
+
+
+void oc_enc_tokenize_start(oc_enc_ctx *_enc);
+int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
+int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
+void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
+ const oc_token_checkpoint *_stack,int _n);
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
+ int _pli,int _fragy0,int _frag_yend);
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1);
+void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
+
+
+
+/*Utility routine to encode one of the header packets.*/
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op);
+
+
+
+/*Default pure-C implementations of encoder-specific accelerated functions.*/
+void oc_enc_accel_init_c(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride);
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_enquant_table_init_c(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif

+ 1836 - 0
modules/theoraplayer/native/theora/lib/encode.c

@@ -0,0 +1,1836 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encode.c 17821 2011-02-09 22:08:34Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+#include "dequant.h"
+
+
+
+/*The default quantization parameters used by VP3.1.*/
+static const int OC_VP31_RANGE_SIZES[1]={63};
+static const th_quant_base OC_VP31_BASES_INTRA_Y[2]={
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  },
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTRA_C[2]={
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTER[2]={
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
+
+const th_quant_info TH_VP31_QUANT_INFO={
+  {
+    220,200,190,180,170,170,160,160,
+    150,150,140,140,130,130,120,120,
+    110,110,100,100, 90, 90, 90, 80,
+     80, 80, 70, 70, 70, 60, 60, 60,
+     60, 50, 50, 50, 50, 40, 40, 40,
+     40, 40, 30, 30, 30, 30, 30, 30,
+     30, 20, 20, 20, 20, 20, 20, 20,
+     20, 10, 10, 10, 10, 10, 10, 10
+  },
+  {
+    500,450,400,370,340,310,285,265,
+    245,225,210,195,185,180,170,160,
+    150,145,135,130,125,115,110,107,
+    100, 96, 93, 89, 85, 82, 75, 74,
+     70, 68, 64, 60, 57, 56, 52, 50,
+     49, 45, 44, 43, 40, 38, 37, 35,
+     33, 32, 30, 29, 28, 25, 24, 22,
+     21, 19, 18, 17, 15, 13, 12, 10
+  },
+  {
+    30,25,20,20,15,15,14,14,
+    13,13,12,12,11,11,10,10,
+     9, 9, 8, 8, 7, 7, 7, 7,
+     6, 6, 6, 6, 5, 5, 5, 5,
+     4, 4, 4, 4, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_Y},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C}
+    },
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER}
+    }
+  }
+};
+
+/*The current default quantization parameters.*/
+static const int OC_DEF_QRANGE_SIZES[3]={32,16,15};
+static const th_quant_base OC_DEF_BASES_INTRA_Y[4]={
+  {
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+  },
+  {
+     15, 12, 12, 15, 18, 20, 20, 21,
+     13, 13, 14, 17, 18, 21, 21, 20,
+     14, 14, 15, 18, 20, 21, 21, 21,
+     14, 16, 17, 19, 20, 21, 21, 21,
+     16, 17, 20, 21, 21, 21, 21, 21,
+     18, 19, 20, 21, 21, 21, 21, 21,
+     20, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21
+  },
+  {
+     16, 12, 11, 16, 20, 25, 27, 28,
+     13, 13, 14, 18, 21, 28, 28, 27,
+     14, 13, 16, 20, 25, 28, 28, 28,
+     14, 16, 19, 22, 27, 29, 29, 28,
+     17, 19, 25, 28, 28, 30, 30, 29,
+     20, 24, 27, 28, 29, 30, 30, 29,
+     27, 28, 29, 29, 30, 30, 30, 30,
+     29, 29, 29, 29, 30, 30, 30, 29
+  },
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  }
+};
+static const th_quant_base OC_DEF_BASES_INTRA_C[4]={
+  {
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19
+  },
+  {
+     18, 18, 21, 25, 26, 26, 26, 26,
+     18, 20, 22, 26, 26, 26, 26, 26,
+     21, 22, 25, 26, 26, 26, 26, 26,
+     25, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26
+  },
+  {
+     17, 18, 22, 31, 36, 36, 36, 36,
+     18, 20, 24, 34, 36, 36, 36, 36,
+     22, 24, 33, 36, 36, 36, 36, 36,
+     31, 34, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_DEF_BASES_INTER[4]={
+  {
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21
+  },
+  {
+     18, 18, 18, 21, 23, 24, 25, 27,
+     18, 18, 21, 23, 24, 25, 27, 28,
+     18, 21, 23, 24, 25, 27, 28, 29,
+     21, 23, 24, 25, 27, 28, 29, 29,
+     23, 24, 25, 27, 28, 29, 29, 29,
+     24, 25, 27, 28, 29, 29, 29, 30,
+     25, 27, 28, 29, 29, 29, 30, 30,
+     27, 28, 29, 29, 29, 30, 30, 30
+  },
+  {
+     17, 17, 17, 20, 23, 26, 28, 32,
+     17, 17, 20, 23, 26, 28, 32, 34,
+     17, 20, 23, 26, 28, 32, 34, 37,
+     20, 23, 26, 28, 32, 34, 37, 37,
+     23, 26, 28, 32, 34, 37, 37, 37,
+     26, 28, 32, 34, 37, 37, 37, 41,
+     28, 32, 34, 37, 37, 37, 41, 42,
+     32, 34, 37, 37, 37, 41, 42, 42
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
+
+const th_quant_info TH_DEF_QUANT_INFO={
+  {
+    365,348,333,316,300,287,277,265,
+    252,240,229,219,206,197,189,180,
+    171,168,160,153,146,139,132,127,
+    121,115,110,107,101, 97, 94, 89,
+     85, 83, 78, 73, 72, 67, 66, 62,
+     60, 59, 56, 53, 52, 48, 47, 43,
+     42, 40, 36, 35, 34, 33, 31, 30,
+     28, 25, 24, 22, 20, 17, 14, 10
+  },
+  {
+    365,348,333,316,300,287,277,265,
+    252,240,229,219,206,197,189,180,
+    171,168,160,153,146,139,132,127,
+    121,115,110,107,101, 97, 94, 89,
+     85, 83, 78, 73, 72, 67, 66, 62,
+     60, 59, 56, 53, 52, 48, 47, 43,
+     42, 40, 36, 35, 34, 33, 31, 30,
+     28, 25, 24, 22, 20, 17, 14, 10
+  },
+  {
+    15,12, 9, 8, 6, 6, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5,
+     4, 4, 4, 4, 4, 4, 3, 3,
+     3, 3, 3, 3, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_Y},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_C},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_C}
+    },
+    {
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER}
+    }
+  }
+};
+
+
+
+/*The Huffman codes used for macro block modes.*/
+
+const unsigned char OC_MODE_BITS[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {1,2,3,4,5,6,7,7},
+  /*Codebook 1: a fixed-length code.*/
+  {3,3,3,3,3,3,3,3}
+};
+
+static const unsigned char OC_MODE_CODES[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {0x00,0x02,0x06,0x0E,0x1E,0x3E,0x7E,0x7F},
+  /*Codebook 1: a fixed-length code.*/
+  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07}
+};
+
+
+/*The Huffman codes used for motion vectors.*/
+
+const unsigned char OC_MV_BITS[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+      8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,7,7,7,7,7,7,7,7,6,6,6,6,4,4,3,
+    3,
+    3,4,4,6,6,6,6,7,7,7,7,7,7,7,7,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).
+    This wastes a code word (0x01, negative zero), or a bit (0x00, positive
+     zero, requires only 5 bits to uniquely decode), but is hopefully not used
+     very often.*/
+  {
+      6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+  }
+};
+
+static const unsigned char OC_MV_CODES[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+         0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF3,
+    0xF1,0xEF,0xED,0xEB,0xE9,0xE7,0xE5,0xE3,
+    0xE1,0x6F,0x6D,0x6B,0x69,0x67,0x65,0x63,
+    0x61,0x2F,0x2D,0x2B,0x29,0x09,0x07,0x02,
+    0x00,
+    0x01,0x06,0x08,0x28,0x2A,0x2C,0x2E,0x60,
+    0x62,0x64,0x66,0x68,0x6A,0x6C,0x6E,0xE0,
+    0xE2,0xE4,0xE6,0xE8,0xEA,0xEC,0xEE,0xF0,
+    0xF2,0xF4,0xF6,0xF8,0xFA,0xFC,0xFE
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).*/
+  {
+         0x3F,0x3D,0x3B,0x39,0x37,0x35,0x33,
+    0x31,0x2F,0x2D,0x2B,0x29,0x27,0x25,0x23,
+    0x21,0x1F,0x1D,0x1B,0x19,0x17,0x15,0x13,
+    0x11,0x0F,0x0D,0x0B,0x09,0x07,0x05,0x03,
+    0x00,
+    0x02,0x04,0x06,0x08,0x0A,0x0C,0x0E,0x10,
+    0x12,0x14,0x16,0x18,0x1A,0x1C,0x1E,0x20,
+    0x22,0x24,0x26,0x28,0x2A,0x2C,0x2E,0x30,
+    0x32,0x34,0x36,0x38,0x3A,0x3C,0x3E
+  }
+};
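A small illustration of codebook 1 above (not part of the library source): since it is literally a 5-bit magnitude followed by a sign bit, the table entry for a component value in [-31,31] can be reproduced arithmetically and checked against OC_MV_CODES[1]:

    #include <assert.h>
    #include <stdlib.h>

    /*Codebook 1 entry for an MV component dv in [-31,31]:
       5-bit magnitude, then a sign bit that is set for negative values.*/
    static int mv_code1(int dv){
      return abs(dv)<<1|(dv<0);
    }

    int main(void){
      /*Matches OC_MV_CODES[1][dv+31]: -3 -> 0x07, +3 -> 0x06, 0 -> 0x00.*/
      assert(mv_code1(-3)==0x07);
      assert(mv_code1(3)==0x06);
      assert(mv_code1(0)==0x00);
      return 0;
    }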
+
+
+
+/*Super block run coding scheme:
+   Codeword             Run Length
+   0                       1
+   10x                     2-3
+   110x                    4-5
+   1110xx                  6-9
+   11110xxx                10-17
+   111110xxxx              18-33
+   111111xxxxxxxxxxxx      34-4129*/
+const ogg_uint16_t    OC_SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
+static const unsigned OC_SB_RUN_CODE_PREFIX[7]={
+  0,4,0xC,0x38,0xF0,0x3E0,0x3F000
+};
+const unsigned char   OC_SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};
+
+
+/*Writes the bit pattern for the run length of a super block run to the given
+   oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run, which must be positive.
+  _flag:      The current flag.
+  _done:      Whether or not more flags are to be encoded.*/
+static void oc_sb_run_pack(oggpack_buffer *_opb,ptrdiff_t _run_count,
+ int _flag,int _done){
+  int i;
+  if(_run_count>=4129){
+    do{
+      oggpackB_write(_opb,0x3FFFF,18);
+      _run_count-=4129;
+      if(_run_count>0)oggpackB_write(_opb,_flag,1);
+      else if(!_done)oggpackB_write(_opb,!_flag,1);
+    }
+    while(_run_count>=4129);
+    if(_run_count<=0)return;
+  }
+  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
+  oggpackB_write(_opb,OC_SB_RUN_CODE_PREFIX[i]+_run_count-OC_SB_RUN_VAL_MIN[i],
+   OC_SB_RUN_CODE_NBITS[i]);
+}
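A worked example of the run coding above: a run of 7 super blocks falls in the 6-9 bucket, so oc_sb_run_pack() writes OC_SB_RUN_CODE_PREFIX[3]+7-6 = 0x39 in 6 bits, i.e. the prefix 1110 followed by the 2-bit offset 01. The lookup can be reproduced standalone (illustrative only, using local copies of the tables above; runs longer than 4129 are split by oc_sb_run_pack() itself):

    #include <assert.h>

    static const unsigned short SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
    static const unsigned SB_RUN_CODE_PREFIX[7]={0,4,0xC,0x38,0xF0,0x3E0,0x3F000};
    static const unsigned char SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};

    /*Returns the codeword for a single run of 1..4129 super blocks and
       stores its length in *nbits.*/
    static unsigned sb_run_code(int run_count,int *nbits){
      int i;
      for(i=0;run_count>=SB_RUN_VAL_MIN[i+1];i++);
      *nbits=SB_RUN_CODE_NBITS[i];
      return SB_RUN_CODE_PREFIX[i]+run_count-SB_RUN_VAL_MIN[i];
    }

    int main(void){
      int nbits;
      assert(sb_run_code(1,&nbits)==0x00&&nbits==1);     /*"0"*/
      assert(sb_run_code(7,&nbits)==0x39&&nbits==6);     /*"111001"*/
      assert(sb_run_code(4129,&nbits)==0x3FFFF&&nbits==18);
      return 0;
    }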
+
+
+
+/*Block run coding scheme:
+   Codeword             Run Length
+   0x                      1-2
+   10x                     3-4
+   110x                    5-6
+   1110xx                  7-10
+   11110xx                 11-14
+   11111xxxx               15-30*/
+const unsigned char OC_BLOCK_RUN_CODE_NBITS[30]={
+  2,2,3,3,4,4,6,6,6,6,7,7,7,7,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
+};
+static const ogg_uint16_t  OC_BLOCK_RUN_CODE_PATTERN[30]={
+        0x000,0x001,0x004,0x005,0x00C,0x00D,0x038,
+  0x039,0x03A,0x03B,0x078,0x079,0x07A,0x07B,0x1F0,
+  0x1F1,0x1F2,0x1F3,0x1F4,0x1F5,0x1F6,0x1F7,0x1F8,
+  0x1F9,0x1FA,0x1FB,0x1FC,0x1FD,0x1FE,0x1FF
+};
+
+
+/*Writes the bit pattern for the run length of a block run to the given
+   oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run.
+              This must be positive, and no more than 30.*/
+static void oc_block_run_pack(oggpack_buffer *_opb,int _run_count){
+  oggpackB_write(_opb,OC_BLOCK_RUN_CODE_PATTERN[_run_count-1],
+   OC_BLOCK_RUN_CODE_NBITS[_run_count-1]);
+}
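Analogously to the super block runs above, a block run of 8 falls in the 7-10 bucket, so oc_block_run_pack() writes OC_BLOCK_RUN_CODE_PATTERN[7] = 0x039 in OC_BLOCK_RUN_CODE_NBITS[7] = 6 bits: the prefix 1110 followed by the 2-bit offset 01.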
+
+
+
+static void oc_enc_frame_header_pack(oc_enc_ctx *_enc){
+  /*Mark this as a data packet.*/
+  oggpackB_write(&_enc->opb,0,1);
+  /*Output the frame type (key frame or delta frame).*/
+  oggpackB_write(&_enc->opb,_enc->state.frame_type,1);
+  /*Write out the current qi list.*/
+  oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
+  if(_enc->state.nqis>1){
+    oggpackB_write(&_enc->opb,1,1);
+    oggpackB_write(&_enc->opb,_enc->state.qis[1],6);
+    if(_enc->state.nqis>2){
+      oggpackB_write(&_enc->opb,1,1);
+      oggpackB_write(&_enc->opb,_enc->state.qis[2],6);
+    }
+    else oggpackB_write(&_enc->opb,0,1);
+  }
+  else oggpackB_write(&_enc->opb,0,1);
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    /*Key frames have 3 unused configuration bits, holdovers from the VP3 days.
+      Most of the other unused bits in the VP3 headers were eliminated.
+      Monty kept these to leave us some wiggle room for future expansion,
+       though a single bit in all frames would have been far more useful.*/
+    oggpackB_write(&_enc->opb,0,3);
+  }
+}
+
+/*Writes the bit flags for whether or not each super block is partially
+   coded.
+  These flags are run-length encoded, with the flag value alternating between
+   each run.
+  Return: The number of partially coded SBs.*/
+static unsigned oc_enc_partial_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  flag=sb_flags[0].coded_partially;
+  oggpackB_write(&_enc->opb,flag,1);
+  sbi=npartial=0;
+  do{
+    unsigned run_count;
+    for(run_count=0;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially!=flag)break;
+      run_count++;
+      npartial+=flag;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+  return npartial;
+}
+
+/*Writes the coded/not coded flags for each super block that is not partially
+   coded.
+  These flags are run-length encoded, with the flag value alternating between
+   each run.*/
+static void oc_enc_coded_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  /*Skip partially coded super blocks; their flags have already been coded.*/
+  for(sbi=0;sb_flags[sbi].coded_partially;sbi++);
+  flag=sb_flags[sbi].coded_fully;
+  oggpackB_write(&_enc->opb,flag,1);
+  do{
+    unsigned run_count;
+    for(run_count=0;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially)continue;
+      if(sb_flags[sbi].coded_fully!=flag)break;
+      run_count++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+}
+
+static void oc_enc_coded_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  const oc_fragment *frags;
+  unsigned           npartial;
+  int                run_count;
+  int                flag;
+  int                pli;
+  unsigned           sbi;
+  npartial=oc_enc_partial_sb_flags_pack(_enc);
+  if(npartial<_enc->state.nsbs)oc_enc_coded_sb_flags_pack(_enc);
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  frags=_enc->state.frags;
+  for(sbi=0;sbi<nsbs&&!sb_flags[sbi].coded_partially;sbi++);
+  /*If there's at least one partial SB, store individual coded block flags.*/
+  if(sbi<nsbs){
+    flag=frags[sb_maps[sbi][0][0]].coded;
+    oggpackB_write(&_enc->opb,flag,1);
+    run_count=0;
+    nsbs=sbi=0;
+    for(pli=0;pli<3;pli++){
+      nsbs+=_enc->state.fplanes[pli].nsbs;
+      for(;sbi<nsbs;sbi++){
+        int       quadi;
+        int       bi;
+        ptrdiff_t fragi;
+        if(sb_flags[sbi].coded_partially){
+          for(quadi=0;quadi<4;quadi++){
+            for(bi=0;bi<4;bi++){
+              fragi=sb_maps[sbi][quadi][bi];
+              if(fragi>=0){
+                if(frags[fragi].coded!=flag){
+                  oc_block_run_pack(&_enc->opb,run_count);
+                  flag=!flag;
+                  run_count=1;
+                }
+                else run_count++;
+              }
+            }
+          }
+        }
+      }
+    }
+    /*Flush any trailing block coded run.*/
+    if(run_count>0)oc_block_run_pack(&_enc->opb,run_count);
+  }
+}
+
+static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){
+  const unsigned char *mode_codes;
+  const unsigned char *mode_bits;
+  const unsigned char *mode_ranks;
+  unsigned            *coded_mbis;
+  size_t               ncoded_mbis;
+  const signed char   *mb_modes;
+  unsigned             mbii;
+  int                  scheme;
+  int                  mb_mode;
+  scheme=_enc->chooser.scheme_list[0];
+  /*Encode the best scheme.*/
+  oggpackB_write(&_enc->opb,scheme,3);
+  /*If the chosen scheme is scheme 0, send the mode frequency ordering.*/
+  if(scheme==0){
+    for(mb_mode=0;mb_mode<OC_NMODES;mb_mode++){
+      oggpackB_write(&_enc->opb,_enc->chooser.scheme0_ranks[mb_mode],3);
+    }
+  }
+  mode_ranks=_enc->chooser.mode_ranks[scheme];
+  mode_bits=OC_MODE_BITS[scheme+1>>3];
+  mode_codes=OC_MODE_CODES[scheme+1>>3];
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  mb_modes=_enc->state.mb_modes;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    int rank;
+    rank=mode_ranks[mb_modes[coded_mbis[mbii]]];
+    oggpackB_write(&_enc->opb,mode_codes[rank],mode_bits[rank]);
+  }
+}
+
+static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,oc_mv _mv){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
+  oggpackB_write(&_enc->opb,
+   OC_MV_CODES[_mv_scheme][dx+31],OC_MV_BITS[_mv_scheme][dx+31]);
+  oggpackB_write(&_enc->opb,
+   OC_MV_CODES[_mv_scheme][dy+31],OC_MV_BITS[_mv_scheme][dy+31]);
+}
+
+static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
+  const unsigned     *coded_mbis;
+  size_t              ncoded_mbis;
+  const oc_mb_map    *mb_maps;
+  const signed char  *mb_modes;
+  const oc_fragment  *frags;
+  const oc_mv        *frag_mvs;
+  unsigned            mbii;
+  int                 mv_scheme;
+  /*Choose the coding scheme.*/
+  mv_scheme=_enc->mv_bits[1]<_enc->mv_bits[0];
+  oggpackB_write(&_enc->opb,mv_scheme,1);
+  /*Encode the motion vectors.
+    Macro blocks are iterated in Hilbert scan order, but the MVs within the
+     macro block are coded in raster order.*/
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  frags=_enc->state.frags;
+  frag_mvs=_enc->state.frag_mvs;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    ptrdiff_t fragi;
+    unsigned  mbi;
+    int       bi;
+    mbi=coded_mbis[mbii];
+    switch(mb_modes[mbi]){
+      case OC_MODE_INTER_MV:
+      case OC_MODE_GOLDEN_MV:{
+        for(bi=0;;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
+            /*Only code a single MV for this macro block.*/
+            break;
+          }
+        }
+      }break;
+      case OC_MODE_INTER_MV_FOUR:{
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
+            /*Keep coding all the MVs for this macro block.*/
+          }
+        }
+      }break;
+    }
+  }
+}
+
+static void oc_enc_block_qis_pack(oc_enc_ctx *_enc){
+  const oc_fragment *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  ptrdiff_t          run_count;
+  ptrdiff_t          nqi0;
+  int                flag;
+  if(_enc->state.nqis<=1)return;
+  ncoded_fragis=_enc->state.ntotal_coded_fragis;
+  if(ncoded_fragis<=0)return;
+  coded_fragis=_enc->state.coded_fragis;
+  frags=_enc->state.frags;
+  flag=!!frags[coded_fragis[0]].qii;
+  oggpackB_write(&_enc->opb,flag,1);
+  nqi0=0;
+  for(fragii=0;fragii<ncoded_fragis;){
+    for(run_count=0;fragii<ncoded_fragis;fragii++){
+      if(!!frags[coded_fragis[fragii]].qii!=flag)break;
+      run_count++;
+      nqi0+=!flag;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
+  }
+  if(_enc->state.nqis<3||nqi0>=ncoded_fragis)return;
+  for(fragii=0;!frags[coded_fragis[fragii]].qii;fragii++);
+  flag=frags[coded_fragis[fragii]].qii-1;
+  oggpackB_write(&_enc->opb,flag,1);
+  while(fragii<ncoded_fragis){
+    for(run_count=0;fragii<ncoded_fragis;fragii++){
+      int qii;
+      qii=frags[coded_fragis[fragii]].qii;
+      if(!qii)continue;
+      if(qii-1!=flag)break;
+      run_count++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
+  }
+}
+
+/*Counts the tokens of each type used for the given range of coefficient
+   indices in zig-zag order.
+  _zzi_start:      The first zig-zag index to include.
+  _zzi_end:        The first zig-zag index to not include.
+  _token_counts_y: Returns the token counts for the Y' plane.
+  _token_counts_c: Returns the token counts for the Cb and Cr planes.*/
+static void oc_enc_count_tokens(oc_enc_ctx *_enc,int _zzi_start,int _zzi_end,
+ ptrdiff_t _token_counts_y[32],ptrdiff_t _token_counts_c[32]){
+  const unsigned char *dct_tokens;
+  ptrdiff_t            ndct_tokens;
+  int                  pli;
+  int                  zzi;
+  ptrdiff_t            ti;
+  memset(_token_counts_y,0,32*sizeof(*_token_counts_y));
+  memset(_token_counts_c,0,32*sizeof(*_token_counts_c));
+  for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+    dct_tokens=_enc->dct_tokens[0][zzi];
+    ndct_tokens=_enc->ndct_tokens[0][zzi];
+    for(ti=_enc->dct_token_offs[0][zzi];ti<ndct_tokens;ti++){
+      _token_counts_y[dct_tokens[ti]]++;
+    }
+  }
+  for(pli=1;pli<3;pli++){
+    for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+      dct_tokens=_enc->dct_tokens[pli][zzi];
+      ndct_tokens=_enc->ndct_tokens[pli][zzi];
+      for(ti=_enc->dct_token_offs[pli][zzi];ti<ndct_tokens;ti++){
+        _token_counts_c[dct_tokens[ti]]++;
+      }
+    }
+  }
+}
+
+/*Computes the number of bits used for each of the potential Huffman codes for
+   the given list of token counts.
+  The bits are added to whatever the current bit counts are.*/
+static void oc_enc_count_bits(oc_enc_ctx *_enc,int _hgi,
+ const ptrdiff_t _token_counts[32],size_t _bit_counts[16]){
+  int huffi;
+  int huff_offs;
+  int token;
+  huff_offs=_hgi<<4;
+  for(huffi=0;huffi<16;huffi++){
+    for(token=0;token<32;token++){
+      _bit_counts[huffi]+=
+       _token_counts[token]*_enc->huff_codes[huffi+huff_offs][token].nbits;
+    }
+  }
+}
+
+/*Returns the Huffman index using the fewest bits.*/
+static int oc_select_huff_idx(size_t _bit_counts[16]){
+  int best_huffi;
+  int huffi;
+  best_huffi=0;
+  for(huffi=1;huffi<16;huffi++)if(_bit_counts[huffi]<_bit_counts[best_huffi]){
+    best_huffi=huffi;
+  }
+  return best_huffi;
+}
+
+static void oc_enc_huff_group_pack(oc_enc_ctx *_enc,
+ int _zzi_start,int _zzi_end,const int _huff_idxs[2]){
+  int zzi;
+  for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+    int pli;
+    for(pli=0;pli<3;pli++){
+      const unsigned char *dct_tokens;
+      const ogg_uint16_t  *extra_bits;
+      ptrdiff_t            ndct_tokens;
+      const th_huff_code  *huff_codes;
+      ptrdiff_t            ti;
+      dct_tokens=_enc->dct_tokens[pli][zzi];
+      extra_bits=_enc->extra_bits[pli][zzi];
+      ndct_tokens=_enc->ndct_tokens[pli][zzi];
+      huff_codes=_enc->huff_codes[_huff_idxs[pli+1>>1]];
+      for(ti=_enc->dct_token_offs[pli][zzi];ti<ndct_tokens;ti++){
+        int token;
+        int neb;
+        token=dct_tokens[ti];
+        oggpackB_write(&_enc->opb,huff_codes[token].pattern,
+         huff_codes[token].nbits);
+        neb=OC_DCT_TOKEN_EXTRA_BITS[token];
+        if(neb)oggpackB_write(&_enc->opb,extra_bits[ti],neb);
+      }
+    }
+  }
+}
+
+static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){
+  static const unsigned char  OC_HUFF_GROUP_MIN[6]={0,1,6,15,28,64};
+  static const unsigned char *OC_HUFF_GROUP_MAX=OC_HUFF_GROUP_MIN+1;
+  ptrdiff_t token_counts_y[32];
+  ptrdiff_t token_counts_c[32];
+  size_t    bits_y[16];
+  size_t    bits_c[16];
+  int       huff_idxs[2];
+  int       frame_type;
+  int       hgi;
+  frame_type=_enc->state.frame_type;
+  /*Choose which Huffman tables to use for the DC token list.*/
+  oc_enc_count_tokens(_enc,0,1,token_counts_y,token_counts_c);
+  memset(bits_y,0,sizeof(bits_y));
+  memset(bits_c,0,sizeof(bits_c));
+  oc_enc_count_bits(_enc,0,token_counts_y,bits_y);
+  oc_enc_count_bits(_enc,0,token_counts_c,bits_c);
+  huff_idxs[0]=oc_select_huff_idx(bits_y);
+  huff_idxs[1]=oc_select_huff_idx(bits_c);
+  /*Write the DC token list with the chosen tables.*/
+  oggpackB_write(&_enc->opb,huff_idxs[0],4);
+  oggpackB_write(&_enc->opb,huff_idxs[1],4);
+  _enc->huff_idxs[frame_type][0][0]=(unsigned char)huff_idxs[0];
+  _enc->huff_idxs[frame_type][0][1]=(unsigned char)huff_idxs[1];
+  oc_enc_huff_group_pack(_enc,0,1,huff_idxs);
+  /*Choose which Huffman tables to use for the AC token lists.*/
+  memset(bits_y,0,sizeof(bits_y));
+  memset(bits_c,0,sizeof(bits_c));
+  for(hgi=1;hgi<5;hgi++){
+    oc_enc_count_tokens(_enc,OC_HUFF_GROUP_MIN[hgi],OC_HUFF_GROUP_MAX[hgi],
+     token_counts_y,token_counts_c);
+    oc_enc_count_bits(_enc,hgi,token_counts_y,bits_y);
+    oc_enc_count_bits(_enc,hgi,token_counts_c,bits_c);
+  }
+  huff_idxs[0]=oc_select_huff_idx(bits_y);
+  huff_idxs[1]=oc_select_huff_idx(bits_c);
+  /*Write the AC token lists using the chosen tables.*/
+  oggpackB_write(&_enc->opb,huff_idxs[0],4);
+  oggpackB_write(&_enc->opb,huff_idxs[1],4);
+  _enc->huff_idxs[frame_type][1][0]=(unsigned char)huff_idxs[0];
+  _enc->huff_idxs[frame_type][1][1]=(unsigned char)huff_idxs[1];
+  for(hgi=1;hgi<5;hgi++){
+    huff_idxs[0]+=16;
+    huff_idxs[1]+=16;
+    oc_enc_huff_group_pack(_enc,
+     OC_HUFF_GROUP_MIN[hgi],OC_HUFF_GROUP_MAX[hgi],huff_idxs);
+  }
+}
+
+/*Packs an explicit drop frame, instead of using the more efficient 0-byte
+   packet.
+  This is only enabled in VP3-compatibility mode, even though it is not
+   strictly required for VP3 compatibility (VP3 could be encoded in AVI, which
+   also supports dropping frames by inserting 0 byte packets).
+  However, almost every _Theora_ player used to get this wrong (and many still
+   do), and it wasn't until we started shipping a post-VP3 encoder that
+   actually used non-VP3 features that this began to be discovered and fixed,
+   despite being in the standard since 2004.
+  The pack buffer must be reset before calling this function.*/
+static void oc_enc_drop_frame_pack(oc_enc_ctx *_enc){
+  unsigned nsbs;
+  /*Mark this as a data packet.*/
+  oggpackB_write(&_enc->opb,0,1);
+  /*Output the frame type (key frame or delta frame).*/
+  oggpackB_write(&_enc->opb,OC_INTER_FRAME,1);
+  /*Write out the current qi list.
+    We always use just 1 qi, to avoid wasting bits on the others.*/
+  oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
+  oggpackB_write(&_enc->opb,0,1);
+  /*Coded block flags: everything is uncoded.*/
+  nsbs=_enc->state.nsbs;
+  /*No partially coded SBs.*/
+  oggpackB_write(&_enc->opb,0,1);
+  oc_sb_run_pack(&_enc->opb,nsbs,0,1);
+  /*No fully coded SBs.*/
+  oggpackB_write(&_enc->opb,0,1);
+  oc_sb_run_pack(&_enc->opb,nsbs,0,1);
+  /*MB modes: just need to write which scheme to use.
+    Since we have no coded MBs, we can pick any of them except 0, which would
+     require writing out an additional mode list.*/
+  oggpackB_write(&_enc->opb,7,3);
+  /*MVs: just need to write which scheme to use.
+    We can pick either one, since we have no MVs.*/
+  oggpackB_write(&_enc->opb,1,1);
+  /*Write the chosen DC token tables.*/
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][0],4);
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][1],4);
+  /*Write the chosen AC token tables.*/
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][0],4);
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][1],4);
+}
+
+static void oc_enc_frame_pack(oc_enc_ctx *_enc){
+  oggpackB_reset(&_enc->opb);
+  /*Only proceed if we have some coded blocks.*/
+  if(_enc->state.ntotal_coded_fragis>0){
+    oc_enc_frame_header_pack(_enc);
+    if(_enc->state.frame_type==OC_INTER_FRAME){
+      /*Coded block flags, MB modes, and MVs are only needed for delta frames.*/
+      oc_enc_coded_flags_pack(_enc);
+      oc_enc_mb_modes_pack(_enc);
+      oc_enc_mvs_pack(_enc);
+    }
+    oc_enc_block_qis_pack(_enc);
+    oc_enc_tokenize_finish(_enc);
+    oc_enc_residual_tokens_pack(_enc);
+  }
+  /*If there are no coded blocks, we can drop this frame simply by emitting a
+     0 byte packet.
+    We emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+  else if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
+  /*Success: Mark the packet as ready to be flushed.*/
+  _enc->packet_state=OC_PACKET_READY;
+#if defined(OC_COLLECT_METRICS)
+  oc_enc_mode_metrics_collect(_enc);
+#endif
+}
+
+
+void oc_enc_accel_init_c(oc_enc_ctx *_enc){
+  /*The implementations prefixed with oc_enc_ are encoder-specific.
+    The rest we re-use from the decoder.*/
+# if defined(OC_ENC_USE_VTABLE)
+  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
+  _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
+  _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
+  _enc->opt_vtable.frag_intra_sad=oc_enc_frag_intra_sad_c;
+  _enc->opt_vtable.frag_satd=oc_enc_frag_satd_c;
+  _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c;
+  _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
+  _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_c;
+  _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_c;
+  _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
+  _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c;
+  _enc->opt_vtable.quantize=oc_enc_quantize_c;
+  _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
+# endif
+  _enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
+  _enc->opt_data.enquant_table_alignment=16;
+}
+
+/*Initialize the macro block neighbor lists for MC analysis.
+  This assumes that the entire mb_info memory region has been initialized with
+   zeros.*/
+static void oc_enc_mb_info_init(oc_enc_ctx *_enc){
+  oc_mb_enc_info    *embs;
+  const signed char *mb_modes;
+  unsigned           nhsbs;
+  unsigned           nvsbs;
+  unsigned           nhmbs;
+  unsigned           nvmbs;
+  unsigned           sby;
+  mb_modes=_enc->state.mb_modes;
+  embs=_enc->mb_info;
+  nhsbs=_enc->state.fplanes[0].nhsbs;
+  nvsbs=_enc->state.fplanes[0].nvsbs;
+  nhmbs=_enc->state.nhmbs;
+  nvmbs=_enc->state.nvmbs;
+  for(sby=0;sby<nvsbs;sby++){
+    unsigned sbx;
+    for(sbx=0;sbx<nhsbs;sbx++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++){
+        /*Because of the Hilbert curve ordering the macro blocks are
+           visited in, the available neighbors change depending on where in
+           a super block the macro block is located.
+          Only the first three vectors are used in the median calculation
+           for the optimal predictor, and so the most important should be
+           listed first.
+          Additional vectors are used, so there will always be at least 3,
+           except for in the upper-left most macro block.*/
+        /*The number of current neighbors for each macro block position.*/
+        static const unsigned char NCNEIGHBORS[4]={4,3,2,4};
+        /*The offset of each current neighbor in the X direction.*/
+        static const signed char   CDX[4][4]={
+          {-1,0,1,-1},
+          {-1,0,-1,},
+          {-1,-1},
+          {-1,0,0,1}
+        };
+        /*The offset of each current neighbor in the Y direction.*/
+        static const signed char   CDY[4][4]={
+          {0,-1,-1,-1},
+          {0,-1,-1},
+          {0,-1},
+          {0,-1,1,-1}
+        };
+        /*The offset of each previous neighbor in the X direction.*/
+        static const signed char   PDX[4]={-1,0,1,0};
+        /*The offset of each previous neighbor in the Y direction.*/
+        static const signed char   PDY[4]={0,-1,0,1};
+        unsigned mbi;
+        int      mbx;
+        int      mby;
+        unsigned nmbi;
+        int      nmbx;
+        int      nmby;
+        int      ni;
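+        /*Macro blocks are indexed four to a super block, with super blocks in
+           raster order ('<<' binds looser than '+', so this is
+           ((sby*nhsbs+sbx)<<2)+quadi).*/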
+        mbi=(sby*nhsbs+sbx<<2)+quadi;
+        if(mb_modes[mbi]==OC_MODE_INVALID)continue;
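+        /*Convert the quadrant index to macro block coordinates; quadi 0...3
+           map to the (x,y) offsets (0,0),(0,1),(1,1),(1,0) within the super
+           block.*/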
+        mbx=2*sbx+(quadi>>1);
+        mby=2*sby+(quadi+1>>1&1);
+        /*Fill in the neighbors with current motion vectors available.*/
+        for(ni=0;ni<NCNEIGHBORS[quadi];ni++){
+          nmbx=mbx+CDX[quadi][ni];
+          nmby=mby+CDY[quadi][ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
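+          /*Map the neighbor's (x,y) coordinates back to a macro block index
+             via OC_MB_MAP.*/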
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].cneighbors[embs[mbi].ncneighbors++]=nmbi;
+        }
+        /*Fill in the neighbors with previous motion vectors available.*/
+        for(ni=0;ni<4;ni++){
+          nmbx=mbx+PDX[ni];
+          nmby=mby+PDY[ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].pneighbors[embs[mbi].npneighbors++]=nmbi;
+        }
+      }
+    }
+  }
+}
+
+static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  int ret;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  if(_codes==NULL)_codes=TH_VP31_HUFF_CODES;
+  /*Validate the codes.*/
+  oggpackB_reset(&_enc->opb);
+  ret=oc_huff_codes_pack(&_enc->opb,_codes);
+  if(ret<0)return ret;
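+  /*The packed bits are not used by this function; packing here serves only
+     to reject codes that cannot be packed before they are installed.*/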
+  memcpy(_enc->huff_codes,_codes,sizeof(_enc->huff_codes));
+  return 0;
+}
+
+static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  unsigned char *etd;
+  size_t         ets;
+  int            align;
+  int            qii;
+  int            qi;
+  int            pli;
+  int            qti;
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->state.dequant_tables[qi][pli][qti]=
+     _enc->state.dequant_table_data[qi][pli][qti];
+  }
+  /*Initialize the dequantization tables.*/
+  oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
+  /*And save off the DC values.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0];
+  }
+  /*Set up storage for the quantization tables.*/
+  etd=_enc->enquant_table_data;
+  ets=_enc->opt_data.enquant_table_size;
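+  /*Round etd up to the required alignment: both '-' operators bind tighter
+     than '&', so this is (-(ptrdiff_t)etd)&(alignment-1), the number of
+     bytes needed to reach the next aligned address.*/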
+  align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
+  etd+=align;
+  /*Set up the main tables.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->enquant_tables[qi][pli][qti]=etd;
+    oc_enc_enquant_table_init(_enc,etd,
+     _enc->state.dequant_tables[qi][pli][qti]);
+    etd+=ets;
+  }
+  /*Set up storage for the local copies we modify for each frame.*/
+  for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){
+    _enc->enquant[pli][qii][qti]=etd;
+    etd+=ets;
+  }
+}
+
+/*Updates the encoder state after the quantization parameters have been
+   changed.*/
+static void oc_enc_quant_params_updated(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  oc_enc_enquant_tables_init(_enc,_qinfo);
+  memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
+   sizeof(_enc->state.loop_filter_limits));
+  oc_enquant_qavg_init(_enc->log_qavg,_enc->log_plq,_enc->chroma_rd_scale,
+   _enc->state.dequant_tables,_enc->state.info.pixel_fmt);
+}
+
+/*Sets the quantization parameters to use.
+  This may only be called before the setup header is written.
+  If it is called multiple times, only the last call has any effect.
+  _qinfo: The quantization parameters.
+          These are described in more detail in theoraenc.h.
+          This can be NULL, in which case the default quantization parameters
+           will be used.*/
+static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  th_quant_info old_qinfo;
+  int           ret;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO;
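+  /*Save the current parameters so a failed clone below can be rolled back
+     without disturbing the encoder state.*/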
+  memcpy(&old_qinfo,&_enc->qinfo,sizeof(old_qinfo));
+  ret=oc_quant_params_clone(&_enc->qinfo,_qinfo);
+  if(ret<0){
+    oc_quant_params_clear(&_enc->qinfo);
+    memcpy(&_enc->qinfo,&old_qinfo,sizeof(old_qinfo));
+    return ret;
+  }
+  else oc_quant_params_clear(&old_qinfo);
+  oc_enc_quant_params_updated(_enc,_qinfo);
+  return 0;
+}
+
+static void oc_enc_clear(oc_enc_ctx *_enc);
+
+static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
+  th_info   info;
+  size_t    mcu_nmbs;
+  ptrdiff_t mcu_ncfrags;
+  ptrdiff_t mcu_nfrags;
+  int       hdec;
+  int       vdec;
+  int       ret;
+  int       pli;
+  /*Clean up the requested settings.*/
+  memcpy(&info,_info,sizeof(info));
+  info.version_major=TH_VERSION_MAJOR;
+  info.version_minor=TH_VERSION_MINOR;
+  info.version_subminor=TH_VERSION_SUB;
+  if(info.quality>63)info.quality=63;
+  if(info.quality<0)info.quality=32;
+  if(info.target_bitrate<0)info.target_bitrate=0;
+  /*Initialize the shared encoder/decoder state.*/
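+  /*The encoder tracks more reference frames than the decoder: SELF/PREV/GOLD
+     plus the original-frame copies (IO/PREV_ORIG/GOLD_ORIG) referenced in
+     th_encode_ycbcr_in() below.*/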
+  ret=oc_state_init(&_enc->state,&info,6);
+  if(ret<0)return ret;
+  oc_enc_accel_init(_enc);
+  _enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
+  _enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
+  _enc->coded_mbis=
+   (unsigned *)_ogg_malloc(_enc->state.nmbs*sizeof(*_enc->coded_mbis));
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  /*If chroma is sub-sampled in the vertical direction, we have to encode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _enc->mcu_nvsbs=1<<vdec;
+  mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4;
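+  /*Each macro block contributes 8>>(hdec+vdec) chroma fragments ('<<' binds
+     looser than '-', so the shift below is by 3-(hdec+vdec)).*/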
+  mcu_ncfrags=mcu_nmbs<<3-(hdec+vdec);
+  mcu_nfrags=4*mcu_nmbs+mcu_ncfrags;
+  _enc->mcu_skip_ssd=(unsigned *)_ogg_malloc(
+   mcu_nfrags*sizeof(*_enc->mcu_skip_ssd));
+  _enc->mcu_rd_scale=(ogg_uint16_t *)_ogg_malloc(
+   (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_scale));
+  _enc->mcu_rd_iscale=(ogg_uint16_t *)_ogg_malloc(
+   (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_iscale));
+  for(pli=0;pli<3;pli++){
+    _enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
+    _enc->extra_bits[pli]=(ogg_uint16_t **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
+  }
+#if defined(OC_COLLECT_METRICS)
+  _enc->frag_sad=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_sad));
+  _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
+  _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
+#endif
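+  /*Room for the 64 main enquant tables plus 3 per-frame working copies, for
+     each of 3 planes and 2 quantization types (intra/inter), with slack for
+     alignment.*/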
+  _enc->enquant_table_data=(unsigned char *)_ogg_malloc(
+   (64+3)*3*2*_enc->opt_data.enquant_table_size
+   +_enc->opt_data.enquant_table_alignment-1);
+  _enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
+  _enc->state.qis[0]=_enc->state.info.quality;
+  _enc->state.nqis=1;
+  _enc->activity_avg=90<<12;
+  _enc->luma_avg=128<<8;
+  oc_rc_state_init(&_enc->rc,_enc);
+  oggpackB_writeinit(&_enc->opb);
+  memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
+  memset(_enc->qinfo.qi_ranges,0,sizeof(_enc->qinfo.qi_ranges));
+  /*Reset the packet-out state machine.*/
+  _enc->packet_state=OC_PACKET_INFO_HDR;
+  _enc->dup_count=0;
+  _enc->nqueued_dups=0;
+  _enc->prev_dup_count=0;
+  /*Enable speed optimizations up through early skip by default.*/
+  _enc->sp_level=OC_SP_LEVEL_EARLY_SKIP;
+  /*Disable VP3 compatibility by default.*/
+  _enc->vp3_compatible=0;
+  /*No INTER frames coded yet.*/
+  _enc->coded_inter_frame=0;
+  if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL
+   ||_enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL
+   ||_enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL
+   ||_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL
+   ||_enc->extra_bits[2]==NULL
+#if defined(OC_COLLECT_METRICS)
+   ||_enc->frag_sad==NULL||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
+#endif
+   ||oc_enc_set_quant_params(_enc,NULL)<0){
+    oc_enc_clear(_enc);
+    return TH_EFAULT;
+  }
+  oc_mode_scheme_chooser_init(&_enc->chooser);
+  oc_enc_mb_info_init(_enc);
+  memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
+  return 0;
+}
+
+static void oc_enc_clear(oc_enc_ctx *_enc){
+  int pli;
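+  /*This is also called when oc_enc_init() fails part-way through, so some of
+     the pointers freed below may still be NULL.*/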
+  oc_rc_state_clear(&_enc->rc);
+  oggpackB_writeclear(&_enc->opb);
+  oc_quant_params_clear(&_enc->qinfo);
+  _ogg_free(_enc->enquant_table_data);
+#if defined(OC_COLLECT_METRICS)
+  /*Save the collected metrics from this run.
+    Use tools/process_modedec_stats to actually generate modedec.h from the
+     resulting file.*/
+  oc_mode_metrics_dump();
+  _ogg_free(_enc->frag_ssd);
+  _ogg_free(_enc->frag_satd);
+  _ogg_free(_enc->frag_sad);
+#endif
+  for(pli=3;pli-->0;){
+    oc_free_2d(_enc->extra_bits[pli]);
+    oc_free_2d(_enc->dct_tokens[pli]);
+  }
+  _ogg_free(_enc->mcu_rd_iscale);
+  _ogg_free(_enc->mcu_rd_scale);
+  _ogg_free(_enc->mcu_skip_ssd);
+  _ogg_free(_enc->coded_mbis);
+  _ogg_free(_enc->frag_dc);
+  _ogg_free(_enc->mb_info);
+  oc_state_clear(&_enc->state);
+}
+
+static void oc_enc_drop_frame(th_enc_ctx *_enc){
+  /*Use the previous frame's reconstruction.*/
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=
+   _enc->state.ref_frame_idx[OC_FRAME_PREV];
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_data[OC_FRAME_PREV];
+  /*Flag motion vector analysis about the frame drop.*/
+  _enc->prevframe_dropped=1;
+  /*Zero the packet.*/
+  oggpackB_reset(&_enc->opb);
+  /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+  if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
+}
+
+static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTRA_FRAME,
+     _enc->state.curframe_num>0);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
+  oc_enc_analyze_intra(_enc,_recode);
+  oc_enc_frame_pack(_enc);
+  /*On the first frame, the previous call was an initial dry-run to prime
+     feed-forward statistics.*/
+  if(!_recode&&_enc->state.curframe_num==0){
+    if(_enc->state.info.target_bitrate>0){
+      oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+                             OC_INTRA_FRAME,_enc->state.qis[0],1,0);
+    }
+    oc_enc_compress_keyframe(_enc,1);
+  }
+}
+
+static void oc_enc_compress_frame(oc_enc_ctx *_enc,int _recode){
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTER_FRAME,1);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTER_FRAME);
+  if(oc_enc_analyze_inter(_enc,_enc->rc.twopass!=2,_recode)){
+    /*Mode analysis thinks this should have been a keyframe; start over.*/
+    oc_enc_compress_keyframe(_enc,1);
+  }
+  else{
+    oc_enc_frame_pack(_enc);
+    if(!_enc->coded_inter_frame){
+      /*On the first INTER frame, the previous call was an initial dry-run to
+         prime feed-forward statistics.*/
+      _enc->coded_inter_frame=1;
+      if(_enc->state.info.target_bitrate>0){
+        /*Rate control also needs to prime.*/
+        oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+         OC_INTER_FRAME,_enc->state.qis[0],1,0);
+      }
+      oc_enc_compress_frame(_enc,1);
+    }
+  }
+}
+
+/*Set the granule position for the next packet to output based on the current
+   internal state.*/
+static void oc_enc_set_granpos(oc_enc_ctx *_enc){
+  unsigned dup_offs;
+  /*Add an offset for the number of duplicate frames we've emitted so far.*/
+  dup_offs=_enc->prev_dup_count-_enc->nqueued_dups;
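+  /*For example, with keyframe_granule_shift=6, granpos_bias=0, last keyframe
+     8 and current frame 11, an inter frame gets granpos (8<<6)+3+dup_offs.*/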
+  /*If the current frame was a keyframe, use it for the high part.*/
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    _enc->state.granpos=(_enc->state.curframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)+dup_offs;
+  }
+  /*Otherwise use the last keyframe in the high part and put the current frame
+     in the low part.*/
+  else{
+    _enc->state.granpos=
+     (_enc->state.keyframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)
+     +_enc->state.curframe_num-_enc->state.keyframe_num+dup_offs;
+  }
+}
+
+
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  oc_enc_ctx *enc;
+  if(_info==NULL)return NULL;
+  enc=oc_aligned_malloc(sizeof(*enc),16);
+  if(enc==NULL||oc_enc_init(enc,_info)<0){
+    oc_aligned_free(enc);
+    return NULL;
+  }
+  return enc;
+}
+
+void th_encode_free(th_enc_ctx *_enc){
+  if(_enc!=NULL){
+    oc_enc_clear(_enc);
+    oc_aligned_free(_enc);
+  }
+}
+
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
+  switch(_req){
+    case TH_ENCCTL_SET_HUFFMAN_CODES:{
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_huff_table)*TH_NHUFFMAN_TABLES){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_huffman_codes(_enc,(const th_huff_table *)_buf);
+    }break;
+    case TH_ENCCTL_SET_QUANT_PARAMS:{
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_quant_info)){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_quant_params(_enc,(th_quant_info *)_buf);
+    }break;
+    case TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE:{
+      ogg_uint32_t keyframe_frequency_force;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(keyframe_frequency_force))return TH_EINVAL;
+      keyframe_frequency_force=*(ogg_uint32_t *)_buf;
+      if(keyframe_frequency_force<=0)keyframe_frequency_force=1;
+      if(_enc->packet_state==OC_PACKET_INFO_HDR){
+        /*It's still early enough to enlarge keyframe_granule_shift.*/
+        _enc->state.info.keyframe_granule_shift=OC_CLAMPI(
+         _enc->state.info.keyframe_granule_shift,
+         OC_ILOG_32(keyframe_frequency_force-1),31);
+      }
+      _enc->keyframe_frequency_force=OC_MINI(keyframe_frequency_force,
+       (ogg_uint32_t)1U<<_enc->state.info.keyframe_granule_shift);
+      *(ogg_uint32_t *)_buf=_enc->keyframe_frequency_force;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_VP3_COMPATIBLE:{
+      int vp3_compatible;
+      int ret;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
+      /*Try this before we change anything else, because it can fail.*/
+      ret=oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO);
+      /*If we can't allocate enough memory, don't change any of the state.*/
+      if(ret==TH_EFAULT)return ret;
+      vp3_compatible=*(int *)_buf;
+      _enc->vp3_compatible=vp3_compatible;
+      if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
+      if(ret<0)vp3_compatible=0;
+      if(_enc->state.info.pixel_fmt!=TH_PF_420||
+       _enc->state.info.pic_width<_enc->state.info.frame_width||
+       _enc->state.info.pic_height<_enc->state.info.frame_height||
+      /*If we have more than 4095 super blocks, VP3's RLE coding might
+         overflow.
+        We could overcome this by ensuring we flip the coded/not-coded flags on
+         at least one super block in the frame, but we pick the simple solution
+         of just telling the user the stream will be incompatible instead.
+        It's unlikely the old VP3 codec would be able to decode streams at this
+         resolution in real time in the first place.*/
+       _enc->state.nsbs>4095){
+        vp3_compatible=0;
+      }
+      *(int *)_buf=vp3_compatible;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL_MAX:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=OC_SP_LEVEL_MAX;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_SPLEVEL:{
+      int speed;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(speed))return TH_EINVAL;
+      speed=*(int *)_buf;
+      if(speed<0||speed>OC_SP_LEVEL_MAX)return TH_EINVAL;
+      _enc->sp_level=speed;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=_enc->sp_level;
+      return 0;
+    }
+    case TH_ENCCTL_SET_DUP_COUNT:{
+      int dup_count;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(dup_count))return TH_EINVAL;
+      dup_count=*(int *)_buf;
+      if(dup_count>=_enc->keyframe_frequency_force)return TH_EINVAL;
+      _enc->dup_count=OC_MAXI(dup_count,0);
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_QUALITY:{
+      int qi;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate>0)return TH_EINVAL;
+      qi=*(int *)_buf;
+      if(qi<0||qi>63)return TH_EINVAL;
+      _enc->state.info.quality=qi;
+      _enc->state.qis[0]=(unsigned char)qi;
+      _enc->state.nqis=1;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_BITRATE:{
+      long bitrate;
+      int  reset;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      bitrate=*(long *)_buf;
+      if(bitrate<=0)return TH_EINVAL;
+      reset=_enc->state.info.target_bitrate<=0;
+      _enc->state.info.target_bitrate=bitrate>INT_MAX?INT_MAX:bitrate;
+      if(reset)oc_rc_state_init(&_enc->rc,_enc);
+      else oc_enc_rc_resize(_enc);
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_FLAGS:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.drop_frames=set&TH_RATECTL_DROP_FRAMES;
+      _enc->rc.cap_overflow=set&TH_RATECTL_CAP_OVERFLOW;
+      _enc->rc.cap_underflow=set&TH_RATECTL_CAP_UNDERFLOW;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_BUFFER:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.buf_delay=set;
+      oc_enc_rc_resize(_enc);
+      *(int *)_buf=_enc->rc.buf_delay;
+      return 0;
+    }break;
+    case TH_ENCCTL_2PASS_OUT:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=1||
+       _buf_sz!=sizeof(unsigned char *)){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_out(_enc,(unsigned char **)_buf);
+    }break;
+    case TH_ENCCTL_2PASS_IN:{
+      if(_enc==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=2){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
+    }break;
+    case TH_ENCCTL_SET_COMPAT_CONFIG:{
+      unsigned char buf[7];
+      oc_pack_buf   opb;
+      th_quant_info qinfo;
+      th_huff_code  huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+      int           ret;
+      int           i;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+      oc_pack_readinit(&opb,_buf,_buf_sz);
+      /*Validate the setup packet header.*/
+      for(i=0;i<7;i++)buf[i]=(unsigned char)oc_pack_read(&opb,8);
+      if(!(buf[0]&0x80)||memcmp(buf+1,"theora",6)!=0)return TH_ENOTFORMAT;
+      if(buf[0]!=0x82)return TH_EBADHEADER;
+      /*Read its contents.*/
+      ret=oc_quant_params_unpack(&opb,&qinfo);
+      if(ret<0){
+        oc_quant_params_clear(&qinfo);
+        return ret;
+      }
+      ret=oc_huff_codes_unpack(&opb,huff_codes);
+      if(ret<0){
+        oc_quant_params_clear(&qinfo);
+        return ret;
+      }
+      /*Install the new state.*/
+      oc_quant_params_clear(&_enc->qinfo);
+      memcpy(&_enc->qinfo,&qinfo,sizeof(qinfo));
+      oc_enc_quant_params_updated(_enc,&qinfo);
+      memcpy(_enc->huff_codes,huff_codes,sizeof(_enc->huff_codes));
+      return 0;
+    }
+#if defined(OC_COLLECT_METRICS)
+    case TH_ENCCTL_SET_METRICS_FILE:{
+      OC_MODE_METRICS_FILENAME=(const char *)_buf;
+      return 0;
+    }
+#endif
+    default:return TH_EIMPL;
+  }
+}
+
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
+  if(_enc==NULL)return TH_EFAULT;
+  return oc_state_flushheader(&_enc->state,&_enc->packet_state,&_enc->opb,
+   &_enc->qinfo,(const th_huff_table *)_enc->huff_codes,th_version_string(),
+   _tc,_op);
+}
+
+static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src,
+ ogg_int32_t _pic_x,ogg_int32_t _pic_y,
+ ogg_int32_t _pic_width,ogg_int32_t _pic_height){
+  unsigned char *dst;
+  int            dstride;
+  ogg_uint32_t   frame_width;
+  ogg_uint32_t   frame_height;
+  ogg_uint32_t   y;
+  frame_width=_dst->width;
+  frame_height=_dst->height;
+  /*If we have _no_ data, just encode a dull green.*/
+  if(_pic_width==0||_pic_height==0){
+    dst=_dst->data;
+    dstride=_dst->stride;
+    for(y=0;y<frame_height;y++){
+      memset(dst,0,frame_width*sizeof(*dst));
+      dst+=dstride;
+    }
+  }
+  /*Otherwise, copy what we do have, and add our own padding.*/
+  else{
+    unsigned char *dst_data;
+    unsigned char *src_data;
+    unsigned char *src;
+    int            sstride;
+    ogg_uint32_t   x;
+    /*Step 1: Copy the data we do have.*/
+    dstride=_dst->stride;
+    sstride=_src->stride;
+    dst_data=_dst->data;
+    src_data=_src->data;
+    dst=dst_data+_pic_y*(ptrdiff_t)dstride+_pic_x;
+    src=src_data+_pic_y*(ptrdiff_t)sstride+_pic_x;
+    for(y=0;y<_pic_height;y++){
+      memcpy(dst,src,_pic_width);
+      dst+=dstride;
+      src+=sstride;
+    }
+    /*Step 2: Perform a low-pass extension into the padding region.*/
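+    /*Each padding pixel gets a (1,2,1)/4 average of the three nearest pixels
+       in the adjacent filled row or column; '+' binds tighter than '>>', so
+       the +2>>2 rounds and scales the whole sum, and the &-(...) masks clamp
+       at the picture edges.*/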
+    /*Left side.*/
+    for(x=_pic_x;x-->0;){
+      dst=dst_data+_pic_y*(ptrdiff_t)dstride+x;
+      for(y=0;y<_pic_height;y++){
+        dst[0]=(dst[1]<<1)+(dst-(dstride&-(y>0)))[1]
+         +(dst+(dstride&-(y+1<_pic_height)))[1]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Right side.*/
+    for(x=_pic_x+_pic_width;x<frame_width;x++){
+      dst=dst_data+_pic_y*(ptrdiff_t)dstride+x-1;
+      for(y=0;y<_pic_height;y++){
+        dst[1]=(dst[0]<<1)+(dst-(dstride&-(y>0)))[0]
+         +(dst+(dstride&-(y+1<_pic_height)))[0]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Top.*/
+    dst=dst_data+_pic_y*(ptrdiff_t)dstride;
+    for(y=_pic_y;y-->0;){
+      for(x=0;x<frame_width;x++){
+        (dst-dstride)[x]=(dst[x]<<1)+dst[x-(x>0)]
+         +dst[x+(x+1<frame_width)]+2>>2;
+      }
+      dst-=dstride;
+    }
+    /*Bottom.*/
+    dst=dst_data+(_pic_y+_pic_height)*(ptrdiff_t)dstride;
+    for(y=_pic_y+_pic_height;y<frame_height;y++){
+      for(x=0;x<frame_width;x++){
+        dst[x]=((dst-dstride)[x]<<1)+(dst-dstride)[x-(x>0)]
+         +(dst-dstride)[x+(x+1<frame_width)]+2>>2;
+      }
+      dst+=dstride;
+    }
+  }
+}
+
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
+  th_ycbcr_buffer img;
+  int             frame_width;
+  int             frame_height;
+  int             pic_width;
+  int             pic_height;
+  int             pic_x;
+  int             pic_y;
+  int             cframe_width;
+  int             cframe_height;
+  int             cpic_width;
+  int             cpic_height;
+  int             cpic_x;
+  int             cpic_y;
+  int             hdec;
+  int             vdec;
+  int             pli;
+  int             refi;
+  int             drop;
+  /*Step 1: validate parameters.*/
+  if(_enc==NULL||_img==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
+  if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  frame_width=_enc->state.info.frame_width;
+  frame_height=_enc->state.info.frame_height;
+  pic_x=_enc->state.info.pic_x;
+  pic_y=_enc->state.info.pic_y;
+  pic_width=_enc->state.info.pic_width;
+  pic_height=_enc->state.info.pic_height;
+  cframe_width=frame_width>>hdec;
+  cframe_height=frame_height>>vdec;
+  cpic_x=pic_x>>hdec;
+  cpic_y=pic_y>>vdec;
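+  /*The chroma picture region must cover the whole luma picture region, so
+     its left/top edges round down and its right/bottom edges round up.*/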
+  cpic_width=(pic_x+pic_width+hdec>>hdec)-cpic_x;
+  cpic_height=(pic_y+pic_height+vdec>>vdec)-cpic_y;
+  /*Flip the input buffer upside down.*/
+  oc_ycbcr_buffer_flip(img,_img);
+  if(img[0].width!=frame_width||img[0].height!=frame_height||
+   img[1].width!=cframe_width||img[2].width!=cframe_width||
+   img[1].height!=cframe_height||img[2].height!=cframe_height){
+    /*The buffer does not match the frame size.
+      Check to see if it matches the picture size.*/
+    if(img[0].width!=pic_width||img[0].height!=pic_height||
+     img[1].width!=cpic_width||img[2].width!=cpic_width||
+     img[1].height!=cpic_height||img[2].height!=cpic_height){
+      /*It doesn't; we don't know how to handle it.*/
+      return TH_EINVAL;
+    }
+    /*Adjust the pointers to address a full frame.
+      We still only use the picture region, however.*/
+    img[0].data-=pic_y*(ptrdiff_t)img[0].stride+pic_x;
+    img[1].data-=cpic_y*(ptrdiff_t)img[1].stride+cpic_x;
+    img[2].data-=cpic_y*(ptrdiff_t)img[2].stride+cpic_x;
+  }
+  /*Step 2: Update the buffer state.*/
+  if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV]=
+     _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    _enc->state.ref_frame_data[OC_FRAME_PREV]=
+     _enc->state.ref_frame_data[OC_FRAME_SELF];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _enc->state.keyframe_num=_enc->state.curframe_num;
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_idx[OC_FRAME_SELF];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_data[OC_FRAME_SELF];
+    }
+  }
+  if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_idx[OC_FRAME_IO];
+    _enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_data[OC_FRAME_IO];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new input frame becomes both the previous and gold
+         original-reference frames.*/
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_idx[OC_FRAME_IO];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_data[OC_FRAME_IO];
+    }
+  }
+  /*Select a free buffer to use for the incoming frame.*/
+  for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_IO]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_IO]=
+   _enc->state.ref_frame_bufs[refi][0].data;
+  /*Step 3: Copy the input to our internal buffer.
+    This lets us add padding, so we don't have to worry about dereferencing
+     possibly invalid addresses, and allows us to use the same strides and
+     fragment offsets for both the input frame and the reference frames.*/
+  oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+0,img+0,
+   pic_x,pic_y,pic_width,pic_height);
+  oc_state_borders_fill_rows(&_enc->state,refi,0,0,frame_height);
+  oc_state_borders_fill_caps(&_enc->state,refi,0);
+  for(pli=1;pli<3;pli++){
+    oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+pli,img+pli,
+     cpic_x,cpic_y,cpic_width,cpic_height);
+    oc_state_borders_fill_rows(&_enc->state,refi,pli,0,cframe_height);
+    oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  }
+  /*Select a free buffer to use for the reconstructed version of this frame.*/
+  for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_bufs[refi][0].data;
+  _enc->state.curframe_num+=_enc->prev_dup_count+1;
+  /*Step 4: Compress the frame.*/
+  /*Start with a keyframe, and don't allow the generation of invalid files that
+     overflow the keyframe_granule_shift.*/
+  if(_enc->rc.twopass_force_kf||_enc->state.curframe_num==0||
+   _enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
+   _enc->keyframe_frequency_force){
+    oc_enc_compress_keyframe(_enc,0);
+    drop=0;
+  }
+  else{
+    oc_enc_compress_frame(_enc,0);
+    drop=1;
+  }
+  oc_restore_fpu(&_enc->state);
+  /*drop currently indicates if the frame is droppable.*/
+  if(_enc->state.info.target_bitrate>0){
+    drop=oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+     _enc->state.frame_type,_enc->state.qis[0],0,drop);
+  }
+  else drop=0;
+  /*drop now indicates if the frame was dropped.*/
+  if(drop)oc_enc_drop_frame(_enc);
+  else _enc->prevframe_dropped=0;
+  _enc->packet_state=OC_PACKET_READY;
+  _enc->prev_dup_count=_enc->nqueued_dups=_enc->dup_count;
+  _enc->dup_count=0;
+#if defined(OC_DUMP_IMAGES)
+  oc_enc_set_granpos(_enc);
+  oc_state_dump_frame(&_enc->state,OC_FRAME_IO,"src");
+  oc_state_dump_frame(&_enc->state,OC_FRAME_SELF,"rec");
+#endif
+  return 0;
+}
+
+int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
+  unsigned char *packet;
+  if(_enc==NULL||_op==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_READY){
+    _enc->packet_state=OC_PACKET_EMPTY;
+    if(_enc->rc.twopass!=1){
+      packet=oggpackB_get_buffer(&_enc->opb);
+      /*If there's no packet, malloc failed while writing; it's lost forever.*/
+      if(packet==NULL)return TH_EFAULT;
+      _op->packet=packet;
+      _op->bytes=oggpackB_bytes(&_enc->opb);
+    }
+    /*For the first pass in 2-pass mode, don't emit any packet data.*/
+    else{
+      _op->packet=NULL;
+      _op->bytes=0;
+    }
+  }
+  else if(_enc->packet_state==OC_PACKET_EMPTY){
+    if(_enc->nqueued_dups>0){
+      _enc->nqueued_dups--;
+      /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+      if(_enc->vp3_compatible){
+        oggpackB_reset(&_enc->opb);
+        oc_enc_drop_frame_pack(_enc);
+        packet=oggpackB_get_buffer(&_enc->opb);
+        /*If there's no packet, malloc failed while writing; it's lost
+           forever.*/
+        if(packet==NULL)return TH_EFAULT;
+        _op->packet=packet;
+        _op->bytes=oggpackB_bytes(&_enc->opb);
+      }
+      /*Otherwise emit a 0-byte packet.*/
+      else{
+        _op->packet=NULL;
+        _op->bytes=0;
+      }
+    }
+    else{
+      if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+      return 0;
+    }
+  }
+  else return 0;
+  _last_p=_last_p&&_enc->nqueued_dups<=0;
+  _op->b_o_s=0;
+  _op->e_o_s=_last_p;
+  oc_enc_set_granpos(_enc);
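+  /*The +3 accounts for the three header packets (info, comment, setup) that
+     precede the first video packet.*/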
+  _op->packetno=th_granule_frame(_enc,_enc->state.granpos)+3;
+  _op->granulepos=_enc->state.granpos;
+  if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+  return 1+_enc->nqueued_dups;
+}

Some files were not shown because too many files changed in this diff